# Source code for arsenal.nlp.annotation

import re
from arsenal.misc import force


class ParseError(Exception):
    """Raised by this module when annotated text is malformed."""

class Span(object):
    """
    A labeled range described by ``label``, ``begins`` and ``ends``.

    A span iterates as ``(label, begins, ends)`` and compares equal to another
    ``Span`` or to any length-3 indexable sequence holding the same values.
    The span producers in this module create spans whose ``ends`` is one past
    the last covered position.
    """
    __slots__ = ('label', 'begins', 'ends')

    # Spans are mutable and define ``__eq__``, so they are deliberately left
    # unhashable (this is also what Python 3 does implicitly).
    __hash__ = None

    def __init__(self, label, begins, ends):
        self.label = label
        self.begins = begins
        self.ends = ends

    def __repr__(self):
        return 'Span(label=%r, begins=%r, ends=%r)' % (self.label, self.begins, self.ends)

    def __eq__(self, other):
        """Compare field by field with a ``Span`` or a length-3 sequence."""
        if isinstance(other, Span):
            return (self.label == other.label
                    and self.begins == other.begins
                    and self.ends == other.ends)
        try:
            if len(other) != 3:
                return False
            return (self.label == other[0]
                    and self.begins == other[1]
                    and self.ends == other[2])
        except (TypeError, KeyError, IndexError):
            # ``other`` is not a sized, integer-indexable object (None, an
            # int, a dict, ...): let Python fall back to its default
            # comparison instead of raising out of ``==``.
            return NotImplemented

    def __iter__(self):
        return iter((self.label, self.begins, self.ends))


# Tokenizer whose matches are the maximal runs of non-whitespace characters.
WhitespaceLexer = re.compile(r'\S+')

# Tokenizer built from an ordered list of alternatives; at each position the
# first alternative that matches wins, so the more specific patterns come first
# and the catch-all ``\S+`` comes last.  Raw strings keep the regex escapes
# from being read as (invalid) string escapes.
Lexer = re.compile('|'.join([r"http://\S+",                              # keep urls together.. might include invalid URL characters
                             r"\S+@\S+",
                             r"[0-9][0-9]?\s*\([0-9]?\)",                # e.g. "7(4)" common in volume
                             r"\(\s*[0-9][0-9]?\s*\)",                   # e.g. "(4)" common in volume
                             r"[0-9]+\s*-\s*[0-9]+",                     # possible page number
                             r"[0-9]+(?:st|nd|rd|th)",
                             r"[A-Z]\.",
                             r"Ph\.?[Dd]\.",
                             #"[Vv]ol(?:\.|ume)\s*[0-9]+",               # make "Vol. 3" one token.
                             r"(?:Vol|Proc|Dept|Univ|No|Inc|Dr)\s*\.",
                             r"pp\.",
                             r"\(\s*[0-9][0-9][0-9][0-9][a-z]?\s*\)",    # e.g. "(1994)" year in parens
                             r"[Ee]d(?:s?\.|itors?)",
                             #"\w+-\w+",
                             r"\+[A-Z]+\+",
                             r"[a-zA-Z]+(?:'s)?",
                             #"[0-9]+(?:\.[0-9]+)?",                      # 231 or 2.0, but not 2.
                             r"[0-9]+",
                             r'''[()"'\-\.,]''',
                             r"\S+"]))

# Matches either a ``<tag>...</tag>`` element (groups 1-3: opening name, lazily
# matched content, closing name) or a run of characters containing no
# whitespace or angle brackets (group 4).  Tag names match case-insensitively.
TaggedText = re.compile(r"<([a-z0-9_]+)>([\w\W]+?)</([a-z0-9_]+)>|([^<>\s]+)", re.IGNORECASE)

def fromSGML(f, linegrouper="\n", bioencoding=False):
    """
    Yield one labeled token sequence per group of SGML-annotated text in a file.

    The file at path ``f`` is read in full and split with the regular
    expression ``linegrouper``.  Each piece is converted with ``sgml2bio``
    when ``bioencoding`` is true and with ``sgml2seq`` otherwise; pieces that
    convert to an empty sequence are skipped.
    """
    # Read inside a context manager so the file handle is closed promptly
    # rather than whenever the garbage collector gets to it.
    with open(f) as handle:
        text = handle.read()
    convert = sgml2bio if bioencoding else sgml2seq
    for line in re.split(linegrouper, text):
        seq = convert(line)
        if seq:
            yield seq

@force
def sgml2segmentation(x, lexer=WhitespaceLexer):
    """
    Split SGML-style annotated text into ``(tag, tokens)`` segments.

    Text wrapped in ``<tag>...</tag>`` becomes one segment holding every token
    ``lexer.findall`` returns for its content.  Each token found outside any
    tag becomes its own ``("O", [token])`` segment.  Surrounding whitespace is
    stripped and every remaining newline is replaced by the marker token
    ``+L+`` before matching.

    Raises ``ParseError`` when the opening and closing tag names differ; the
    comparison is case-sensitive, so ``<A>...</a>`` is rejected.

    NOTE(review): the ``force`` decorator presumably materializes this
    generator into the list shown in the doctest -- confirm in arsenal.misc.

    >>> sgml2segmentation('<title>Cat in the Hat</title><author>Dr. Seuss</author>')
    [('title', ['Cat', 'in', 'the', 'Hat']), ('author', ['Dr.', 'Seuss'])]
    """
    x = x.strip().replace("\n", " +L+ ")
    for (tag, tagged, close, outside) in TaggedText.findall(x):
        # For untagged text both names are '' and therefore trivially equal.
        if tag != close:
            raise ParseError("opening (%s) and closing (%s) tags do not match in sequence\n    %r\n" % (tag, close, x))
        if tagged:
            yield (tag, lexer.findall(tagged))
        else:
            for w in lexer.findall(outside):
                yield ("O", [w])

@force
def sgml2bio(x):
    """
    Convert SGML-style annotated text into ``(BIO-label, token)`` pairs.

    The first token of every segment produced by ``sgml2segmentation`` is
    labeled ``B-<tag>`` and the remaining ones ``I-<tag>``.  Untagged tokens
    arrive as single-token ``"O"`` segments and so come out as ``B-O``.
    Segments without tokens (e.g. ``<title> </title>``) produce nothing.

    >>> sgml2bio('<title>Cat in the Hat</title><author>Dr. Seuss</author>')
    [('B-title', 'Cat'), ('I-title', 'in'), ('I-title', 'the'), ('I-title', 'Hat'), ('B-author', 'Dr.'), ('I-author', 'Seuss')]
    """
    for (tag, tokens) in sgml2segmentation(x):
        # enumerate() instead of next(): an empty token list would otherwise
        # raise StopIteration inside this generator, which Python 3.7+ turns
        # into a RuntimeError (PEP 479).
        for i, w in enumerate(tokens):
            yield (('B-' if i == 0 else 'I-') + tag, w)

@force
def sgml2seq(x):
    """
    Convert SGML-style annotated text into ``(tag, token)`` pairs.

    Every token of every segment produced by ``sgml2segmentation`` is paired
    with that segment's tag; untagged tokens carry the tag ``"O"``.

    >>> sgml2seq('<title>Cat in the Hat</title><author>Dr. Seuss</author>')
    [('title', 'Cat'), ('title', 'in'), ('title', 'the'), ('title', 'Hat'), ('author', 'Dr.'), ('author', 'Seuss')]
    """
    for (tag, tokens) in sgml2segmentation(x):
        for w in tokens:
            yield (tag, w)

def bracket2bio(x):
    """
    Generate BIO-token pairs from bracket-style annotation.

    A span is written ``[LABEL word word ...]`` where ``LABEL`` consists of
    upper-case letters and digits; any other whitespace-delimited word is
    emitted with the label ``O``.
    Note: splits text on whitespace, so word splitting should already be done.

    Raises ``ParseError`` if ``x`` contains a newline, if a bracketed span
    holds no words, or if a bracket appears inside a word of a span.

    >>> x = bracket2bio("[TITLE Cat in the Hat][AUTHOR Dr. Seuss]")
    >>> list(x)                                  #doctest:+NORMALIZE_WHITESPACE
    [('B-TITLE', 'Cat'), ('I-TITLE', 'in'), ('I-TITLE', 'the'),
     ('I-TITLE', 'Hat'), ('B-AUTHOR', 'Dr.'), ('I-AUTHOR', 'Seuss')]
    """
    if '\n' in x:
        raise ParseError('No newlines allowed in bracket2bio annotation.')
    # Either a bracketed span (groups: label, content) or one bare word.
    pattern = r'(?:(?:\[([A-Z0-9]+)\s+(.+?)\s*\]\s*)|(.+?)(?:\s+|$))'
    for label, tagged, word in re.findall(pattern, x):
        if word:
            yield ('O', word)
            continue
        words = tagged.split()
        if not words:
            # e.g. "[A   ]": the content group matched only whitespace.
            raise ParseError('bracketed span has no words.')
        for i, w in enumerate(words):
            # Validate every word, including the first one of the span.
            if '[' in w or ']' in w:
                raise ParseError('brackets can not appear within a word.')
            yield (('B-%s' if i == 0 else 'I-%s') % label, w)

# TIMV: we want something like a LineGroupIterator
def line_groups(text, pattern):
    """
    Break ``text`` into groups separated by matches of the regex ``pattern``.

    Each group is stripped of surrounding whitespace and groups that end up
    empty are dropped.

    >>> list(line_groups("a BB c d BB", "BB"))
    ['a', 'c d']
    """
    for group in re.split(pattern, text):    # TODO: make this "lazier"
        # re.split() returns None for a capturing group in ``pattern`` that
        # did not take part in the match; there is nothing to strip then.
        if group is None:
            continue
        group = group.strip()
        if group:
            yield group


def extract_contiguous(s, labeler=None):
    """
    Yield a ``Span`` for every maximal run of equal consecutive items in ``s``.

    When ``labeler`` is given it is applied to each item first and runs are
    formed over its results.  ``begins`` is the index of the first item of the
    run and ``ends`` is one past the index of its last item.  ``None`` serves
    as the "no run open" marker, so items equal to ``None`` never produce a
    span.

    >>> list(extract_contiguous(""))
    []

    >>> list(extract_contiguous("AAAA"))
    [Span(label='A', begins=0, ends=4)]

    >>> list(extract_contiguous("AABBC"))
    [Span(label='A', begins=0, ends=2), Span(label='B', begins=2, ends=4), Span(label='C', begins=4, ends=5)]

    >>> list(extract_contiguous("AABBB"))
    [Span(label='A', begins=0, ends=2), Span(label='B', begins=2, ends=5)]
    """
    if labeler is not None:
        s = map(labeler, s)
    prev = None
    b = e = 0
    for e, token in enumerate(s):
        if token != prev:
            # The run that was open (if any) stops just before position e.
            if prev is not None:
                yield Span(prev, b, e)
            b = e
        prev = token
    # emit lingering bits: the last run reaches through the final index e
    if prev is not None:
        yield Span(prev, b, e + 1)


@force
def bio2span(seq, tagger=None, include_O=True):
    """
    Convert a sequence of BIO labels into ``Span`` objects.

    ``tagger``, when given, is applied to every item of ``seq`` to obtain its
    label; a label of ``None`` is treated as ``'O'``.

    * ``B-x`` always opens a new span labeled ``x``.
    * ``I-x`` extends the open span when that span is labeled ``x``;
      otherwise it opens a new span (heuristic correction of a missing ``B-``).
    * Any other label closes the open span and, when ``include_O`` is true,
      is emitted as a one-position span carrying the label unchanged.

    Spans are yielded once they are closed, so they come out in order of their
    end position.  ``ends`` is one past the last covered position.

    NOTE(review): the ``force`` decorator presumably materializes this
    generator into a list -- confirm in arsenal.misc.
    """
    if tagger is not None:
        seq = map(tagger, seq)
    # The span currently being built, or None when outside any span.  This is
    # the only state: tracking a separate "current tag" let the two drift
    # apart for labels that are neither B-/I- prefixed nor 'O'.
    phrase = None
    for i, lbl in enumerate(seq):
        if lbl is None:
            lbl = 'O'
        if lbl.startswith(('B-', 'I-')):
            label = lbl[2:]
            if lbl.startswith('I-') and phrase is not None and phrase.label == label:
                phrase.ends = i + 1        # still inside the same span
            else:
                if phrase is not None:     # close whatever was open
                    yield phrase
                phrase = Span(label, i, i + 1)
        else:
            if phrase is not None:         # was in a span, now outside
                yield phrase
                phrase = None
            if include_O:
                yield Span(lbl, i, i + 1)
    if phrase is not None:                 # close any lingering span
        yield phrase


if __name__ == '__main__':
    from arsenal.misc import piped

    def main():
        """Print the BIO encoding of each piped line, one token per row."""
        lines = piped() or []
        for line in lines:
            for (tag, token) in sgml2bio(line):
                print('%s\t%s' % (tag, token))
            # a blank line separates consecutive sequences
            print()

    main()
# Source code for arsenal.nlp.annotation
#
# arsenal
#
# Navigation
#
# Related Topics