Source code for arsenal.nlp.tests.annotation
import re
from arsenal.nlp.annotation import sgml2bio, line_groups, bio2span
def equals_mod_whitespace(a, b):
    """Return True if strings *a* and *b* are equal once whitespace is removed.

    Every run of whitespace characters (as matched by the regex class ``\\s``)
    is deleted from both strings before they are compared, so differences in
    spacing, tabs, or line breaks are ignored.
    """
    def strip_whitespace(text):
        # Raw string avoids the invalid '\s' escape warning; '+' skips the
        # pointless zero-width matches that '\s*' would make between every
        # pair of characters.
        return re.sub(r'\s+', '', text)

    return strip_whitespace(a) == strip_whitespace(b)
def test_sgml_reconstruction(
        reference_dataset='/home/timv/projects/crf/data/tagged_references.txt'):
    """Round-trip each SGML example through BIO labels and back to SGML.

    The dataset file is split into examples with ``line_groups`` using the
    ``<NEW.*?>`` pattern. Each example is converted to (label, token) pairs
    with ``sgml2bio``, the labels are turned into spans with ``bio2span``,
    and the spans are rendered back into ``<label>tokens</label>`` markup.
    The rebuilt text must equal the original example up to whitespace.

    Args:
        reference_dataset: Path of the tagged reference file to check.
            Defaults to the original author's local dataset location, so
            calling the function with no arguments behaves as before.

    Raises:
        AssertionError: If a reconstructed example differs from its source
            in anything other than whitespace.
    """
    # Read everything up front so the file handle is released before the
    # (potentially long) assertion loop starts.
    with open(reference_dataset, 'r') as f:
        contents = f.read()

    for sgml in line_groups(contents, '<NEW.*?>'):
        # sgml2bio yields pairs; transpose them into parallel sequences.
        labels, tokens = zip(*sgml2bio(sgml))

        # convert spans to sgml
        # NOTE(review): spans are unpacked as (label, begin, end) and used as
        # a half-open slice into tokens -- confirm against bio2span.
        spans = bio2span(labels)
        reconstructed = ' '.join(
            '<%s>%s</%s>' % (label, ' '.join(tokens[begin:end]), label)
            for (label, begin, end) in spans
        )

        assert equals_mod_whitespace(reconstructed, sgml), \
            'reconstructed example should only differ in whitespace.'

    print('passed sgml reconstruction test.')
# Allow running this module directly as a script to execute the
# reconstruction check without a test runner.
if __name__ == '__main__':
    test_sgml_reconstruction()