1

I am trying to convert a dataset to the .spacy format by first converting each text to a Doc and then adding the Docs to a DocBin. The whole dataset file is accessible via GoogleDocs.

I run the following function:

def converter(data, outputFile):
    """Serialize annotated texts into a ``.spacy`` file via a DocBin.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations["entities"]`` yields ``(start, end, label)``
            character-offset triples.
        outputFile: Output name; the file is written to
            ``./{outputFile}.spacy``.

    Returns:
        A string of the form ``"Processed N"``, where N is the length of
        the DocBin.
    """
    blank_pipeline = spacy.blank("en")  # blank English pipeline
    collected = DocBin()  # container that is written to disk at the end

    for raw_text, annotation in tqdm(data):
        parsed = blank_pipeline.make_doc(raw_text)
        # Map every character-offset triple to a span. "strict" is one of
        # the supported alignment modes (strict, contract, expand).
        candidates = (
            parsed.char_span(begin, stop, label=tag, alignment_mode="strict")
            for begin, stop, tag in annotation["entities"]
        )
        # char_span may yield None; such entries are dropped so that they
        # never reach doc.ents.
        parsed.ents = [span for span in candidates if span is not None]
        collected.add(parsed)

    collected.to_disk(f"./{outputFile}.spacy")
    return f"Processed {len(collected)}"

After running the function on the dataset, I got the traceback: ValueError: [E1010] Unable to set entity information for token 27 which is included in more than one span in entities, blocked, missing or outside.

After taking a close look at the dataset file to find the text for which this traceback was raised, I found the following:

[('HereLongText..(abstract)',
  {'entities': [('0', '27', 'SpecificDisease'),
    ('80', '93', 'SpecificDisease'),
    ('260', '278', 'SpecificDisease'),
    ('615', '628', 'SpecificDisease'),
    ('673', '691', 'SpecificDisease'),
    ('754', '772', 'SpecificDisease')]})]

I do not know how to solve this issue.

1 Answer 1

1

I think this should make your issue clear. Here's a slightly modified version of your code that has the same error.

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def converter(data, outputFile):
    """Convert ``(text, annotations)`` pairs to a DocBin saved on disk.

    Every ``(start, end, label)`` triple in ``annotations["entities"]`` is
    turned into a span with strict alignment, and the resulting spans are
    assigned to ``doc.ents`` as they are: no de-duplication or overlap
    filtering is applied.

    Args:
        data: Iterable of ``(text, annotations)`` pairs.
        outputFile: Output name; the DocBin is saved as
            ``./{outputFile}.spacy``.

    Returns:
        The string ``"Processed N"``, N being the length of the DocBin.
    """
    pipeline = spacy.blank("en")
    bundle = DocBin()

    for sentence, meta in tqdm(data):
        document = pipeline.make_doc(sentence)
        spans = []

        for begin, stop, tag in meta["entities"]:
            # Alignment modes available: strict, contract, expand.
            candidate = document.char_span(
                begin, stop, label=tag, alignment_mode="strict"
            )
            # Skip entries for which no span could be produced.
            if candidate is None:
                continue
            spans.append(candidate)

        document.ents = spans  # attach the collected entity spans
        bundle.add(document)

    bundle.to_disk(f"./{outputFile}.spacy")
    return f"Processed {len(bundle)}"


# Minimal reproduction: a single text whose entity list contains the very
# same annotation twice.
sample_annotation = (0, 1, "Sample")
data = [
    (
        "I like cheese",
        {"entities": [sample_annotation, sample_annotation]},  # same thing twice
    )
]

converter(data, "out.txt")

Note that in the example the exact same span is annotated twice. If you remove one of those annotations, you won't get the error.

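You are probably getting the error because your data contains annotations that overlap (or are duplicated), which cannot be assigned to doc.ents because each token may belong to at most one entity. Either clean up the overlapping annotations in your data, or pass the list of spans through spacy.util.filter_spans before assigning it to doc.ents.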

1

Not the answer you're looking for? Browse other questions tagged or ask your own question.