I have this code that works well if I try to search exact words.
from spacy.lang.en import English
import spacy
#nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer","ner"])
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Google"},
{"label": "COLOR", "pattern": "yellow"},
{"label": "COLOR", "pattern": "red"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
{"label": "DIN", "pattern": [{"TEXT" : {"REGEX": "DIN\d"}}]},
{"label": "DIAM", "pattern": [{"TEXT" : {"REGEX": "diameter\d"}}]},
{"label": "MATERIAL", "pattern": [{"LOWER": "zinc"}, {"LOWER": "plated"}]},
{"label": "MATERIAL", "pattern": [{"LOWER": "stainless"}, {"LOWER": "steel"}]},
{"label": "BRAND", "pattern": [{"LOWER": "cubitron"},{"LOWER": "ii"}]}
]
ruler.add_patterns(patterns)
doc = nlp("Google red yellow DIN 789 opening its first big zinc plated ffice in San Francisco")
print([(ent.text, ent.label_) for ent in doc.ents])
But the regex doesnt work for whole sentence but just for each token.
I tried to add something like this to add new entity but it doesnt still show the new label DIN in the output.
from spacy.tokens import Span
doc = nlp("Google red yellow DIN 180 opening its first big zinc plated ffice in San Francisco")
pattern = r"DIN\s\d"
original_ents = list(doc.ents)
mwt_ents = []
for match in re.finditer(pattern, doc.text):
start, end = match.span()
span = doc.char_span(start, end)
if span is not None:
mwt_ents.append((span.start, span.end, span.text))
for ent in mwt_ents:
start, end, name = ent
per_ent = Span(doc, start, end, label="DIN")
original_ents.append(per_ent)
doc.ents = original_ents
from spacy.util import filter_spans
filtered = filter_spans(original_ents)
doc.ents = filtered
for ent in doc.ents:
print (ent.text, ent.label_)
What all am I doing wrong? How can I add to the nlp model new rule based on regex that searches in the whole input? THANKS!!