Source code for anotator

from text_utils import stop_word_remove
from text_utils import adverb_remove
from text_utils import verb_remove
from text_utils import adjective_remove
from text_utils import special_symbols_remove
from fast_utils import split_with_indices
import settings
import argparse
import time

def dataset_NER_prepocess(dataset):
    """Preprocess a dataset before training NER.

    Assumes that a clean dataset of entities should not contain verbs,
    adverbs, adjectives or random symbols, so each of those is stripped
    in turn.

    Args:
        dataset (list): list of strings for NER training.

    Returns:
        list: processed dataset if successful, None otherwise.
    """
    preprocessed = dataset
    try:
        # Chain the filters so each stage receives the previous stage's
        # output. (The original passed the raw ``dataset`` to every
        # filter, silently discarding all removals but the last one.)
        for remove in (stop_word_remove, adverb_remove, verb_remove,
                       adjective_remove, special_symbols_remove):
            result = remove(preprocessed)
            # A falsy result means the filter failed or emptied the data;
            # keep the previous stage's output in that case.
            if result:
                preprocessed = result
    except Exception as e:
        print(e)
        return None
    return preprocessed
def dataset_to_spacy(db, entity_label):
    """Bring a dataset to a spaCy-trainable state.

    Args:
        db (dict): mapping of entity labels to ``{word: {'context': [...]}}``
            records (presumably loaded from CouchDB — see ``__main__``).
        entity_label (str): designated label for the entity.

    Returns:
        list: spaCy training-ready list of
        ``(text, {'entities': [(start, end, label), ...]})`` tuples if
        successful, None otherwise.
    """
    train_data = []
    try:
        # Anyone missing context? Skip words with no example sentences.
        for word in db[entity_label]:
            if not db[entity_label][word]['context']:
                continue
            for contexted_example in db[entity_label][word]['context']:
                entities = []
                if len(word.split(" ")) > 1:
                    # Multi-word entity: locate the phrase as a substring.
                    start = contexted_example.lower().find(word.lower())
                    # Bug fix: find() returns -1 when the phrase is absent;
                    # the original emitted an invalid (-1, len(word)-1) span.
                    if start != -1:
                        entities.append((start, start + len(word), entity_label))
                else:
                    # Single-word entity: match against each token span.
                    for begin, stop in split_with_indices(contexted_example):
                        if word.lower() in contexted_example[begin:stop].lower():
                            entities.append((begin, begin + len(word), entity_label))
                train_data.append((contexted_example, {'entities': entities}))
    except Exception as e:
        print(e)
        return None
    return train_data
if __name__ == '__main__':
    settings.init()
    parser = argparse.ArgumentParser(description='Anotator options')
    # Default label carries a timestamp so repeated runs don't collide.
    # Bug fix: time.time is a function object; the original stringified
    # the function itself ("<built-in function time>") instead of the
    # current timestamp.
    entity_label = 'random_ent_' + str(time.time())
    parser.add_argument('-e', action="store", type=str, dest='entity_label',
                        help='designated entity label', default=entity_label)
    # Bug fix: options live on the namespace returned by parse_args(),
    # not on the parser object (ArgumentParser has no 'entity_label'
    # attribute, so the original raised AttributeError).
    args = parser.parse_args()
    couchdb = settings.couchdb
    dataset_to_spacy(couchdb, args.entity_label)