from text_utils import stop_word_remove
from text_utils import adverb_remove
from text_utils import verb_remove
from text_utils import adjective_remove
from text_utils import special_symbols_remove
from fast_utils import split_with_indices
import settings
import argparse
import time
def dataset_NER_preprocess(dataset):
    """
    Preprocess a dataset before training NER.
    Assuming That a clean dataset of Entities should not contain
    verbs, adverbs, adjectives and random symbols
    Args:
        dataset (list): list of strings for NER trainging
    Returns:
        list: processed dataset if Sucessful, None otherwise
    """
    preprocessed = dataset
    try:
        # Apply each removal step to the output of the previous one; if a
        # step empties the dataset, fall back to the last non-empty state.
        for remove_step in (stop_word_remove, adverb_remove, verb_remove,
                            adjective_remove, special_symbols_remove):
            result = remove_step(preprocessed)
            if result:
                preprocessed = result
    except Exception as e:
        print(e)
        return None
    return preprocessed
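
# A minimal usage sketch (hypothetical input data): assumes each text_utils
# removal function takes a list of strings and returns a filtered list, which
# the fallback checks above imply but do not guarantee.
#
#     raw = ["The quick brown fox quickly jumped over Acme Corp"]
#     cleaned = dataset_NER_preprocess(raw)
#     if cleaned is not None:
#         print(cleaned)  # strings with stop words, verbs, adverbs,
#                         # adjectives, and special symbols removed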
def dataset_to_spacy(db, entity_label):
    """
    Bring a dataset to a spacy trainable state
    Args:
        dataset (list): list of strings for NER trainging
        entity_label (str): designated label for the Entity
    Returns:
        list: The spacy training ready list if Sucessful, None otherwise
    """
    train_data = []
    try:
        for word in db[entity_label]:
            # Skip entries that have no context examples.
            if not db[entity_label][word]['context']:
                continue
            for contexted_example in db[entity_label][word]['context']:
                entities = []
                if len(word.split(" ")) > 1:
                    # Multi-word entity: locate the whole phrase in the example.
                    start = contexted_example.lower().find(word.lower())
                    if start != -1:
                        entities.append((start, start + len(word), entity_label))
                else:
                    # Single-word entity: check each token span.
                    splits = list(split_with_indices(contexted_example))
                    for start, end in splits:
                        if word.lower() in contexted_example[start:end].lower():
                            entities.append((start, start + len(word), entity_label))
                train_data.append((contexted_example, {'entities': entities}))
    except Exception as e:
        print(e)
        return None
    return train_data
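
# A minimal usage sketch: the nested layout of `db` below is an assumption
# inferred from the db[entity_label][word]['context'] lookups above; the
# output follows the spaCy v2 training format (text, {'entities': [...]}).
#
#     db = {
#         'CITY': {
#             'paris': {'context': ['I flew to Paris last May']},
#         },
#     }
#     train_data = dataset_to_spacy(db, 'CITY')
#     # -> [('I flew to Paris last May', {'entities': [(10, 15, 'CITY')]})]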
if __name__ == '__main__':
    settings.init()
    parser = argparse.ArgumentParser(description='Annotator options')
    default_label = 'random_ent_' + str(time.time())
    parser.add_argument('-e', action="store", type=str, dest='entity_label',
                        help='designated entity label', default=default_label)
    args = parser.parse_args()
    couchdb = settings.couchdb
    dataset_to_spacy(couchdb, args.entity_label)