Source code for text_utils

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from fast_utils import flatten
from fuzzywuzzy import fuzz

# English stop-word list, fetched from NLTK once at import time so that
# stop_word_remove does not reload it on every call.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed locally -- confirm for the deployment environment.
stop_words_cache = stopwords.words("english")

def fuzzy_word_remove(word_list, threshold=75):
    """Remove words that closely resemble an earlier word in the list.

    Words are scanned in order. A word is kept only when its
    ``fuzz.token_sort_ratio`` score against every already-kept word is at
    most ``threshold``, so the first word of each group of look-alikes
    survives. The input list is never modified.

    Args:
        word_list (list): Arbitrary list of words.
        threshold (int): Score above which two words are considered alike.
            Defaults to 75.

    Returns:
        list: A new list with the look-alike words dropped. A falsy input
        (``None`` or an empty list) is returned unchanged. If the
        similarity scoring raises, the error is printed and the words
        kept so far are returned.
    """
    if not word_list:
        return word_list

    kept = []
    try:
        for word in word_list:
            # Compare only against words already kept: this avoids removing
            # items from a list while indexing into it, and also covers the
            # last element of the input.
            if not any(
                fuzz.token_sort_ratio(word, seen) > threshold for seen in kept
            ):
                kept.append(word)
    except Exception as e:
        # Best effort, as before: report the problem and return what we have.
        print(e)
    return kept
def text_segmentator(full_text):
    """Segment texts word by word after removing stopwords.

    Every text is stripped of stopwords, split into sentences, then into
    word tokens; the tokens of all texts are flattened into one sequence
    and passed through ``special_symbols_remove``.

    Args:
        full_text (list): List of text strings. A single string is treated
            as a one-element list.

    Returns:
        list: The segmented words of all texts, or None when the input is
        empty, contains only ``None``/``''`` entries, or processing fails.
    """
    try:
        if not full_text:
            return None
        if isinstance(full_text, str):
            # Iterating a bare string would process it character by character.
            full_text = [full_text]
        if all(x in [None, ''] for x in full_text):
            return None

        cleaned_texts = stop_word_remove(full_text)
        if cleaned_texts is None:
            # stop_word_remove has already reported its own failure.
            return None

        sentences = []
        for text in cleaned_texts:
            sentences.extend(sent_tokenize(text))
        words = flatten([word_tokenize(sent) for sent in sentences])
        return special_symbols_remove(words)
    except Exception as e:
        print("text_segmentator error: " + str(e))
        return None


def stop_word_remove(full_text_list):
    """Remove stopwords from a list of texts and return the list.

    Falsy entries (``None``, ``''``) are skipped, so the result can be
    shorter than the input. Matching against ``stop_words_cache`` is exact
    (case-sensitive) on whitespace-separated words.

    Args:
        full_text_list (list): List of text strings.

    Returns:
        list: One cleaned string per non-empty input text, or None if
        processing fails.
    """
    try:
        full_list = []
        for text in full_text_list:
            if text:
                kept_words = [
                    word for word in text.split()
                    if word not in stop_words_cache
                ]
                full_list.append(" ".join(kept_words))
    except Exception as e:
        print(e)
        return None
    # Return the accumulated list, not just the last cleaned string.
    return full_list
def _remove_words_by_pos_tag(full_text_list, tag):
    """Drop every word whose POS tag contains ``tag`` from each text.

    Each whitespace-separated word is tagged on its own with
    ``pos_tag([word])``, so the tagger sees no sentence context. The match
    is a substring test on the returned tag string.

    Args:
        full_text_list (list): List of text strings; falsy entries are
            skipped.
        tag (str): Tag fragment to filter on (e.g. ``"RB"``).

    Returns:
        list: One filtered string per non-empty input text, or None if
        processing fails.
    """
    try:
        full_list = []
        for text in full_text_list:
            if text:
                kept_words = [
                    word for word in text.split()
                    if tag not in pos_tag([word])[0][1]
                ]
                full_list.append(" ".join(kept_words))
    except Exception as e:
        print(e)
        return None
    return full_list


def adverb_remove(full_text_list):
    """Remove adverbs from a list of texts and return the list.

    Args:
        full_text_list (list): List of text strings.

    Returns:
        list: The texts with every word whose POS tag contains ``"RB"``
        removed (empty/None texts are skipped), or None if processing
        fails.
    """
    return _remove_words_by_pos_tag(full_text_list, "RB")


def verb_remove(full_text_list):
    """Remove verbs from a list of texts and return the list.

    Args:
        full_text_list (list): List of text strings.

    Returns:
        list: The texts with every word whose POS tag contains ``"VB"``
        removed (empty/None texts are skipped), or None if processing
        fails.
    """
    return _remove_words_by_pos_tag(full_text_list, "VB")


def adjective_remove(full_text_list):
    """Remove adjectives from a list of texts and return the list.

    Args:
        full_text_list (list): List of text strings.

    Returns:
        list: The texts with every word whose POS tag contains ``"JJ"``
        removed (empty/None texts are skipped), or None if processing
        fails.
    """
    return _remove_words_by_pos_tag(full_text_list, "JJ")
def special_symbols_remove(full_text_list, keep_chars=('-', '.', ',')):
    """Remove special symbols from a list of texts and return the list.

    Only alphanumeric characters and the characters in ``keep_chars``
    survive. Texts that end up empty are dropped from the result, and
    ``None``/empty entries are skipped instead of aborting the whole run.

    Args:
        full_text_list (list): List of text strings.
        keep_chars (iterable): Non-alphanumeric characters to preserve.
            Defaults to ``-``, ``.`` and ``,``.

    Returns:
        list: The cleaned, non-empty strings in their original order. If
        an entry cannot be processed the error is printed and the strings
        cleaned so far are returned; a ``None`` input yields ``[]``.
    """
    new_value = []
    if full_text_list is None:
        return new_value

    allowed = set(keep_chars)  # built once instead of once per character
    try:
        for text in full_text_list:
            if not text:
                # A None or '' entry must not cut off the remaining texts.
                continue
            new_string = ''.join(
                ch for ch in text if ch.isalnum() or ch in allowed
            )
            if new_string:
                new_value.append(new_string)
    except Exception as e:
        # Best effort, as before: report and return what was collected.
        print(e)
    return new_value