import os
import time
from itertools import groupby
import re
import traceback
def log_to_text(full_metadata_list, file_name):
    """Append the string form of ``full_metadata_list`` to ``<file_name>.txt``.

    The file is created when it does not exist yet and appended to otherwise.
    Text is written as UTF-8 with the ``surrogateescape`` error handler.

    Args:
        full_metadata_list: Object to log; it is converted with ``str()``.
        file_name: Target path without the ``.txt`` extension.

    Returns:
        None. An ``IOError`` raised while opening or writing is printed to
        stdout instead of being propagated.
    """
    try:
        # Append mode creates a missing file, so no separate "create" branch
        # is needed, and the context manager closes the handle on every path
        # (the previous append branch left it open).
        with open(file_name + ".txt", "a", encoding="utf-8",
                  errors="surrogateescape") as log_file:
            log_file.write(str(full_metadata_list))
    except IOError as e:
        print(e)
        # NOTE(review): the literal 1 is still printed after the error text,
        # as before; confirm whether anything relies on this marker.
        print(1)
def getopts(argv):
    """Collect ``-name value`` pairs from an argument list.

    Every argument whose first character is ``-`` becomes a key, and the
    argument immediately after it becomes the value. The scan advances one
    argument at a time, so in ``["-a", "-b", "c"]`` the value of ``-a`` is
    ``"-b"`` and ``-b`` is itself recorded with the value ``"c"``.

    Args:
        argv: Sequence of argument strings (for example ``sys.argv``).

    Returns:
        dict: Maps each option name to the argument that follows it, or to
        ``None`` when the option is the last argument and has no value.
    """
    opts = {}
    if not argv:
        return opts
    last_index = len(argv) - 1
    for index, arg in enumerate(argv):
        # Slice instead of indexing so an empty-string argument is skipped
        # rather than raising IndexError.
        if arg[:1] == '-':
            # A trailing option has nothing after it; record None instead of
            # failing on the missing value.
            opts[arg] = argv[index + 1] if index < last_index else None
    return opts
def remove_special_symbols(value):
    """Strip unwanted characters from every string in ``value``.

    A character is kept when it is alphanumeric or one of ``@``, ``.``,
    ``,`` and ``!``. Strings that end up empty after filtering are dropped.

    Args:
        value: Iterable of strings to clean.

    Returns:
        list: The cleaned, non-empty strings in their original order.
    """
    # Built once; the previous code copied this list for every character.
    keep_chars = ('@', '.', ',', '!')
    cleaned = []
    for string in value:
        new_string = ''.join(
            ch for ch in string if ch.isalnum() or ch in keep_chars
        )
        if new_string:
            cleaned.append(new_string)
    return cleaned
def flatten(chunkList):
    """Flatten one level of nesting.

    Args:
        chunkList: Iterable whose items are themselves iterables.

    Returns:
        list: Every inner element, in encounter order.
    """
    flat = []
    for chunk in chunkList:
        flat.extend(chunk)
    return flat
def split_with_indices(s, c=' '):
    """Yield ``(start, end)`` positions of the runs in ``s`` separated by ``c``.

    Consecutive items equal to ``c`` act as a single separator and produce
    no output. ``end`` is exclusive, so ``s[start:end]`` is the run itself.

    Args:
        s: Sequence to scan (typically a string).
        c: Separator item; defaults to a single space.

    Yields:
        tuple: ``(start, end)`` for each maximal run of non-separator items.
    """
    start = 0
    for is_separator, run in groupby(s, lambda item: item == c):
        end = start + len(list(run))
        if not is_separator:
            yield start, end
        start = end
def list_segmentor(seq, size):
    """Cut ``seq`` into ``size`` consecutive slices of near-equal length.

    Slice boundaries are the rounded multiples of ``len(seq) / size``, so
    the slices are contiguous and together cover the whole sequence.

    Args:
        seq: Sliceable sequence to divide.
        size: Number of slices to produce; values below 1 yield no slices.

    Returns:
        list: ``size`` slices of ``seq``, in order.
    """
    splitsize = 1.0 / max(1, size) * len(seq)
    # One boundary more than there are segments; neighbours form each slice.
    bounds = [int(round(step * splitsize)) for step in range(size + 1)]
    return [seq[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
def exact_word_match(word, raw_sentence):
    """Report whether ``word`` occurs in ``raw_sentence`` as a standalone token.

    A match requires ``word`` to be preceded by the start of the text or a
    non-word character, and followed by the end of the text or a non-word
    character. Matching is case-insensitive.

    Args:
        word: Literal text to look for.
        raw_sentence: Text to search in.

    Returns:
        bool: True when at least one delimited occurrence exists. False
        otherwise, including when matching fails with an exception (for
        example a non-string argument); the traceback is then printed.
    """
    try:
        # Escape the word so characters such as "+" or "." are matched
        # literally instead of being read as regex syntax ("c++" previously
        # failed to compile, "a.b" matched "axb").
        regexp_pattern = r"(?:^|\W)" + re.escape(word) + r"(?:$|\W)"
        found = re.search(regexp_pattern, raw_sentence, flags=re.IGNORECASE)
    except Exception:
        # Deliberate best-effort: callers get False rather than an exception.
        print(str(traceback.format_exc()))
        return False
    return found is not None
def replace_all(text, dic):
    """Apply every ``old -> new`` substitution in ``dic`` to ``text``.

    Substitutions run one after another in the mapping's iteration order,
    so a later entry also sees text produced by an earlier one.

    Args:
        text: String to transform.
        dic: Mapping of substring to its replacement.

    Returns:
        str: ``text`` after all substitutions.
    """
    result = text
    for old, new in dic.items():
        result = result.replace(old, new)
    return result