paranormal-or-skeptic3/create_dictionary.py

47 lines
991 B
Python
Raw Normal View History

2020-12-15 16:40:10 +01:00
#!/usr/bin/python3
import sys
import pickle
from tokenizator import tokenize
def main():
    """Build a term dictionary from labelled documents on stdin and pickle it.

    Each input line is expected to hold a label and a document separated by a
    tab. Lines with fewer than two tab-separated fields are skipped. The label
    is converted with ``int`` (a non-integer label raises ``ValueError``) and
    the document is passed to ``tokenize``.

    The result is written to ``dictionary.pkl`` as a tuple of
    ``(dictionary, documents, document_terms, word_to_index_mapping, train_y)``
    where:

    * ``dictionary`` is the set of all terms seen,
    * ``documents`` is the list of raw document strings,
    * ``document_terms`` maps the 0-based index of each accepted document to
      the value returned by ``tokenize`` for it,
    * ``word_to_index_mapping`` maps each term to a 1-based index,
    * ``train_y`` is the list of integer labels, parallel to ``documents``.
    """
    dictionary = set()
    train_y = []
    documents = []
    document_terms = {}
    counter = 0

    for line in sys.stdin:
        # The check happens before processing, so documents numbered
        # 0..15000 (up to 15001 in total) are accepted.
        if counter > 15000:
            break

        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Malformed line: no label/document pair to read.
            continue

        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))

        terms = tokenize(document)
        document_terms[counter] = terms
        dictionary.update(terms)
        counter += 1

    # Indices start at 1; the order follows the set's iteration order.
    word_to_index_mapping = {
        word: index for index, word in enumerate(dictionary, start=1)
    }

    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    # Use a context manager so the file is flushed and closed deterministically.
    with open('dictionary.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
# Run the dictionary build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()