#!/usr/bin/python3
import sys
import pickle
from tokenizator import tokenize
def main():
    """Read labeled documents from stdin and pickle a vocabulary model.

    Input format: one document per line, ``label<TAB>text``; lines with
    fewer than two tab-separated fields are skipped. At most 15001
    documents are consumed (hard cap on the stream).

    Writes a tuple ``(dictionary, documents, document_terms,
    word_to_index_mapping, train_y)`` to ``dictionary.pkl`` where:
      - dictionary: set of all distinct terms seen
      - documents: list of raw document texts
      - document_terms: dict mapping document index -> list of its terms
      - word_to_index_mapping: dict mapping term -> 1-based integer index
      - train_y: list of integer labels, parallel to ``documents``
    """
    dictionary = set()
    train_y = []
    documents = []
    document_terms = {}

    counter = 0
    for line in sys.stdin:
        # Hard cap on the number of training documents read.
        if counter > 15000:
            break
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Malformed line (no tab-separated label/text pair): skip it.
            continue
        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))

        terms = tokenize(document)
        document_terms[counter] = terms
        # Bulk-add instead of one .add() per term.
        dictionary.update(terms)
        counter += 1

    # Assign each term a stable 1-based index (iteration order of the set).
    word_to_index_mapping = {w: i for i, w in enumerate(dictionary, start=1)}

    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    # Use a context manager so the file handle is closed deterministically
    # (the original pickle.dump(..., open(...)) leaked the handle).
    with open('dictionary.pkl', 'wb') as fh:
        pickle.dump(model, fh)
# Script entry point: only run when executed directly, not on import.
if __name__ == '__main__':
    main()
|