Created vocabulary
This commit is contained in:
parent
a546cd9958
commit
2dcb39fdde
@ -4,19 +4,31 @@ import pickle
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
vocabulary=[]
|
vocabulary=[]
|
||||||
file_to_save=open("test.tsv","wb")
|
#word_to_index_mapping={}
|
||||||
|
#index_to_word_mapping={}
|
||||||
|
|
||||||
def define_vocabulary(file_to_learn_new_words):
|
file_to_save=open("test.tsv","w",encoding='utf-8')
|
||||||
with open(file_to_learn_new_words,encoding='utf-8') as file:
|
def define_vocabulary(file_to_learn_new_words,expected_path):
|
||||||
for line in file:
|
word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
|
||||||
#for word in re.findall(r"([a-zA-Z\-]+)", line):
|
with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
|
||||||
for word in line.split():
|
for line, exp in zip(in_file, expected_file):
|
||||||
vocabulary.append(word)
|
class_ = exp.rstrip('\n').replace(' ', '')
|
||||||
return vocabulary
|
text, timestamp = line.rstrip('\n').split('\t')
|
||||||
|
tokens = text.lower().split(' ')
|
||||||
|
for token in tokens:
|
||||||
|
if class_ == 'P':
|
||||||
|
word_counts['paranormal'][token] += 1
|
||||||
|
elif class_ == 'S':
|
||||||
|
word_counts['skeptic'][token] += 1
|
||||||
|
return word_counts
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
vocabulary=define_vocabulary('train/in.tsv')
|
vocabulary=define_vocabulary('train/in.tsv','train/expected.tsv')
|
||||||
file_to_save=vocabulary
|
ix=1
|
||||||
|
#for word in vocabulary:
|
||||||
|
#word_to_index_mapping(word)=ix
|
||||||
|
#index_to_word_mapping(ix)=word
|
||||||
|
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user