26 lines
679 B
Python
26 lines
679 B
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
import sklearn
|
|
import pandas as pd
|
|
import math
|
|
import re
|
|
|
|
|
|
|
|
def create_dictionary(in_path):
    """Collect every word token from a UTF-8 text file, in reading order.

    The file is read line by line and each line is scanned for maximal runs
    of regex "word" characters. Tokens are returned exactly as they appear:
    duplicates are kept and casing is not changed.

    Args:
        in_path: Path of the text file to read; it is opened as UTF-8.

    Returns:
        A list of token strings in file order. The list is empty when the
        file contains no word characters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Compile once and bind the bound method so the per-line work is a
    # single call instead of a pattern lookup on every iteration.
    find_words = re.compile(r"[\w]+").findall

    words = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            words.extend(find_words(line))
    return words
|
|
|
|
|
|
def main(in_path="train/in.tsv"):
    """Fit a TF-IDF vectorizer on the tokens of a file and vectorize its vocabulary.

    The tokens returned by ``create_dictionary(in_path)`` are used as the
    training corpus of a ``TfidfVectorizer`` configured with
    ``stop_words='english'``. The learned vocabulary terms are then passed
    back through ``transform``.

    Args:
        in_path: Path of the text file to tokenize. Defaults to
            ``"train/in.tsv"``, the path that was previously hard-coded.

    Returns:
        The value returned by ``TfidfVectorizer.transform`` for the learned
        vocabulary terms. Earlier versions computed this value and then
        discarded it.
    """
    tokens = create_dictionary(in_path)

    # NOTE(review): every token is handed to the vectorizer as its own
    # one-word document -- confirm this is intended rather than one
    # document per input line.
    tfidf = TfidfVectorizer(stop_words='english')

    # NOTE(review): scikit-learn is expected to raise ValueError here when
    # no term survives stop-word filtering (e.g. an empty file) -- confirm
    # whether callers need that case handled.
    tfidf.fit(tokens)

    # Pass the vocabulary terms explicitly. The mapping itself used to be
    # passed, which only worked because iterating a dict yields its keys.
    return tfidf.transform(list(tfidf.vocabulary_))
|
|
|
|
# Run the TF-IDF pipeline. The call is deliberately left unguarded so that
# the module behaves the same on import as it does when executed directly.
main()