Added create_dictionary and main
This commit is contained in:
parent
12f1942371
commit
604732ed31
9061
dev-0/in.tsv
Normal file
9061
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
26
linear_regression.py
Normal file
26
linear_regression.py
Normal file
@ -0,0 +1,26 @@
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
import sklearn
|
||||
import pandas as pd
|
||||
import math
|
||||
import re
|
||||
|
||||
|
||||
|
||||
def create_dictionary(in_path):
    """Return every word token found in the text file at ``in_path``.

    The file is opened as UTF-8 and scanned line by line; each maximal run
    of word characters (letters, digits, underscore) becomes one token.
    Tokens are returned as a flat list in file order, duplicates included.
    """
    with open(in_path, encoding='utf-8') as source:
        return [
            token
            for row in source
            for token in re.findall(r"[\w]+", row)
        ]
|
||||
|
||||
|
||||
def main():
    """Fit a TF-IDF vectorizer on the training tokens and transform its vocabulary.

    Tokens are read from ``train/in.tsv`` via ``create_dictionary``, an
    English-stop-word ``TfidfVectorizer`` is fitted on them, and the fitted
    vocabulary is then passed back through ``transform``. The transformed
    result is neither returned nor stored beyond a local name.
    """
    training_tokens = create_dictionary("train/in.tsv")
    vectorizer = TfidfVectorizer(stop_words='english')

    # NOTE(review): every single token is handed to fit() as its own
    # "document" -- confirm whether whole lines were meant instead.
    fitted = vectorizer.fit(training_tokens)

    # NOTE(review): vocabulary_ is passed to transform() as-is; presumably
    # only its keys (the terms) are iterated -- verify this is intended.
    # The result is currently unused.
    _vocabulary_matrix = vectorizer.transform(fitted.vocabulary_)
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, so that
# importing the module (e.g. to reuse create_dictionary) has no side effects.
if __name__ == "__main__":
    main()
|
9082
test-A/in.tsv
Normal file
9082
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
281860
train/in.tsv
Normal file
281860
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user