Added create_dictionary and main

This commit is contained in:
Bartusiak 2020-04-19 19:30:57 +02:00
parent 12f1942371
commit 604732ed31
4 changed files with 300029 additions and 0 deletions

9061
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

26
linear_regression.py Normal file
View File

@@ -0,0 +1,26 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
import pandas as pd
import math
import re
def create_dictionary(in_path):
    """Collect every word token found in a UTF-8 encoded text file.

    The file is read line by line and each line is scanned for maximal
    runs of word characters (letters, digits and underscore).

    Args:
        in_path: Path of the text file to read.

    Returns:
        A list with all tokens in the order they occur in the file.
        Repeated words are kept; tokens are neither deduplicated nor
        case-folded. An empty file yields an empty list.
    """
    # Compile once and bind the method so the per-line loop does no lookups.
    find_words = re.compile(r"[\w]+").findall
    tokens = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            tokens.extend(find_words(line))
    return tokens
def main():
    """Fit a TF-IDF vectorizer on the tokens of ``train/in.tsv``.

    The token list produced by ``create_dictionary`` is passed to
    ``TfidfVectorizer.fit`` (English stop words removed), and the learned
    vocabulary is then run back through ``transform``. Nothing is returned
    or written; the resulting matrix is currently unused.
    """
    created_dictionary = create_dictionary("train/in.tsv")
    tfidf = TfidfVectorizer(stop_words='english')
    # fit() returns the vectorizer itself, so no separate alias is kept.
    # NOTE(review): every element of created_dictionary is a single word, so
    # each word is presumably treated as its own document - confirm intended.
    tfidf.fit(created_dictionary)
    # Iterating the vocabulary_ mapping yields its terms, so each learned
    # term is vectorized as a one-word document.
    # TODO: the matrix is not used yet.
    term_matrix = tfidf.transform(tfidf.vocabulary_)
# Script entry point: main() is called unconditionally when this module is
# loaded (there is no ``__name__ == "__main__"`` guard).
main()

9082
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

281860
train/in.tsv Normal file

File diff suppressed because one or more lines are too long