TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 16:02:39 +02:00
parent d2b5466b05
commit 8420612d6f
6 changed files with 17718 additions and 120107 deletions

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,3 +1,4 @@
import csv
import pickle
from sklearn.decomposition import TruncatedSVD
@ -11,22 +12,16 @@ import re
from sklearn.linear_model import LinearRegression
def create_dictionary(in_path):
tfDict = []
i=0;
with open(in_path,encoding='utf-8') as in_file:
for line in in_file:
for word in re.findall(r"[\w]+",line):
tfDict.append(word)
i+=1
if(i>=50054):
break
return tfDict
##
def train():
created_dictionary=create_dictionary("train/in.tsv")
expected_dictionary=create_dictionary("train/expected.tsv");
created_dictionary=pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
# pandas is a data-analysis library (it covers what SQL, Excel, etc. are typically used for).
# delimiter - alias of `sep`, the field separator; header - row number to use as the column names (None: the file has no header row).
# names - the column names to assign, needed here because the file has no header row.
# quoting - controls how quote characters in fields are interpreted (csv.QUOTE_NONE: no special treatment).
created_dictionary = created_dictionary["txt"][:100000]
print(created_dictionary)
expected_dictionary=pd.read_csv("train/expected.tsv", header=None)
expected_dictionary = expected_dictionary[:100000]
#tfidf = TfidfVectorizer(min_df=1,stop_words='english')
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary)
@ -41,4 +36,5 @@ def train():
with open('tfidf_model.pkl', 'wb') as f:
pickle.dump(tfidf,f)
train()

View File

@ -1,29 +1,28 @@
import csv
import pickle
from typing import re
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
def create_dictionary(in_path):
    """Return every word token found in the text file at ``in_path``.

    The file is read as UTF-8, line by line. Each line is tokenized with
    the regular expression ``[\\w]+`` and the matches are collected in
    file order; duplicates are kept.

    Args:
        in_path: Path of the text file to tokenize.

    Returns:
        A list of the matched word strings (empty for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Imported locally so the function always uses the regex module: this
    # file's header has ``from typing import re``, and that name offers no
    # ``findall`` (and no longer exists in Python 3.13+).
    import re

    words = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            words.extend(re.findall(r"[\w]+", line))
    return words
def predict():
input_file = open("l_regression.pkl",'rb')
l_regression = pickle.load(input_file)
input_file = open("tfidf_model.pkl",'rb')
tfidf = pickle.load(input_file)
dev0 = create_dictionary("dev-0/in.tsv")
testA = create_dictionary("test-A/in.tsv")
dev0_vector = tfidf.fit_transform(dev0)
testA_vector = tfidf.fit_transform(testA)
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
devtxt = dev0["txt"]
testAtxt = testA["txt"]
print(testAtxt)
dev0_vector = tfidf.fit_transform(devtxt)
testA_vector = tfidf.fit_transform(testAtxt)
#print(testA_vector)
pca = TruncatedSVD(n_components=100)

File diff suppressed because it is too large Load Diff

Binary file not shown.