TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 16:02:39 +02:00
parent d2b5466b05
commit 8420612d6f
6 changed files with 17718 additions and 120107 deletions

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,3 +1,4 @@
import csv
import pickle import pickle
from sklearn.decomposition import TruncatedSVD from sklearn.decomposition import TruncatedSVD
@@ -11,22 +12,16 @@ import re
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
def create_dictionary(in_path, max_lines=50054):
    """Collect the word tokens found in the leading lines of a text file.

    The file is opened as UTF-8 and read line by line. Every run of word
    characters (regex ``[\\w]+``) is appended to the result in the order it
    appears, so duplicates are kept.

    Args:
        in_path: Path of the text file to read.
        max_lines: Maximum number of lines to read from the start of the
            file. Defaults to 50054, the cap that used to be hard-coded.
            Pass ``None`` to read the whole file.

    Returns:
        A list of all tokens found in the lines that were read.
    """
    words = []
    with open(in_path, encoding='utf-8') as in_file:
        for line_index, line in enumerate(in_file):
            # Stop before tokenizing anything past the requested cap.
            if max_lines is not None and line_index >= max_lines:
                break
            words.extend(re.findall(r"[\w]+", line))
    return words
##
def train(): def train():
created_dictionary=create_dictionary("train/in.tsv") created_dictionary=pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
expected_dictionary=create_dictionary("train/expected.tsv"); #Pandas - służy do analizy danych (wszystko to co umożliwia SQL, Excel itd.)
#delimiter - alternatywny argument do separacji, header - numer wiersza, który ma być traktowany jako nazwa
#names - określenie nazw kolumn jak ich nie ma dajemy None
#quoting - służy do interpretacji pól
created_dictionary = created_dictionary["txt"][:100000]
print(created_dictionary)
expected_dictionary=pd.read_csv("train/expected.tsv", header=None)
expected_dictionary = expected_dictionary[:100000]
#tfidf = TfidfVectorizer(min_df=1,stop_words='english') #tfidf = TfidfVectorizer(min_df=1,stop_words='english')
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary) x = tfidf.fit_transform(created_dictionary)
@@ -41,4 +36,5 @@ def train():
with open('tfidf_model.pkl', 'wb') as f: with open('tfidf_model.pkl', 'wb') as f:
pickle.dump(tfidf,f) pickle.dump(tfidf,f)
train() train()

View File

@@ -1,29 +1,28 @@
import csv
import pickle import pickle
from typing import re from typing import re
import numpy as np import numpy as np
import pandas as pd
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD from sklearn.decomposition import TruncatedSVD
def create_dictionary(in_path):
    """Collect every word token found in a text file.

    The file is opened as UTF-8 and read line by line. Every run of word
    characters (regex ``[\\w]+``) is returned in the order it appears, so
    duplicates are kept.

    Args:
        in_path: Path of the text file to read.

    Returns:
        A list of all tokens found in the file.
    """
    # This file imports ``re`` from ``typing``, which is not the regex
    # module and has no ``findall``. Import the real one locally so the
    # tokenization works regardless of that module-level name.
    import re

    with open(in_path, encoding='utf-8') as in_file:
        return [
            word
            for line in in_file
            for word in re.findall(r"[\w]+", line)
        ]
def predict(): def predict():
input_file = open("l_regression.pkl",'rb') input_file = open("l_regression.pkl",'rb')
l_regression = pickle.load(input_file) l_regression = pickle.load(input_file)
input_file = open("tfidf_model.pkl",'rb') input_file = open("tfidf_model.pkl",'rb')
tfidf = pickle.load(input_file) tfidf = pickle.load(input_file)
dev0 = create_dictionary("dev-0/in.tsv") dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
testA = create_dictionary("test-A/in.tsv") testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
dev0_vector = tfidf.fit_transform(dev0)
testA_vector = tfidf.fit_transform(testA) devtxt = dev0["txt"]
testAtxt = testA["txt"]
print(testAtxt)
dev0_vector = tfidf.fit_transform(devtxt)
testA_vector = tfidf.fit_transform(testAtxt)
#print(testA_vector) #print(testA_vector)
pca = TruncatedSVD(n_components=100) pca = TruncatedSVD(n_components=100)

File diff suppressed because it is too large Load Diff

Binary file not shown.