TFIDF proj commit
parent d2b5466b05
commit 8420612d6f
dev-0/out.tsv (68888 lines changed): file diff suppressed because it is too large
l_regression.pkl (binary file not shown)
@@ -1,3 +1,4 @@
+import csv
 import pickle

 from sklearn.decomposition import TruncatedSVD
@@ -11,22 +12,16 @@ import re
 from sklearn.linear_model import LinearRegression


-def create_dictionary(in_path):
-    tfDict = []
-    i=0;
-    with open(in_path,encoding='utf-8') as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w]+",line):
-                tfDict.append(word)
-                i+=1
-                if(i>=50054):
-                    break
-    return tfDict
-##
-
 def train():
-    created_dictionary=create_dictionary("train/in.tsv")
-    expected_dictionary=create_dictionary("train/expected.tsv");
+    created_dictionary=pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    #Pandas - used for data analysis (everything that SQL, Excel, etc. make possible)
+    #delimiter - alternative argument for the separator, header - the row number to treat as the column names
+    #names - sets the column names; when the file has none, we pass header=None
+    #quoting - controls how fields are interpreted
+    created_dictionary = created_dictionary["txt"][:100000]
+    print(created_dictionary)
+    expected_dictionary=pd.read_csv("train/expected.tsv", header=None)
+    expected_dictionary = expected_dictionary[:100000]
     #tfidf = TfidfVectorizer(min_df=1,stop_words='english')
     tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Converts the text in the document into a tfidf matrix, ngram_range - number of words in a sequence
     x = tfidf.fit_transform(created_dictionary)
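Note on the hunk above: the new train() path reads the raw TSV with pandas and fits the vectorizer on the text column. A minimal, self-contained sketch of that pattern, assuming the same file layout as in the diff (train/in.tsv, a single unnamed text column):

import csv

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read a headerless, tab-separated file; quoting=csv.QUOTE_NONE keeps quote characters as plain text.
data = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
texts = data["txt"][:100000]   # same 100k cap as in the diff

# Learn the vocabulary and idf weights, and build the sparse document-term matrix.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x = tfidf.fit_transform(texts)
print(x.shape)   # (n_documents, n_features)

quoting=csv.QUOTE_NONE matters here because raw text lines may contain unbalanced quote characters that would otherwise confuse the CSV parser.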
@@ -41,4 +36,5 @@ def train():
     with open('tfidf_model.pkl', 'wb') as f:
         pickle.dump(tfidf,f)

+
 train()
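The hunk above persists the fitted TfidfVectorizer so predict.py can reuse the same vocabulary and idf weights. The intermediate hunks of this file are suppressed, so the regression-training step is not visible here; the sketch below only illustrates the pickle round trip that the diff does show, with a toy corpus standing in for the real train/in.tsv text:

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, a stand-in for the training text column.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf.fit(["first example document", "second example document"])

# Persist the fitted vectorizer, as the hunk above does.
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Load it back the way predict.py does; the vocabulary and idf weights are reused as-is.
with open('tfidf_model.pkl', 'rb') as f:
    tfidf_loaded = pickle.load(f)
print(tfidf_loaded.transform(["another example document"]).shape)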
predict.py (23 lines changed)
@@ -1,29 +1,28 @@
+import csv
 import pickle
 from typing import re

 import numpy as np
+import pandas as pd
 from sklearn.decomposition import PCA
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD

-def create_dictionary(in_path):
-    tfDict = []
-    with open(in_path,encoding='utf-8') as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w]+",line):
-                tfDict.append(word)
-    return tfDict

 def predict():
     input_file = open("l_regression.pkl",'rb')
     l_regression = pickle.load(input_file)
     input_file = open("tfidf_model.pkl",'rb')
     tfidf = pickle.load(input_file)

-    dev0 = create_dictionary("dev-0/in.tsv")
-    testA = create_dictionary("test-A/in.tsv")
-    dev0_vector = tfidf.fit_transform(dev0)
-    testA_vector = tfidf.fit_transform(testA)
+    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    devtxt = dev0["txt"]
+    testAtxt = testA["txt"]
+    print(testAtxt)
+
+    dev0_vector = tfidf.fit_transform(devtxt)
+    testA_vector = tfidf.fit_transform(testAtxt)

     #print(testA_vector)
     pca = TruncatedSVD(n_components=100)
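One thing worth flagging in the new predict(): it loads tfidf_model.pkl but then calls fit_transform on the dev-0 and test-A text, which re-learns the vocabulary from that data instead of reusing the vectorizer fitted at training time, so the resulting features may not match what l_regression was trained on. If the saved vectorizer is meant to be reused, scikit-learn's transform is the usual call; a sketch under that assumption (this swaps in transform and is not part of the commit):

import csv
import pickle

import pandas as pd

# Reuse the vectorizer fitted at training time.
with open("tfidf_model.pkl", "rb") as f:
    tfidf = pickle.load(f)

dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)

# transform() maps new text onto the training vocabulary and idf weights;
# fit_transform() would re-learn them from dev-0 and change the feature space.
dev0_vector = tfidf.transform(dev0["txt"])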
test-A/out.tsv (68888 lines changed): file diff suppressed because it is too large
tfidf_model.pkl (binary file not shown)