TFIDF proj commit
parent d2b5466b05
commit 8420612d6f
dev-0/out.tsv (68888 lines changed): file diff suppressed because it is too large
l_regression.pkl (binary file not shown)
@@ -1,3 +1,4 @@
+import csv
 import pickle

 from sklearn.decomposition import TruncatedSVD
@@ -11,22 +12,16 @@ import re
 from sklearn.linear_model import LinearRegression


-def create_dictionary(in_path):
-    tfDict = []
-    i=0;
-    with open(in_path,encoding='utf-8') as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w]+",line):
-                tfDict.append(word)
-                i+=1
-                if(i>=50054):
-                    break
-    return tfDict
-##
-
 def train():
-    created_dictionary=create_dictionary("train/in.tsv")
-    expected_dictionary=create_dictionary("train/expected.tsv");
+    created_dictionary=pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    #Pandas - used for data analysis (everything that SQL, Excel, etc. make possible)
+    #delimiter - alternative argument for the separator, header - the row number to treat as the column names
+    #names - sets the column names; when the file has none, we pass header=None
+    #quoting - controls how fields are interpreted
+    created_dictionary = created_dictionary["txt"][:100000]
+    print(created_dictionary)
+    expected_dictionary=pd.read_csv("train/expected.tsv", header=None)
+    expected_dictionary = expected_dictionary[:100000]
     #tfidf = TfidfVectorizer(min_df=1,stop_words='english')
     tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Converts the text in the document into a tfidf matrix, ngram_range - number of words in a sequence
     x = tfidf.fit_transform(created_dictionary)
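Note on the hunk above: the new train() path reads the raw TSV with pandas and fits the vectorizer on the text column. A minimal, self-contained sketch of that pattern, assuming the same file layout as in the diff (train/in.tsv, a single unnamed text column):

import csv

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read a headerless, tab-separated file; quoting=csv.QUOTE_NONE keeps quote characters as plain text.
data = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
texts = data["txt"][:100000]   # same 100k cap as in the diff

# Learn the vocabulary and idf weights, and build the sparse document-term matrix.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x = tfidf.fit_transform(texts)
print(x.shape)   # (n_documents, n_features)

quoting=csv.QUOTE_NONE matters here because raw text lines may contain unbalanced quote characters that would otherwise confuse the CSV parser.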
@@ -41,4 +36,5 @@ def train():
     with open('tfidf_model.pkl', 'wb') as f:
         pickle.dump(tfidf,f)

+
 train()
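The hunk above persists the fitted TfidfVectorizer so predict.py can reuse the same vocabulary and idf weights. The intermediate hunks of this file are suppressed, so the regression-training step is not visible here; the sketch below only illustrates the pickle round trip that the diff does show, with a toy corpus standing in for the real train/in.tsv text:

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, a stand-in for the training text column.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf.fit(["first example document", "second example document"])

# Persist the fitted vectorizer, as the hunk above does.
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Load it back the way predict.py does; the vocabulary and idf weights are reused as-is.
with open('tfidf_model.pkl', 'rb') as f:
    tfidf_loaded = pickle.load(f)
print(tfidf_loaded.transform(["another example document"]).shape)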
predict.py (23 lines changed)
@@ -1,29 +1,28 @@
+import csv
 import pickle
 from typing import re

 import numpy as np
+import pandas as pd
 from sklearn.decomposition import PCA
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD

-def create_dictionary(in_path):
-    tfDict = []
-    with open(in_path,encoding='utf-8') as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w]+",line):
-                tfDict.append(word)
-    return tfDict

 def predict():
     input_file = open("l_regression.pkl",'rb')
     l_regression = pickle.load(input_file)
     input_file = open("tfidf_model.pkl",'rb')
     tfidf = pickle.load(input_file)

-    dev0 = create_dictionary("dev-0/in.tsv")
-    testA = create_dictionary("test-A/in.tsv")
-    dev0_vector = tfidf.fit_transform(dev0)
-    testA_vector = tfidf.fit_transform(testA)
+    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
+    devtxt = dev0["txt"]
+    testAtxt = testA["txt"]
+    print(testAtxt)
+
+    dev0_vector = tfidf.fit_transform(devtxt)
+    testA_vector = tfidf.fit_transform(testAtxt)

     #print(testA_vector)
     pca = TruncatedSVD(n_components=100)
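One thing worth flagging in the new predict(): it loads tfidf_model.pkl but then calls fit_transform on the dev-0 and test-A text, which re-learns the vocabulary from that data instead of reusing the vectorizer fitted at training time, so the resulting features may not match what l_regression was trained on. If the saved vectorizer is meant to be reused, scikit-learn's transform is the usual call; a sketch under that assumption (this swaps in transform and is not part of the commit):

import csv
import pickle

import pandas as pd

# Reuse the vectorizer fitted at training time.
with open("tfidf_model.pkl", "rb") as f:
    tfidf = pickle.load(f)

dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)

# transform() maps new text onto the training vocabulary and idf weights;
# fit_transform() would re-learn them from dev-0 and change the feature space.
dev0_vector = tfidf.transform(dev0["txt"])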
test-A/out.tsv (68888 lines changed): file diff suppressed because it is too large
tfidf_model.pkl (binary file not shown)