TFIDF proj commit

2020-05-05 19:28:18 +02:00 · 2020-05-05 19:28:18 +02:00 · f3b94430e8
commit f3b94430e8
parent d53ce23da2
8 changed files with 200016 additions and 200009 deletions
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
@@ -99997,4 +99997,4 @@ Why is it that in the US people are just picking up random disguises that are no
 5429-7980-8121
 If I was a Bayern fan I'd be pumped that a player was so enthusiastic about coming to the club that he's doing this. Start of a beautiful relationship, hopefully.
 Honestly I enjoyed it quite a lot. Despite my getting tired of HG's, I found it to be a fun few hours.
-I have absolutely no idea! Vine?
+I have absolutely no idea! Vine?
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/l_regression.pkl
+++ b/l_regression.pkl
--- a/linear_regression.py
+++ b/linear_regression.py
@@ -26,7 +26,7 @@ def train():
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
    x = tfidf.fit_transform(created_dictionary)
    #PCA - principal component analysis
-    pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
+    pca = TruncatedSVD(n_components=120) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
    x_pca = pca.fit_transform(x)
    l_regression = LinearRegression()
    l_regression.fit(x_pca,expected_dictionary)
--- a/predict.py
+++ b/predict.py
@@ -13,20 +13,23 @@ def predict():
    input_file = open("l_regression.pkl",'rb')
    l_regression = pickle.load(input_file)
    input_file = open("tfidf_model.pkl",'rb')
-    tfidf = pickle.load(input_file)
+    tfidf = pickle.load(input_file,encoding='UTF-8')

-    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False)
-    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False)
-    devtxt = dev0["txt"]
-    testAtxt = testA["txt"]
+    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)
+    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False )
+    #devtxt = dev0["txt"]
+    #testAtxt = testA["txt"]

-    print(testAtxt)
-
-    dev0_vector = tfidf.fit_transform(devtxt)
-    testA_vector = tfidf.fit_transform(testAtxt)
+    #print(testAtxt)

+    dev0_vector = tfidf.fit_transform(dev0['txt'].apply(lambda dev0_vector: np.str_(dev0_vector)))
+    testA_vector = tfidf.fit_transform(testA['txt'].apply(lambda testA_vector: np.str_(testA_vector)))
+    #dev0_vector = tfidf.fit_transform(dev0['txt'].values.astype('U'))
+    #testA_vector = tfidf.fit_transform(testA['txt'].values.astype('U'))
+    #dev0_vector = tfidf.fit_transform(dev0['txt'],y=None)
+    #testA_vector = tfidf.fit_transform(testA['txt'],y=None)
    #print(testA_vector)
-    pca = TruncatedSVD(n_components=100)
+    pca = TruncatedSVD(n_components=120)

    dev0_pca = pca.fit_transform(dev0_vector)
    testA_pca = pca.fit_transform(testA_vector)
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
@@ -99997,4 +99997,4 @@ Mine still is.
 MUSTY_BALLSACK and ANAL_QUEEN's incestual offspring.
 Good point on the Toxicology report: came back positive for marijuana, but that's it.
 This sounds right initially but then you see a fly head butt the same inch of glass 300 times. Flies are not clever creatures
-I don't actually "need" anything right now, but that never stops me from getting mite flavours
+I don't actually "need" anything right now, but that never stops me from getting mite flavours
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/tfidf_model.pkl
+++ b/tfidf_model.pkl