TFIDF proj commit

2020-05-05 14:47:21 +02:00 · 2020-05-05 14:47:21 +02:00 · 830a36db52
commit 830a36db52
parent 655535d50d
8 changed files with 99868 additions and 99858 deletions
--- a/.idea/GUESS_REDDIT_DATE_SUMO.iml
+++ b/.idea/GUESS_REDDIT_DATE_SUMO.iml
@@ -7,4 +7,7 @@
    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
+  <component name="PyDocumentationSettings">
+    <option name="renderExternalDocumentation" value="true" />
+  </component>
 </module>
--- a/.idea/other.xml
+++ b/.idea/other.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PySciProjectComponent">
+    <option name="PY_SCI_VIEW" value="true" />
+    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
+  </component>
+</project>
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/l_regression.pkl
+++ b/l_regression.pkl
--- a/linear_regression.py
+++ b/linear_regression.py
@@ -32,7 +32,7 @@ def train():
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
    x = tfidf.fit_transform(created_dictionary)
    #PCA - principal component analysis
-    pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
+    pca = TruncatedSVD(n_components=200) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
    x_pca = pca.fit_transform(x)
    l_regression = LinearRegression()
    l_regression.fit(x_pca,expected_dictionary)
--- a/predict.py
+++ b/predict.py
@@ -17,7 +17,7 @@ def predict():
    testA_vector = tfidf.fit_transform(testA)

    #print(testA_vector)
-    pca = TruncatedSVD(n_components=100)
+    pca = TruncatedSVD(n_components=200)

    dev0_pca = pca.fit_transform(dev0_vector)
    testA_pca = pca.fit_transform(testA_vector)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/tfidf_model.pkl
+++ b/tfidf_model.pkl