linear regression with tf-idf
This commit is contained in:
parent
ff153079a6
commit
0147e708a7
200000
dev-0/out.tsv
200000
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
29
predict_lrtfidf.py
Normal file
29
predict_lrtfidf.py
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import pandas as pd
|
||||
import csv
|
||||
import pickle
|
||||
|
||||
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(security): unpickling executes arbitrary code. Only load model
    # files that were produced by the accompanying training script.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _read_texts(path):
    """Read a one-column TSV file and return its lines as a "text" Series."""
    frame = pd.read_csv(
        path,
        delimiter="\t",
        header=None,
        names=["text"],
        quoting=csv.QUOTE_NONE,
    )
    return frame["text"]


def predict():
    """Predict targets for the dev-0 and test-A splits and save them.

    Loads the fitted regressor ("clf.model"), TF-IDF vectorizer
    ("vectorizer.model") and dimensionality reducer ("pca.model") from the
    current directory. The first two names are the ones the training script
    writes; "pca.model" must contain the fitted reducer used at training
    time, because the regressor was fitted on reduced features.

    For each split, "<split>/in_new.tsv" is read, transformed with the
    vectorizer and then the reducer, and the predictions are written to
    "<split>/out.tsv", one row per input document.

    Raises:
        FileNotFoundError: if a model file or an input file is missing.
    """
    reg = _load_pickle("clf.model")
    vect = _load_pickle("vectorizer.model")
    # The original code referenced an undefined name `pca`; the reducer has
    # to be loaded like the other fitted objects.
    pca = _load_pickle("pca.model")

    for split in ("dev-0", "test-A"):
        docs = _read_texts(split + "/in_new.tsv")
        # Apply the same pipeline as in training: TF-IDF, then reduction.
        reduced = pca.transform(vect.transform(docs))
        predictions = reg.predict(reduced)
        # Persist the predictions instead of discarding them.
        pd.DataFrame(predictions).to_csv(
            split + "/out.tsv", sep="\t", header=False, index=False
        )


if __name__ == "__main__":
    predict()
|
200000
test-A/out.tsv
200000
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
29
train_lrtfidf.py
Normal file
29
train_lrtfidf.py
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import numpy as np
|
||||
import csv
|
||||
import pandas as pd
|
||||
import pickle
|
||||
|
||||
def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def train(max_rows=2000000):
    """Fit TF-IDF -> TruncatedSVD -> LinearRegression and save the models.

    Reads documents from "train/in_new.tsv" and targets from
    "train/expected.tsv", keeps at most *max_rows* rows of each, fits the
    three-stage pipeline, and pickles every fitted stage to the current
    directory:

    * "clf.model"        - the fitted LinearRegression
    * "vectorizer.model" - the fitted TfidfVectorizer
    * "pca.model"        - the fitted TruncatedSVD (needed at prediction
      time to map TF-IDF vectors into the space the regressor was fitted on)

    Args:
        max_rows: upper bound on the number of training rows used.

    Raises:
        ValueError: if the number of documents and targets differ.
    """
    # Named `data` rather than `train` so the function name is not shadowed.
    data = pd.read_csv(
        "train/in_new.tsv",
        delimiter="\t",
        header=None,
        names=["text"],
        quoting=csv.QUOTE_NONE,
    )
    text = data["text"][:max_rows]
    y = pd.read_csv("train/expected.tsv", header=None)
    y = y[:max_rows]
    print(y)

    # Fail early: a mismatch would otherwise surface only in reg.fit(),
    # after the expensive TF-IDF and SVD fits have already run.
    if len(text) != len(y):
        raise ValueError(
            "train/in_new.tsv has %d rows but train/expected.tsv has %d"
            % (len(text), len(y))
        )

    vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    x = vect.fit_transform(text)
    pca = TruncatedSVD(n_components=120)
    x_pca = pca.fit_transform(x)

    reg = LinearRegression()
    reg.fit(x_pca, y)

    _dump_pickle(reg, "clf.model")
    _dump_pickle(vect, "vectorizer.model")
    # The reducer must be saved too; without it the saved regressor cannot
    # be applied to new documents.
    _dump_pickle(pca, "pca.model")


if __name__ == "__main__":
    train()
|
Loading…
Reference in New Issue
Block a user