Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
89e8437381 | ||
|
ff2f2b124f |
274628
dev-0/out.tsv
274628
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
313212
dev-1/out.tsv
313212
dev-1/out.tsv
File diff suppressed because it is too large
Load Diff
41
run.py
41
run.py
@@ -1,7 +1,40 @@
|
|||||||
for dataset in 'dev-0', 'dev-1', 'test-A':
|
from stop_words import get_stop_words
|
||||||
with open(f'{dataset}/in.tsv') as f_in, open(f'{dataset}/out.tsv','w') as f_out:
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
for line in f_in:
|
from sklearn.linear_model import LogisticRegression
|
||||||
f_out.write('0.45\n')
|
import pandas as pd
|
||||||
|
import lzma
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(f_name, f_ext=""):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        f_name: Path of the file to read. When ``f_ext`` is ``"xz"`` this is
            the path *without* the ``.xz`` suffix.
        f_ext: ``"xz"`` to read the LZMA-compressed file ``{f_name}.xz``.
            Any other value is ignored and ``f_name`` is opened as plain text.

    Returns:
        A list with one ``str`` per line, in file order. Blank lines are kept
        as empty strings, so the result stays aligned line-for-line with a
        parallel file (e.g. inputs vs. expected labels).
    """
    if f_ext == "xz":
        # Binary mode: each line is stripped as bytes and only then decoded.
        with lzma.open(f"{f_name}.{f_ext}") as file:
            # Iterate the handle directly; readlines() would build a second
            # full list of the (potentially large) corpus first.
            return [line.strip().decode("utf-8") for line in file]
    with open(f_name, encoding="utf-8") as file:
        return [line.strip() for line in file]
|
||||||
|
|
||||||
|
|
||||||
|
def write_file(dir_name, preds):
    """Write predictions to ``{dir_name}/out.tsv``, one value per line.

    Args:
        dir_name: Directory that receives the ``out.tsv`` file. It must
            already exist.
        preds: Iterable of predictions; each item is formatted with ``str()``.

    The file is opened in ``"w"`` mode, so every call replaces the previous
    contents. Appending would stack a fresh set of predictions onto stale
    ones each time the script is re-run.
    """
    with open(f"{dir_name}/out.tsv", "w", encoding="utf-8") as file_out:
        # Each prediction needs its own line terminator; without it all
        # values run together on a single line.
        file_out.writelines(f"{val}\n" for val in preds)
|
||||||
|
|
||||||
|
|
||||||
|
# * Training
# Load the xz-compressed training texts and the plain-text labels, echoing
# the first five entries of each as a quick sanity check.
x_data = read_file("train/in.tsv", "xz")
print(x_data[:5])
y_data = read_file("train/expected.tsv")
print(y_data[:5])

# Fit the TF-IDF vocabulary on the training texts only; the same fitted
# vectorizer is reused later to transform the evaluation sets.
tfidf_vec = TfidfVectorizer(
    use_idf=True,
    max_df=0.95,
    stop_words=get_stop_words("pl"),
)
x_vectorized = tfidf_vec.fit_transform(x_data)

model = LogisticRegression(max_iter=1000)
model.fit(x_vectorized, y_data)
|
||||||
|
|
||||||
|
# * Predictions
# For every evaluation split: read its inputs, vectorize them with the
# already-fitted TF-IDF vectorizer, predict, and store the results in the
# split's own directory.
for data_dir in ("dev-0", "dev-1", "test-A"):
    print(f"running for the __{data_dir}__")
    x_data = read_file(f"{data_dir}/in.tsv")
    x_vectorized = tfidf_vec.transform(x_data)
    y_predicted = model.predict(x_vectorized)
    write_file(data_dir, y_predicted)
|
||||||
|
269236
test-A/out.tsv
269236
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user