1
0
forked from kubapok/retroc2
This commit is contained in:
Adrian 2022-05-17 00:49:33 +02:00
parent 647c099815
commit eeb51fec1d
18 changed files with 586743 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,49 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import csv  # local to this step: only needed for the quoting constant below

# Load the training set: tab-separated, no header row, five named columns.
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
# With the default quote handling, a '"' inside a field can open a quoted
# value that swallows the following line breaks and merges several lines
# into one row, so the frame ends up with fewer rows than the file has lines.
# NOTE(review): assumes every line has exactly five tab-separated fields;
# confirm against train/train.tsv.
with open('train/train.tsv', 'r', encoding='utf8') as file:
    train_data = pd.read_csv(
        file,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
        quoting=csv.QUOTE_NONE,
    )
def readFile(filename):
    """Read the first tab-separated field of every line of a UTF-8 text file.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line of the file: the text before the
        first tab character (the whole line when it contains no tab), with
        leading and trailing whitespace removed.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        # partition() stops at the first tab instead of splitting the whole
        # line; the part before the tab is the same as split("\t")[0].
        return [line.partition("\t")[0].strip() for line in file]
def write_pred(filename, predictions):
    """Write predictions to a text file, one value per line.

    Args:
        filename: Path of the output file; an existing file is overwritten.
        predictions: Iterable of values; each one is converted with ``str()``
            and followed by a newline.
    """
    # An explicit encoding keeps the output independent of the platform's
    # default text encoding.
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(str(pred) + "\n" for pred in predictions)
# Number of leading training rows used to fit the model.
TRAIN_ROWS = 10000
# Directories that each hold an in.tsv to predict and receive an out.tsv.
SPLITS = ('dev-0', 'dev-1', 'test-A')

train_data = train_data[:TRAIN_ROWS]
X = train_data['Text']
Y = train_data['Begin']

# TF-IDF features of the 'Text' column feed a linear regression fitted on
# the 'Begin' column.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, Y)

# The same read -> predict -> write sequence runs for every split.
for split in SPLITS:
    texts = readFile(f'{split}/in.tsv')
    write_pred(f'{split}/out.tsv', model.predict(texts))

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

20000
dev-0/meta.tsv Normal file

File diff suppressed because it is too large Load Diff

20000
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

11563
dev-1/meta.tsv Normal file

File diff suppressed because it is too large Load Diff

11563
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

118
model.ipynb Normal file
View File

@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 99,
"id": "8f5480f9-fa82-4150-acff-9309fdc43690",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"107463\n"
]
},
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
" train_data = pd.read_csv(file, sep='\\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])\n",
"\n",
"print(len(train_data)) \n",
"train_data = train_data[:10000]\n",
" \n",
"X = train_data['Text']\n",
"Y = train_data['Begin']\n",
"\n",
"\n",
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
"model.fit(X, Y)\n"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "02e89f1c-a2d0-4d41-94a2-aa86b257069d",
"metadata": {},
"outputs": [],
"source": [
"def readFile(filename):\n",
" result = []\n",
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
" for line in file:\n",
" text = line.split(\"\\t\")[0].strip()\n",
" result.append(text)\n",
" return result\n",
"\n",
"def write_pred(filename, predictions):\n",
" with open(filename, \"w\") as file:\n",
" for pred in predictions:\n",
" file.write(str(pred) + \"\\n\")\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "b85f5e22-eafb-41ee-aa2c-20c338d42701",
"metadata": {},
"outputs": [],
"source": [
"\n",
"dev_0 = readFile('dev-0/in.tsv')\n",
"predict_dev_0 = model.predict(dev_0)\n",
"write_pred('dev-0/out.tsv', predict_dev_0)\n",
"\n",
"dev_1 = readFile('dev-1/in.tsv')\n",
"predict_dev_1 = model.predict(dev_1)\n",
"write_pred('dev-1/out.tsv', predict_dev_1)\n",
"\n",
"test_A = readFile('test-A/in.tsv')\n",
"predict_test_A = model.predict(test_A)\n",
"write_pred('test-A/out.tsv', predict_test_A)\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

49
run.py Normal file
View File

@ -0,0 +1,49 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import csv  # local to this step: only needed for the quoting constant below

# Load the training set: tab-separated, no header row, five named columns.
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
# With the default quote handling, a '"' inside a field can open a quoted
# value that swallows the following line breaks and merges several lines
# into one row, so the frame ends up with fewer rows than the file has lines.
# NOTE(review): assumes every line has exactly five tab-separated fields;
# confirm against train/train.tsv.
with open('train/train.tsv', 'r', encoding='utf8') as file:
    train_data = pd.read_csv(
        file,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
        quoting=csv.QUOTE_NONE,
    )
def readFile(filename):
    """Read the first tab-separated field of every line of a UTF-8 text file.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line of the file: the text before the
        first tab character (the whole line when it contains no tab), with
        leading and trailing whitespace removed.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        # partition() stops at the first tab instead of splitting the whole
        # line; the part before the tab is the same as split("\t")[0].
        return [line.partition("\t")[0].strip() for line in file]
def write_pred(filename, predictions):
    """Write predictions to a text file, one value per line.

    Args:
        filename: Path of the output file; an existing file is overwritten.
        predictions: Iterable of values; each one is converted with ``str()``
            and followed by a newline.
    """
    # An explicit encoding keeps the output independent of the platform's
    # default text encoding.
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(str(pred) + "\n" for pred in predictions)
# Number of leading training rows used to fit the model.
TRAIN_ROWS = 10000
# Directories that each hold an in.tsv to predict and receive an out.tsv.
SPLITS = ('dev-0', 'dev-1', 'test-A')

train_data = train_data[:TRAIN_ROWS]
X = train_data['Text']
Y = train_data['Begin']

# TF-IDF features of the 'Text' column feed a linear regression fitted on
# the 'Begin' column.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, Y)

# The same read -> predict -> write sequence runs for every split.
for split in SPLITS:
    texts = readFile(f'{split}/in.tsv')
    write_pred(f'{split}/out.tsv', model.predict(texts))

File diff suppressed because one or more lines are too long

14220
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

107471
train/meta.tsv Normal file

File diff suppressed because it is too large Load Diff

107471
train/train.tsv Normal file

File diff suppressed because one or more lines are too long