Compare commits

...

16 Commits

11 changed files with 177081 additions and 0 deletions

View File

@ -0,0 +1,167 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
" train = pd.read_csv(file, sep='\\t', names=['Date1', 'Date2', 'Title', 'Author', 'Text'])\n",
" \n",
"#train = train[0:10000]\n",
"train_x = train['Text']\n",
"train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float))/2\n",
"train_y=train['Date1']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "varying-wright",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev0 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev0 = x_dev0[0] \n",
"x_dev0[19999] = 'nie jest'\n",
"x_dev0[20000] = 'nie wiem'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "frozen-ticket",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-1/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev1 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev1 = x_dev1[0] "
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8e3a18db-f966-45e4-b881-4b336f188055",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
" x_test = pd.read_csv(file, header=None, sep='\\t')\n",
"x_test = x_test[0] "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "traditional-amount",
"metadata": {},
"outputs": [],
"source": [
"#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev = y_dev[0]\n",
"#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev_exp = y_dev_exp[0]\n",
"#RMSE_dev = mean_squared_error(y_dev_exp, y_dev) "
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": [
"predict_year(x_dev0, 'dev-0/out.tsv', model)\n",
"predict_year(x_dev1,'dev-1/out.tsv', model)\n",
"predict_year(x_test,'test-A/out.tsv', model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,167 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
" train = pd.read_csv(file, sep='\\t', names=['Date1', 'Date2', 'Title', 'Author', 'Text'])\n",
" \n",
"#train = train[0:10000]\n",
"train_x = train['Text']\n",
"train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float))/2\n",
"train_y=train['Date1']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "varying-wright",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev0 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev0 = x_dev0[0] \n",
"x_dev0[19999] = 'nie jest'\n",
"x_dev0[20000] = 'nie wiem'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "frozen-ticket",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-1/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev1 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev1 = x_dev1[0] "
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8e3a18db-f966-45e4-b881-4b336f188055",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
" x_test = pd.read_csv(file, header=None, sep='\\t')\n",
"x_test = x_test[0] "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "traditional-amount",
"metadata": {},
"outputs": [],
"source": [
"#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev = y_dev[0]\n",
"#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev_exp = y_dev_exp[0]\n",
"#RMSE_dev = mean_squared_error(y_dev_exp, y_dev) "
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": [
"predict_year(x_dev0, 'dev-0/out.tsv', model)\n",
"predict_year(x_dev1,'dev-1/out.tsv', model)\n",
"predict_year(x_test,'test-A/out.tsv', model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

20000
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

11563
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

167
retroc2.ipynb Normal file
View File

@ -0,0 +1,167 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
" train = pd.read_csv(file, sep='\\t', names=['Date1', 'Date2', 'Title', 'Author', 'Text'])\n",
" \n",
"#train = train[0:10000]\n",
"train_x = train['Text']\n",
"train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float))/2\n",
"train_y=train['Date1']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "varying-wright",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev0 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev0 = x_dev0[0] \n",
"x_dev0[19999] = 'nie jest'\n",
"x_dev0[20000] = 'nie wiem'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "frozen-ticket",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-1/in.tsv', 'r', encoding='utf8') as file:\n",
" x_dev1 = pd.read_csv(file, header=None, sep='\\t')\n",
"x_dev1 = x_dev1[0] "
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8e3a18db-f966-45e4-b881-4b336f188055",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
" x_test = pd.read_csv(file, header=None, sep='\\t')\n",
"x_test = x_test[0] "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "traditional-amount",
"metadata": {},
"outputs": [],
"source": [
"#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev = y_dev[0]\n",
"#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev_exp = y_dev_exp[0]\n",
"#RMSE_dev = mean_squared_error(y_dev_exp, y_dev) "
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": [
"predict_year(x_dev0, 'dev-0/out.tsv', model)\n",
"predict_year(x_dev1,'dev-1/out.tsv', model)\n",
"predict_year(x_test,'test-A/out.tsv', model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

159
run.ipynb Normal file
View File

@ -0,0 +1,159 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "78c79a98-8309-4c1c-b27d-faad2ee7a2af",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename):\n",
" result = []\n",
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
" for line in file:\n",
" text = line.split(\"\\t\")[0].strip()\n",
" result.append(text)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
" train = pd.read_csv(file, sep='\\t', names=['Begin', 'End', 'Title', 'Author', 'Text'])\n",
" \n",
"train = train[0:12000]\n",
"train_x = train['Text']\n",
"#train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float))/2\n",
"train_y = train['Begin']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "varying-wright",
"metadata": {},
"outputs": [],
"source": [
"x_dev_0 = read_file('dev-0/in.tsv')\n",
"predict_year(x_dev_0, 'dev-0/out.tsv', model)\n",
"x_dev_1 = read_file('dev-1/in.tsv')\n",
"predict_year(x_dev_1,'dev-1/out.tsv', model)\n",
"x_test = read_file('test-A/in.tsv')\n",
"predict_year(x_test,'test-A/out.tsv', model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "traditional-amount",
"metadata": {},
"outputs": [],
"source": [
"#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev = y_dev[0]\n",
"#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev_exp = y_dev_exp[0]\n",
"#RMSE_dev = mean_squared_error(y_dev_exp, y_dev) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

41
run.py Normal file
View File

@ -0,0 +1,41 @@
import csv
import os
from gzip import open as open_gz

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
def predict_year(x, path_out, model):
    """Run ``model.predict`` on ``x`` and write one prediction per line.

    Args:
        x: Inputs handed unchanged to ``model.predict``.
        path_out: Path of the text file to create or overwrite.
        model: Any object exposing a ``predict(x)`` method that returns an
            iterable of predictions.

    Returns:
        None. The only effect is the file written at ``path_out``.
    """
    predictions = model.predict(x)
    with open(path_out, 'wt') as sink:
        # str() is applied explicitly so the textual form of every
        # prediction is exactly what str() yields for its type.
        sink.writelines(str(value) + '\n' for value in predictions)
def read_file(filename):
    """Return the first tab-separated field of every line in a UTF-8 file.

    Each line is cut at its first tab character (if any) and the leading
    part is stripped of surrounding whitespace. Lines are never merged or
    dropped, so the result has exactly one entry per line of the file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per line, in file order.
    """
    with open(filename, 'r', encoding="utf-8") as source:
        return [row.partition("\t")[0].strip() for row in source]
# --- Training data -----------------------------------------------------------
# Number of leading training rows used to fit the model.
TRAIN_ROWS = 12000

with open('train/train.tsv', 'r', encoding='utf8') as file:
    # quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
    # With the default quote handling an unbalanced '"' inside a text can make
    # the parser merge several physical lines into one record; the dev/test
    # inputs below avoid that by being read line by line with read_file().
    train = pd.read_csv(
        file,
        sep='\t',
        names=['Start', 'End', 'Title', 'Author', 'Text'],
        quoting=csv.QUOTE_NONE,
    )

train = train[0:TRAIN_ROWS]
train_x = train['Text']
#train['Date'] = (train['Start'].astype(float) + train['End'].astype(float))/2
# The regression target is the 'Start' column; the midpoint variant above is
# intentionally left disabled.
train_y = train['Start']

# --- Model -------------------------------------------------------------------
# TF-IDF features of the text feeding an ordinary least-squares regressor.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(train_x, train_y)

# --- Predictions -------------------------------------------------------------
# For each split: read the inputs line by line, then write one prediction per
# input line to the split's out.tsv.
x_dev_0 = read_file('dev-0/in.tsv')
predict_year(x_dev_0, 'dev-0/out.tsv', model)
x_dev_1 = read_file('dev-1/in.tsv')
predict_year(x_dev_1, 'dev-1/out.tsv', model)
x_test = read_file('test-A/in.tsv')
predict_year(x_test, 'test-A/out.tsv', model)

14220
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

107471
train/train.tsv Normal file

File diff suppressed because one or more lines are too long