import csv
import numpy as np
import pandas as pd
import re
import string
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
train_in = pd.read_csv("./train/in.tsv", delimiter="\t", names=["text", "date"], header=None)
train_exp = pd.read_csv("./train/expected.tsv", delimiter="\t", header=None)
dev_in = pd.read_csv("./dev-0/in.tsv", delimiter="\t", names=["text", "date"], header=None)
dev_exp = pd.read_csv("./dev-0/expected.tsv", delimiter="\t", header=None)
test_in = pd.read_csv("./test-A/in.tsv", delimiter="\t", names=["text", "date"], header=None)
train_in.drop('date', axis=1, inplace=True)
dev_in.drop('date', axis=1, inplace=True)
test_in.drop('date', axis=1, inplace=True)
train_set = train_in.copy()
train_set['result'] = train_exp
# train_set = train_set[:1000]
dev_set = dev_in.copy()
dev_set['result'] = dev_exp
test_set = test_in.copy()
test_set['result'] = np.zeros(len(test_in), dtype=int)  # placeholder labels; test-A has no expected.tsv
len(train_set)
289541
def remove_urls(text):
    # strips @mentions, URLs, and any character that is not a letter, digit, space or tab,
    # then collapses runs of whitespace
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", str(text)).split())

def text_lowercase(text):
    return text.lower()

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    # expects a list of tokens, not a raw string
    stop_words = set(stopwords.words('english'))
    return [i for i in text if i not in stop_words]

def tokenize(text):
    return word_tokenize(text)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]
def preprocess(dataset):
    texts_column = []
    for num, text in enumerate(dataset['text']):
        if num % 10000 == 0:
            print(num)
        prep_text = remove_urls(text)
        prep_text = text_lowercase(prep_text)
        prep_text = remove_numbers(prep_text)
        prep_text = remove_punctuation(prep_text)
        prep_text = tokenize(prep_text)
        prep_text = remove_stopwords(prep_text)
        prep_text = lemmatize(prep_text)
        texts_column.append(' '.join(prep_text))
    dataset['text'] = texts_column
    return dataset
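A quick illustration of what one pass through the pipeline produces (the sample string below is invented, not from the dataset):
# Hypothetical input, for illustration only
sample = "Check out https://example.com, @user1 said 3 cats were running!"
print(preprocess(pd.DataFrame({'text': [sample]}))['text'][0])
# -> something like: "check said cat running"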
train_set = train_set[:12000]
train_set = preprocess(train_set)
dev_set = preprocess(dev_set)
test_set = preprocess(test_set)
0
0
0
train_set_copy = train_set.copy()
dev_set_copy = dev_set.copy()
test_set_copy = test_set.copy()
dictionary = set()
for i, text_line in enumerate(train_set_copy['text']):
    if i % 5000 == 0:
        print(i)
    for word in text_line.split():
        dictionary.add(word)
0
5000
word_index_map = {w: i for i, w in enumerate(dictionary)}
len(word_index_map)
21120
prep_x = []
for num, w in enumerate(train_set_copy['text']):
    if num % 10000 == 0:
        print(num)
    # binary bag-of-words: 1.0 in the column of every word that occurs in the document
    a = np.zeros(len(word_index_map))
    for word in w.split():
        index = word_index_map[word]
        a[index] = 1.
    prep_x.append(a)
0
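Building a Python list of 21k-dimensional rows one at a time is slow and memory-hungry; an equivalent sketch that fills a single preallocated matrix in place (same encoding, just vectorized storage):
# Sketch: one preallocated matrix instead of a list of per-document arrays
bow = np.zeros((len(train_set_copy), len(word_index_map)), dtype=np.float32)
for row, text in enumerate(train_set_copy['text']):
    for word in text.split():
        bow[row, word_index_map[word]] = 1.0
x_alt = torch.from_numpy(bow)  # would replace the tensor construction below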
train_y = train_set_copy['result'].astype(float).tolist()
x = torch.tensor(np.array(prep_x), dtype=torch.float)  # np.array first: building a tensor from a list of arrays is very slow
y = torch.tensor(train_y, dtype=torch.float)
k = torch.randn(len(dictionary), requires_grad=True)  # one weight per vocabulary word
rate = torch.tensor(0.001)  # learning rate
for i in range(1500):
    y_predicted = torch.sigmoid(x @ k)
    # binary cross-entropy; the 1e-10 terms guard against log(0)
    loss = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))
    loss.backward()
    with torch.no_grad():
        k -= rate * k.grad
    k.grad.zero_()  # reset the gradient, otherwise it accumulates across iterations
    if i % 200 == 0:
        print(i)
0
200
400
600
800
1000
1200
1400
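The loop above is hand-rolled logistic regression under binary cross-entropy. For reference, a minimal sketch of the same model using PyTorch's built-in loss and optimizer (an equivalent formulation, not the run recorded above):
import torch.nn.functional as F

k2 = torch.randn(len(dictionary), requires_grad=True)
optimizer = torch.optim.SGD([k2], lr=0.001)
for step in range(1500):
    optimizer.zero_grad()
    y_pred = torch.sigmoid(x @ k2)
    loss = F.binary_cross_entropy(y_pred, y)  # same quantity as the handwritten formula
    loss.backward()
    optimizer.step()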
def prepare_data(dataset):
    prep_x = []
    for num, w in enumerate(dataset['text']):
        if num % 1000 == 0:
            print(num)
        a = np.zeros(len(word_index_map))
        for word in w.split():
            if word in word_index_map:  # words unseen in training are skipped
                index = word_index_map[word]
                a[index] = 1.
        prep_x.append(a)
    return torch.tensor(np.array(prep_x), dtype=torch.float)
def predict(x, weights, save_path):
    with open(save_path + '/out.tsv', 'wt', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        y = torch.sigmoid(x @ weights)
        for value in y:
            # clip extreme probabilities so a single overconfident wrong answer
            # cannot dominate a log-loss-style score
            if value > 0.90:
                value = torch.tensor([0.90])
            elif value < 0.10:
                value = torch.tensor([0.10])
            writer.writerow([str(value.item())])
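The same clipping can be written with torch.clamp; a compact variant of predict for comparison:
def predict_clamped(x, weights, save_path):
    # identical output to predict(): sigmoid scores clipped into [0.10, 0.90]
    y = torch.sigmoid(x @ weights).clamp(0.10, 0.90)
    with open(save_path + '/out.tsv', 'wt', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for value in y:
            writer.writerow([str(value.item())])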
x_dev = prepare_data(dev_set_copy)
x_test = prepare_data(test_set_copy)
0
1000
2000
3000
4000
5000
0
1000
2000
3000
4000
5000
predict(x_dev, k, './dev-0')
predict(x_test, k, './test-A')
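The dev predictions can also be scored against dev-0/expected.tsv; a rough sketch, assuming the expected file holds binary 0/1 labels and thresholding at 0.5:
# Quick dev-set accuracy check (assumes 0/1 labels in dev_exp)
y_dev = torch.tensor(dev_exp[0].astype(float).tolist())
with torch.no_grad():
    dev_scores = torch.sigmoid(x_dev @ k)
accuracy = ((dev_scores > 0.5).float() == y_dev).float().mean().item()
print(f"dev accuracy: {accuracy:.3f}")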