paranormal-or-skeptic4/zadanie.ipynb

import csv
import numpy as np
import pandas as pd
import re
import string
import torch

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akida\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
train_in = pd.read_csv("./train/in.tsv", delimiter="\t", names=["text", "date"], header=None)
train_exp = pd.read_csv("./train/expected.tsv", delimiter="\t", header=None)
dev_in = pd.read_csv("./dev-0/in.tsv", delimiter="\t", names=["text", "date"], header=None)
dev_exp = pd.read_csv("./dev-0/expected.tsv", delimiter="\t", header=None)
test_in = pd.read_csv("./test-A/in.tsv", delimiter="\t", names=["text", "date"], header=None)

train_in.drop('date', axis=1, inplace=True)
dev_in.drop('date', axis=1, inplace=True)
test_in.drop('date', axis=1, inplace=True)
train_set = train_in
train_set['result'] = train_exp

dev_set = dev_in
dev_set['result'] = dev_exp

# test-A ships without labels, so fill the column with zeros as a placeholder
test_set = test_in
test_set['result'] = np.zeros(len(test_in), dtype=int)
len(train_set)
289541
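As a quick sanity check on class balance (assuming the labels in expected.tsv are binary 0/1, which the rest of the notebook relies on):

print(train_set['result'].value_counts(normalize=True))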
def remove_urls(text):
    # strips @mentions, URLs, and any character that is not alphanumeric, space, or tab
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", str(text)).split())

def text_lowercase(text):   
    return text.lower()

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# build the stopword set and lemmatizer once instead of on every call
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

def tokenize(text):
    return word_tokenize(text)

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess(dataset):
    texts_column = []
    for num, text in enumerate(dataset['text']):
        if num % 10000 == 0:
            print(num)  # progress indicator
        prep_text = remove_urls(text)
        prep_text = text_lowercase(prep_text)
        prep_text = remove_numbers(prep_text)
        prep_text = remove_punctuation(prep_text)
        prep_text = tokenize(prep_text)
        prep_text = remove_stopwords(prep_text)
        prep_text = lemmatize(prep_text)
        texts_column.append(' '.join(prep_text))
    dataset['text'] = texts_column
    return dataset
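As a sanity check, the whole pipeline can be run on a single made-up sentence (the example text is illustrative, not from the data):

sample = pd.DataFrame({'text': ["Check out 15 GHOST sightings!! http://example.com @user"]})
print(preprocess(sample)['text'][0])
# after the progress print, yields roughly: check ghost sighting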
train_set = train_set[:12000]  # keep only the first 12000 rows so training stays tractable
train_set = preprocess(train_set)
dev_set = preprocess(dev_set)
test_set = preprocess(test_set)
0
0
0
train_set_copy = train_set.copy()  # .copy() so later edits do not alias the originals
dev_set_copy = dev_set.copy()
test_set_copy = test_set.copy()
dictionary = set()  # vocabulary collected from the training set only

for i, text_line in enumerate(train_set_copy['text']):
    if i % 5000 == 0:
        print(i)
    for word in text_line.split():
        dictionary.add(word)
0
5000
word_index_map = {w: i for i, w in enumerate(dictionary)}
len(word_index_map)
21120
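At 21,120 vocabulary entries, every document becomes a dense vector of 21,120 floats, so the 12,000 training rows take roughly 2 GB as float64 numpy arrays (about 1 GB after conversion to float32). A minimal sketch of the same binary bag-of-words as a sparse matrix, assuming scipy is available (the notebook itself stays dense):

from scipy.sparse import csr_matrix

rows, cols = [], []
for num, w in enumerate(train_set_copy['text']):
    for word in set(w.split()):  # set() so repeated words stay at 1
        rows.append(num)
        cols.append(word_index_map[word])
sparse_x = csr_matrix((np.ones(len(rows), dtype=np.float32), (rows, cols)),
                      shape=(len(train_set_copy), len(word_index_map)))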
# binary bag-of-words: one slot per vocabulary word, set to 1.0 if the word occurs
prep_x = []
for num, w in enumerate(train_set_copy['text']):
    if num % 10000 == 0:
        print(num)
    a = np.zeros(len(word_index_map))
    for word in w.split():
        a[word_index_map[word]] = 1.
    prep_x.append(a)
0
train_y = train_set_copy['result'].astype(float).tolist()
x = torch.tensor(np.stack(prep_x), dtype=torch.float)  # stack first: converting a list of arrays directly is very slow
y = torch.tensor(train_y, dtype=torch.float)
k = torch.randn(len(dictionary), requires_grad=True)  # weights of the logistic-regression model
rate = torch.tensor(0.001)  # learning rate
for i in range(1500):
    y_predicted = torch.sigmoid(x @ k)
    # binary cross-entropy; the 1e-10 keeps log() away from zero
    price = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))
    price.backward()
    with torch.no_grad():
        k -= rate * k.grad
        k.grad.zero_()  # without this, gradients from every iteration would accumulate
    if i % 200 == 0:
        print(i)
0
200
400
600
800
1000
1200
1400
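The quantity minimized above is the binary cross-entropy of a logistic-regression model with weight vector k over N training examples:

L(k) = -(1/N) · Σᵢ [ yᵢ log σ(xᵢ·k) + (1 − yᵢ) log(1 − σ(xᵢ·k)) ]

The hand-rolled update is plain gradient descent; a minimal sketch of the same loop using torch.optim (an alternative, not what the notebook ran) lets the optimizer handle the step and the gradient reset:

optimizer = torch.optim.SGD([k], lr=0.001)
criterion = torch.nn.BCELoss()
for i in range(1500):
    optimizer.zero_grad()
    loss = criterion(torch.sigmoid(x @ k), y)
    loss.backward()
    optimizer.step()

torch.nn.BCEWithLogitsLoss applied to the raw scores x @ k would also be numerically safer than adding 1e-10 by hand.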
def prepare_data(dataset):
    # vectorize with the training vocabulary; out-of-vocabulary words are simply skipped
    prep_x = []
    for num, w in enumerate(dataset['text']):
        if num % 1000 == 0:
            print(num)
        a = np.zeros(len(word_index_map))
        for word in w.split():
            if word in word_index_map:
                a[word_index_map[word]] = 1.
        prep_x.append(a)
    return torch.tensor(np.stack(prep_x), dtype=torch.float)
def predict(x, weights, save_path):
    # write one probability per line, clipped to [0.10, 0.90] to cap the log-loss cost of confident mistakes
    with open(save_path + '/out.tsv', 'wt', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        y = torch.sigmoid(x @ weights)
        for value in torch.clamp(y, 0.10, 0.90):
            writer.writerow([str(value.item())])
x_dev = prepare_data(dev_set_copy)
x_test = prepare_data(test_set_copy)
0
1000
2000
3000
4000
5000
0
1000
2000
3000
4000
5000
predict(x_dev, k, './dev-0')
predict(x_test, k, './test-A')
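A quick accuracy check of the dev predictions against dev-0/expected.tsv (a sketch, assuming 0/1 gold labels and a 0.5 decision threshold; not part of the graded output):

with torch.no_grad():
    dev_pred = (torch.sigmoid(x_dev @ k) > 0.5).int().numpy()
dev_true = dev_exp[0].to_numpy()
print('dev accuracy:', (dev_pred == dev_true).mean())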