challenging-america-word-ga.../run.ipynb

import csv
import re
import string
from collections import defaultdict, Counter

import nltk
import pandas as pd
from nltk import trigrams, ngrams
from nltk.tokenize import word_tokenize

# word_tokenize and the stopword list need these NLTK resources downloaded once:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# model[(w2, w3)] maps each candidate preceding word w1 to its count
# (converted to a probability after training).
model = defaultdict(lambda: defaultdict(lambda: 0))
train_file_in = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines="skip", header=None, quoting=csv.QUOTE_NONE, nrows=200000)
train_file_out = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines="skip", header=None, quoting=csv.QUOTE_NONE, nrows=200000)
stop_words = nltk.corpus.stopwords.words('english')

def get_20common_2grams(text, n):
    # Despite the name, this returns all n-grams of the tokenized text.
    return list(ngrams(word_tokenize(text), n))

def get_20common_2grams_no_stop(text, n):
    # Same, but English stopwords are filtered out before building n-grams.
    tokenized_words = word_tokenize(text)
    tokenized_no_stop = [w for w in tokenized_words if w not in stop_words]
    return ngrams(tokenized_no_stop, n)
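
A quick sanity check of the two helpers on a made-up sentence (the sample text and expected output below are illustrative only):

sample = "the quick brown fox jumps over the lazy dog"
print(get_20common_2grams(sample, 2)[:3])                # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
print(list(get_20common_2grams_no_stop(sample, 2))[:3])  # stopwords 'the' and 'over' are dropped first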

def predict(word_before, word_after):
    # Take the six most probable gap words for this (next word, word after) context.
    prob_list = Counter(model[(word_before, word_after)]).most_common(6)
    predictions = []
    prob_sum = 0.0
    for key, value in prob_list:
        prob_sum += value
        predictions.append(f'{key}:{value}')
    if prob_sum == 0.0:
        # Unseen context: fall back to a fixed distribution over frequent words.
        return 'the:0.2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'
    # Reserve the leftover probability mass (at least 0.01) for unseen words.
    remaining_prob = max(1 - prob_sum, 0.01)
    predictions.append(f':{remaining_prob}')
    return ' '.join(predictions)
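
For illustration, a call for a context seen in training returns space-separated word:probability pairs (the context and values below are hypothetical):

print(predict('of', 'the'))  # hypothetical output: "out:0.12 one:0.08 ... :0.8"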

# Columns 6 and 7 of in.tsv hold the left and right context of the gap;
# column 0 of expected.tsv holds the gap word. Rejoin them with spaces.
train = train_file_in[[6, 7]]
train = pd.concat([train, train_file_out], axis=1)

train["result"] = train[6] + " " + train[0] + " " + train[7]
for index, row in train.iterrows():
    lower = str(row["result"]).lower()
    new_doc = re.sub(r"\s+", " ", lower)  # collapse runs of whitespace
    text_clean = "".join([i for i in new_doc if i not in string.punctuation])
    words = word_tokenize(text_clean)
    # Count, for each (next word, word after next) pair, how often each word precedes it.
    for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
        if w1 and w2 and w3:
            model[(w2, w3)][w1] += 1
# Normalize the counts to conditional probabilities P(w1 | w2, w3).
for key in model:
    total_count = float(sum(model[key].values()))
    for value in model[key]:
        model[key][value] /= total_count
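
As a quick illustrative check, the distribution for any observed context should now sum to roughly 1:

# Illustrative check (assumes the model is non-empty): pick one context
# and verify its distribution is normalized.
some_context = next(iter(model))
assert abs(sum(model[some_context].values()) - 1.0) < 1e-6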
dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_a_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
        lower = str(row[7]).lower()
        new_doc = re.sub(r"\s+", " ", lower)
        text_clean = "".join([i for i in new_doc if i not in string.punctuation])
        words = word_tokenize(text_clean)
        if len(words) < 4:
            # Too little right context: fall back to a fixed distribution.
            prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        else:
            # Predict the gap word from the first two words of the right context.
            prediction = predict(words[0], words[1])
        file.write(prediction + '\n')
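
test_a_data is loaded above but never written out; below is a sketch of the matching test-A pass, assuming it has the same column layout as dev-0:

# Sketch: same pipeline for test-A (assumes the same column layout as dev-0).
with open('test-A/out.tsv', 'w') as file:
    for index, row in test_a_data.iterrows():
        lower = str(row[7]).lower()
        new_doc = re.sub(r"\s+", " ", lower)
        text_clean = "".join([i for i in new_doc if i not in string.punctuation])
        words = word_tokenize(text_clean)
        if len(words) < 4:
            prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        else:
            prediction = predict(words[0], words[1])
        file.write(prediction + '\n')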