import nltk
from nltk.tokenize import word_tokenize
from nltk import trigrams, ngrams
from collections import defaultdict, Counter
import pandas as pd
import csv
import re
import string
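# The tokenizer and stopword list rely on NLTK data packages. Downloading them here
# is an assumption for a fresh environment; skip these calls if 'punkt' and
# 'stopwords' are already installed locally.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)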
# Trigram counts: model[(w2, w3)][w1] = how often w1 appears directly before the bigram (w2, w3).
model = defaultdict(lambda: defaultdict(lambda: 0))
setOf = set()   # vocabulary collected from the training data
alpha = 0.01    # add-alpha smoothing constant

train_file_in = pd.read_csv("train/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)
train_file_out = pd.read_csv("train/expected.tsv", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)
stop_words = nltk.corpus.stopwords.words('english')
def get_20common_2grams(text, n):
    outputTrigrams = []
    n_grams = ngrams(nltk.tokenize.word_tokenize(text), n)
    for grams in n_grams:
        outputTrigrams.append(grams)
    return outputTrigrams
def get_20common_2grams_no_stop(text, n):
    # Same as above, but with English stopwords removed before building the n-grams.
    tokenized_words = nltk.tokenize.word_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('english')
    tokenized_no_stop = [i for i in tokenized_words if i not in stop_words]
    n_grams = ngrams(tokenized_no_stop, n)
    return n_grams
def predict(word_before, word_after):
    # The arguments are the first two words of the right context; the model stores
    # candidates for the word that precedes that bigram.
    prob_list = dict(Counter(model[(word_before, word_after)]).most_common(6)).items()
    predictions = []
    prob_sum = 0.0
    for key, value in prob_list:
        prob_sum += value
        predictions.append(f'{key}:{value}')
    if prob_sum == 0.0:
        # No trigram evidence: fall back to a fixed distribution over frequent words.
        return 'the:0.2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'
    remaining_prob = 1 - prob_sum
    if remaining_prob < 0.01:
        # Reserve a small probability mass for unseen words.
        predictions.append(f':{0.01}')
    return ' '.join(predictions)
# Columns 6 and 7 of the input hold the left and right context; the expected file
# holds the gap word. Join them with spaces to rebuild the full passage.
train = train_file_in[[6, 7]]
train = pd.concat([train, train_file_out], axis=1)
train["result"] = train[6] + " " + train[0] + " " + train[7]

for index, row in train.iterrows():
    lower = str(row["result"]).lower()
    new_doc = re.sub(r"\s+", " ", lower)
    text_clean = "".join([i for i in new_doc if i not in string.punctuation])
    words = word_tokenize(text_clean)
    # Count how often w1 occurs directly before the bigram (w2, w3).
    for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
        if w1 and w2 and w3:
            model[(w2, w3)][w1] += 1
            setOf.add(w1)
            setOf.add(w2)
            setOf.add(w3)
# Add-alpha smoothing over the vocabulary collected above.
for words in model:
    num_n_grams = float(sum(model[words].values()))
    for word in model[words]:
        model[words][word] = (model[words][word] + alpha) / (num_n_grams + alpha * len(setOf))

# Renormalise each conditional distribution so the stored values sum to 1.
for key in model:
    total_count = float(sum(model[key].values()))
    for value in model[key]:
        model[key][value] /= total_count
dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
test_a_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
        # Column 7 is the right context; the gap word is predicted from its first two tokens.
        lower = str(row[7]).lower()
        new_doc = re.sub(r"\s+", " ", lower)
        text_clean = "".join([i for i in new_doc if i not in string.punctuation])
        words = word_tokenize(text_clean)
        if len(words) < 4:
            # Too little right context: fall back to a fixed distribution.
            prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        else:
            prediction = predict(words[0], words[1])
        file.write(prediction + '\n')
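# test_a_data is read above but never written out. A minimal sketch that mirrors the
# dev-0 loop, assuming test-A predictions belong in test-A/out.tsv in the same format.
with open('test-A/out.tsv', 'w') as file:
    for index, row in test_a_data.iterrows():
        lower = str(row[7]).lower()
        new_doc = re.sub(r"\s+", " ", lower)
        text_clean = "".join([i for i in new_doc if i not in string.punctuation])
        words = word_tokenize(text_clean)
        if len(words) < 4:
            prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        else:
            prediction = predict(words[0], words[1])
        file.write(prediction + '\n')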