#!/usr/bin/env python
# coding: utf-8

# In[1]:


import lzma


def read_xz_file(file_path):
    # Read an xz-compressed text file line by line, lower-casing each line and
    # stripping soft hyphens and the literal "\n" artefacts left in the raw data.
    data = []
    with lzma.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.lower().replace("-\\n", "").replace("\\n", " ").replace("\xad", "").replace("\\\\n", " ").replace("\\\\", " ").replace("\n", " ")
            data.append(line)
    return data


# In[2]:


def read_tsv_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().split('\t')  # split the line into fields on tabs
            data.append(line)  # append the fields to the data list
    return data


# In[3]:


file_path = "train\\in.tsv.xz"


# In[4]:


data = read_xz_file(file_path)


# In[5]:


expected = read_tsv_file("train\\expected.tsv")


# In[6]:


# Column 6 holds the text before the gap, column 7 the text after it.
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))


# In[7]:


# Each expected line is a one-element list; take the word itself and lower-case it
# (calling str() on the list would keep the surrounding brackets and quotes).
for i in range(len(expected)):
    expected[i] = expected[i][0].lower()


# In[8]:


# Rebuild full training sentences: left context + gap word + right context.
corpus = []
for i in range(len(expected)):
    corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])


# In[9]:


from collections import defaultdict
from nltk import ngrams
from nltk.tokenize import word_tokenize

# 4-gram counts: each three-word context maps to the words seen directly
# before and directly after it.
model = defaultdict(lambda: defaultdict(float))
dictionary = set()
for line in corpus[:100000]:
    tokens = word_tokenize(line)
    for word1, word2, word3, word4 in ngrams(tokens, n=4, pad_right=True, pad_left=True):
        if word1 and word2 and word3 and word4:
            model[(word2, word3, word4)][word1] += 1
            model[(word1, word2, word3)][word4] += 1
            dictionary.update([word1, word2, word3, word4])


# In[15]:


from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import word_tokenize

# Trigram counts, used as the first back-off level.
model_trigram = defaultdict(lambda: defaultdict(float))
dictionary_trigram = set()
for line in corpus[:100000]:
    tokens = word_tokenize(line)
    for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
        if word1 and word2 and word3:
            model_trigram[(word2, word3)][word1] += 1
            model_trigram[(word1, word2)][word3] += 1
            dictionary_trigram.update([word1, word2, word3])


# In[18]:


from collections import defaultdict
from nltk import bigrams
from nltk.tokenize import word_tokenize

# Bigram counts, used as the last back-off level before the default answer.
model_bigram = defaultdict(lambda: defaultdict(float))
dictionary_bigram = set()
for line in corpus[:100000]:
    tokens = word_tokenize(line)
    for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
        if word1 and word2:
            model_bigram[word2][word1] += 1
            model_bigram[word1][word2] += 1
            dictionary_bigram.update([word1, word2])


# In[11]:


# Turn the 4-gram counts into additively smoothed probabilities.
smoothing = 0.0001
for context in model:
    count_sum = sum(model[context].values()) + smoothing * len(dictionary)
    for token in model[context]:
        model[context][token] = (model[context][token] + smoothing) / count_sum


# In[17]:


# Same smoothing for the trigram model.
smoothing = 0.0001
for context in model_trigram:
    count_sum = sum(model_trigram[context].values()) + smoothing * len(dictionary_trigram)
    for token in model_trigram[context]:
        model_trigram[context][token] = (model_trigram[context][token] + smoothing) / count_sum


# In[19]:


# Same smoothing for the bigram model.
smoothing = 0.0001
for context in model_bigram:
    count_sum = sum(model_bigram[context].values()) + smoothing * len(dictionary_bigram)
    for token in model_bigram[context]:
        model_bigram[context][token] = (model_bigram[context][token] + smoothing) / count_sum
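# In[ ]:


# A minimal sanity-check sketch, not part of the original pipeline: look up one
# context in the smoothed 4-gram model and print its highest-probability
# continuations. The phrase ("one", "of", "the") is only an assumed example;
# any three-word context that occurs in the training slice above would do.
from collections import Counter

example_context = ("one", "of", "the")  # hypothetical example context
if example_context in model:
    print(Counter(model[example_context]).most_common(5))
else:
    print("context not seen in the training slice")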
# In[21]:


from collections import Counter

default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"

data = read_xz_file("dev-0\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))

with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
        prediction = ""
        # Predict the gap word from the last tokens of the left context (the
        # direction the models were trained in), backing off from the 4-gram
        # model through trigrams and bigrams to the default answer.
        if len(tokens) >= 3:
            results = dict(model[(tokens[-3], tokens[-2], tokens[-1])])
            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "" and len(tokens) >= 2:
            trigram_results = dict(model_trigram[(tokens[-2], tokens[-1])])
            if trigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
        if prediction == "" and len(tokens) >= 1:
            bigram_results = dict(model_bigram[tokens[-1]])
            if bigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
        if prediction == "":
            prediction = default
        output.write(prediction.replace("\n", "").strip() + "\n")


# In[23]:


from collections import Counter

default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"

data = read_xz_file("test-A\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))

with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
        prediction = ""
        # Same back-off prediction as for dev-0, applied to the test set.
        if len(tokens) >= 3:
            results = dict(model[(tokens[-3], tokens[-2], tokens[-1])])
            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "" and len(tokens) >= 2:
            trigram_results = dict(model_trigram[(tokens[-2], tokens[-1])])
            if trigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
        if prediction == "" and len(tokens) >= 1:
            bigram_results = dict(model_bigram[tokens[-1]])
            if bigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
        if prediction == "":
            prediction = default
        output.write(prediction.replace("\n", "").strip() + "\n")
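# In[ ]:


# The dev-0 and test-A cells above repeat the same back-off logic. A minimal
# refactoring sketch, assuming the smoothed models and `default` defined earlier
# are still in scope; `predict_gap` is a hypothetical helper name, not part of
# the original notebook. Example use inside either output loop:
#     output.write(predict_gap(word_tokenize(text)) + "\n")
from collections import Counter

def predict_gap(tokens, default_prediction=default):
    candidates = []
    if len(tokens) >= 3:
        candidates.append(model.get((tokens[-3], tokens[-2], tokens[-1]), {}))
    if len(tokens) >= 2:
        candidates.append(model_trigram.get((tokens[-2], tokens[-1]), {}))
    if len(tokens) >= 1:
        candidates.append(model_bigram.get(tokens[-1], {}))
    # Return the first non-empty distribution, formatted as "word:prob" pairs.
    for results in candidates:
        if results:
            return ' '.join(f"{term}:{round(prob, 5)}"
                            for term, prob in Counter(results).most_common(6))
    return default_prediction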