challenging-america-word-ga.../run.py

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import lzma
def read_xz_file(file_path):
    """Read an xz-compressed UTF-8 text file and return its cleaned lines.

    Every line is lower-cased and then passed through a fixed sequence of
    substring replacements. Note that ``"\\n"`` in the table below is the
    two-character sequence backslash + ``n`` found in the data, whereas
    ``"\n"`` is the real line terminator.

    Args:
        file_path: Path of the ``.xz`` file to read.

    Returns:
        A list with one cleaned string per line of the file. The line
        terminator is replaced by a space, so lines keep a trailing space.
    """
    # The rules are applied strictly in this order; each rule sees the
    # output of the previous ones, so reordering would change the result.
    substitutions = (
        ("-\\n", ""),    # hyphen + escaped line break: glue the parts together
        ("\\n", " "),    # escaped line break becomes a space
        ("\xad", ""),    # drop soft hyphens
        ("\\\\n", " "),  # two backslashes followed by "n"
        ("\\\\", " "),   # two consecutive backslashes
        ("\n", " "),     # the real end-of-line character
    )
    cleaned_lines = []
    with lzma.open(file_path, 'rt', encoding='utf-8') as f:
        for raw_line in f:
            text = raw_line.lower()
            for old, new in substitutions:
                text = text.replace(old, new)
            cleaned_lines.append(text)
    return cleaned_lines


# In[2]:


def read_tsv_file(file_path):
    """Read a UTF-8 tab-separated file into a list of rows.

    Each line is stripped of leading and trailing whitespace and then split
    on tab characters.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's fields (an empty line yields ``['']``).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [raw_line.strip().split('\t') for raw_line in file]


# In[3]:


# Forward slashes are accepted as path separators on Windows as well as on
# POSIX systems, whereas a hard-coded backslash only resolves on Windows.
file_path = "train/in.tsv.xz"


# In[4]:


# Cleaned, lower-cased training lines (one string per input line).
data = read_xz_file(file_path)


# In[5]:


# One row (list of tab-separated fields) per training line.
expected = read_tsv_file("train/expected.tsv")


# In[6]:


# Collect columns 6 and 7 of every tab-separated training line; judging by
# the variable names these are the text before and after the gap.
corpus_before = []
corpus_after = []
for row in data:
    fields = row.split("\t")  # split once, then pick both columns
    corpus_before.append(fields[6])
    corpus_after.append(fields[7])


# In[7]:


# Each entry of `expected` is the list of fields returned by read_tsv_file.
# Calling str() on that list would put its repr (e.g. "['word']") into the
# corpus, so join the fields back into plain text before lower-casing.
for i, row in enumerate(expected):
    expected[i] = " ".join(row).lower()


# In[8]:


# Rebuild each full training sentence: left context, expected word, right
# context, separated by single spaces. Indexing by position (rather than
# zipping) keeps a length mismatch between the lists a hard error.
corpus = [
    " ".join((corpus_before[i], expected[i], corpus_after[i]))
    for i in range(len(expected))
]


# In[9]:


from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import word_tokenize

# Trigram counts: a pair of adjacent tokens maps to the counts of a
# neighbouring token. Every trigram (first, second, third) is recorded twice
# in the same table: `first` under key (second, third) and `third` under key
# (first, second).
model_trigram = defaultdict(lambda: defaultdict(float))
dictionary_trigram = set()  # every token that took part in a counted trigram
for line in corpus[:200000]:  # only the first 200000 corpus lines are used
    tokens = word_tokenize(line)
    for triple in trigrams(tokens, pad_right=True, pad_left=True):
        # Skip trigrams with a falsy element (e.g. the padding positions
        # requested via pad_left/pad_right).
        if not all(triple):
            continue
        first, second, third = triple
        model_trigram[(second, third)][first] += 1
        model_trigram[(first, second)][third] += 1
        dictionary_trigram.update(triple)


# In[10]:


from collections import defaultdict
from nltk import bigrams
from nltk.tokenize import word_tokenize

# Bigram counts: a token maps to the counts of its neighbours. Every bigram
# (left, right) is recorded in both directions in the same table.
model_bigram = defaultdict(lambda: defaultdict(float))
dictionary_bigram = set()  # every token that took part in a counted bigram
for line in corpus[:200000]:  # only the first 200000 corpus lines are used
    tokens = word_tokenize(line)
    for pair in bigrams(tokens, pad_right=True, pad_left=True):
        # Skip bigrams with a falsy element (e.g. the padding positions
        # requested via pad_left/pad_right).
        if not all(pair):
            continue
        left, right = pair
        model_bigram[right][left] += 1
        model_bigram[left][right] += 1
        dictionary_bigram.update(pair)


# In[11]:


# Turn the raw trigram counts into additively smoothed probabilities, in place:
#   p(token | context) = (count + smoothing) / (total count + smoothing * |V|)
# where |V| is the number of distinct tokens in dictionary_trigram.
smoothing = 0.0001
for trio, counts in model_trigram.items():
    # The denominator must be computed before any count is overwritten.
    count_sum = sum(counts.values()) + smoothing * len(dictionary_trigram)
    for token in counts:
        counts[token] = (counts[token] + smoothing) / count_sum


# In[12]:


# Turn the raw bigram counts into additively smoothed probabilities, in place:
#   p(token | context) = (count + smoothing) / (total count + smoothing * |V|)
# where |V| is the number of distinct tokens in dictionary_bigram.
smoothing = 0.0001
for trio, counts in model_bigram.items():
    # The denominator must be computed before any count is overwritten.
    count_sum = sum(counts.values()) + smoothing * len(dictionary_bigram)
    for token in counts:
        counts[token] = (counts[token] + smoothing) / count_sum


# In[19]:


from collections import Counter

# Fallback output line used when neither model knows the context.
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"


def _load_contexts(in_path):
    """Read an input file and return ``(lines, before, after)``.

    ``lines`` are the cleaned lines from read_xz_file; ``before`` and
    ``after`` hold columns 6 and 7 of every tab-separated line.

    Raises:
        IndexError: If a line has fewer than eight tab-separated columns.
    """
    lines = read_xz_file(in_path)
    before, after = [], []
    for row in lines:
        fields = row.split("\t")
        before.append(fields[6])
        after.append(fields[7])
    return lines, before, after


def _format_candidates(distribution, limit=6):
    """Render the `limit` most probable tokens as ``token:prob`` pairs."""
    return ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(distribution).most_common(limit))


def _predict_gap(left_context):
    """Return the output line for the word that follows `left_context`.

    The trigram model is tried first, then the bigram model, and finally
    the fixed `default` line.
    """
    tokens = word_tokenize(left_context)

    # The training corpus is "before + word + after", so the predicted word
    # directly follows the left context: condition on its LAST tokens, not
    # its first ones.
    # NOTE: both models also hold reverse-direction counts under the same keys.
    # .get() is used instead of indexing so that unseen contexts do not add
    # empty entries to the defaultdict models.
    if len(tokens) >= 2:
        candidates = model_trigram.get((tokens[-2], tokens[-1]))
        if candidates:
            return _format_candidates(candidates)

    # Guarding on `tokens` avoids an IndexError for an empty left context.
    if tokens:
        candidates = model_bigram.get(tokens[-1])
        if candidates:
            return _format_candidates(candidates)

    return default


def _write_predictions(contexts, out_path):
    """Write one prediction line per left context to `out_path`."""
    with open(out_path, "w", encoding="utf-8") as output:
        for text in contexts:
            prediction = _predict_gap(text)
            output.write(str(prediction.replace("\n", "").strip() + "\n"))


# Forward slashes work as path separators on both Windows and POSIX.
data, corpus_before, corpus_after = _load_contexts("dev-0/in.tsv.xz")
_write_predictions(corpus_before, "dev-0/out.tsv")


# In[ ]:


# In[22]:


from collections import Counter

data, corpus_before, corpus_after = _load_contexts("test-A/in.tsv.xz")
_write_predictions(corpus_before, "test-A/out.tsv")


# In[ ]:
fourgram 100k corpus 2024-04-23 20:46:13 +02:00			`#!/usr/bin/env python`
			`# coding: utf-8`

			`# In[1]:`


			`import lzma`
			`def read_xz_file(file_path):`
			`data = []`
			`with lzma.open(file_path, 'rt', encoding='utf-8') as f:`
			`for line in f:`
			`line = line.lower().replace("-\\n", "").replace("\\n", " ").replace("\xad", "").replace("\\\\n", " ").replace("\\\\", " ").replace("\n", " ")`
			`data.append(line)`
			`return data`


			`# In[2]:`


			`def read_tsv_file(file_path):`
			`data = []`
			`with open(file_path, 'r', encoding='utf-8') as file:`
			`for line in file:`
452662 2024-04-23 21:23:41 +02:00			`line = line.strip().split('\t') # Rozdziel linie na elementy za pomocą tabulatora`
			`data.append(line) # Dodaj elementy do listy danych`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00			`return data`


			`# In[3]:`


			`file_path = "train\\in.tsv.xz"`


			`# In[4]:`


			`data = read_xz_file(file_path)`


			`# In[5]:`


			`expected = read_tsv_file("train\\expected.tsv")`


			`# In[6]:`


			`corpus_before=[]`
			`corpus_after=[]`
			`for i in range(len(data)):`
			`corpus_before.append(str(data[i].split("\t")[6]))`
			`corpus_after.append(str(data[i].split("\t")[7]))`


			`# In[7]:`


			`for i in range(len(expected)):`
			`expected[i] = str(expected[i]).lower()`


			`# In[8]:`


			`corpus = []`
			`for i in range(len(expected)):`
			`corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])`


			`# In[9]:`


452662 2024-04-23 21:23:41 +02:00			`from collections import defaultdict`
			`from nltk import trigrams`
			`from nltk.tokenize import word_tokenize`

			`model_trigram = defaultdict(lambda: defaultdict(float))`
			`dictionary_trigram = set()`
452662 trigram 2024-04-24 14:20:00 +02:00			`for line in corpus[:200000]:`
452662 2024-04-23 21:23:41 +02:00			`tokens = word_tokenize(line)`
			`for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):`
			`if word1 and word2 and word3:`
			`model_trigram[(word2, word3)][word1] += 1`
			`model_trigram[(word1, word2)][word3] += 1`
			`dictionary_trigram.update([word1, word2, word3])`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00

452662 trigram 2024-04-24 14:20:00 +02:00			`# In[10]:`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00

452662 2024-04-23 21:23:41 +02:00			`from collections import defaultdict`
			`from nltk import bigrams`
			`from nltk.tokenize import word_tokenize`

			`model_bigram = defaultdict(lambda: defaultdict(float))`
			`dictionary_bigram = set()`
452662 trigram 2024-04-24 14:20:00 +02:00			`for line in corpus[:200000]:`
452662 2024-04-23 21:23:41 +02:00			`tokens = word_tokenize(line)`
			`for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):`
			`if word1 and word2:`
			`model_bigram[word2][word1] += 1`
			`model_bigram[word1][word2] += 1`
			`dictionary_bigram.update([word1, word2])`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00

			`# In[11]:`


452662 2024-04-23 21:23:41 +02:00			`smoothing = 0.0001`
			`for trio in model_trigram:`
			`count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)`
			`for token in model_trigram[trio]:`
			`model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum`


452662 trigram 2024-04-24 14:20:00 +02:00			`# In[12]:`
452662 2024-04-23 21:23:41 +02:00

			`smoothing = 0.0001`
			`for trio in model_bigram:`
			`count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)`
			`for token in model_bigram[trio]:`
			`model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum`


452662 trigram 2024-04-24 14:20:00 +02:00			`# In[19]:`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00

			`from collections import Counter`

452662 trigram 2024-04-24 14:20:00 +02:00			`default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
			`data = read_xz_file("dev-0\\in.tsv.xz")`
			`corpus_before=[]`
			`corpus_after=[]`
			`for i in range(len(data)):`
			`corpus_before.append(str(data[i].split("\t")[6]))`
			`corpus_after.append(str(data[i].split("\t")[7]))`

			`with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:`
			`for text in corpus_before:`
			`tokens = word_tokenize(text)`
452662 2024-04-23 21:23:41 +02:00			`prediction = ""`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
452662 trigram 2024-04-24 14:20:00 +02:00			`if len(tokens) >= 3:`
			`results = dict(model_trigram[(tokens[0], tokens[1])])`
452662 2024-04-23 21:23:41 +02:00			`if results:`
			`prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))`

			`if prediction == "":`
			`bigram_results = dict(model_bigram[tokens[0]])`
			`if bigram_results:`
			`prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
			`if prediction == "":`
			`prediction = default`
452662 2024-04-23 21:23:41 +02:00
fourgram 100k corpus 2024-04-23 20:46:13 +02:00			`output.write(str(prediction.replace("\n", "").strip() + "\n"))`


452662 2024-04-23 21:23:41 +02:00			`# In[ ]:`





452662 trigram 2024-04-24 14:20:00 +02:00			`# In[22]:`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00

			`from collections import Counter`

452662 trigram 2024-04-24 14:20:00 +02:00			`default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
			`data = read_xz_file("test-A\\in.tsv.xz")`
			`corpus_before=[]`
			`corpus_after=[]`
			`for i in range(len(data)):`
			`corpus_before.append(str(data[i].split("\t")[6]))`
			`corpus_after.append(str(data[i].split("\t")[7]))`

			`with open("test-A\\out.tsv", "w", encoding="utf-8") as output:`
			`for text in corpus_before:`
			`tokens = word_tokenize(text)`
452662 2024-04-23 21:23:41 +02:00			`prediction = ""`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
452662 trigram 2024-04-24 14:20:00 +02:00			`if len(tokens) >= 3:`
			`results = dict(model_trigram[(tokens[0], tokens[1])])`
452662 2024-04-23 21:23:41 +02:00			`if results:`
			`prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))`

			`if prediction == "":`
			`bigram_results = dict(model_bigram[tokens[0]])`
			`if bigram_results:`
			`prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))`
fourgram 100k corpus 2024-04-23 20:46:13 +02:00
			`if prediction == "":`
			`prediction = default`
452662 2024-04-23 21:23:41 +02:00
fourgram 100k corpus 2024-04-23 20:46:13 +02:00			`output.write(str(prediction.replace("\n", "").strip() + "\n"))`


			`# In[ ]:`