fourgram 100k corpus

2024-04-23 20:46:13 +02:00 · 2024-04-23 20:46:13 +02:00 · 7743dd2472
commit 7743dd2472
parent 1450ea4378
3 changed files with 17812 additions and 17634 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@@ -0,0 +1,178 @@
 #!/usr/bin/env python
 # coding: utf-8
 # In[1]:
 import lzma
 def read_xz_file(file_path):
     """Read an xz-compressed UTF-8 text file and return its cleaned lines.

     Each line is lower-cased and then passed through a fixed sequence of
     substitutions: the literal text ``-\\n`` (hyphen, backslash, "n") is
     removed, the literal text ``\\n`` becomes a space, soft hyphens (U+00AD)
     are removed, doubled backslashes become a space, and the real newline
     terminator becomes a space.

     Args:
         file_path: Path of the ``.xz`` file to read.

     Returns:
         A list with one cleaned string per line of the file.
     """
     # Applied strictly in this order; earlier rules change what later ones see.
     substitutions = (
         ("-\\n", ""),
         ("\\n", " "),
         ("\xad", ""),
         ("\\\\n", " "),
         ("\\\\", " "),
         ("\n", " "),
     )

     def clean(raw):
         text = raw.lower()
         for old, new in substitutions:
             text = text.replace(old, new)
         return text

     with lzma.open(file_path, 'rt', encoding='utf-8') as handle:
         return [clean(raw) for raw in handle]
 # In[2]:
 def read_tsv_file(file_path):
     """Read a UTF-8 TSV file into a list of rows.

     Args:
         file_path: Path of the tab-separated text file.

     Returns:
         One list of string fields per line of the file. Surrounding
         whitespace is stripped from each line before it is split on tabs.
     """
     with open(file_path, 'r', encoding='utf-8') as source:
         # Strip the line first, then split it into its tab-separated fields.
         return [raw.strip().split('\t') for raw in source]
 # In[3]:
 # Forward slashes are accepted by Python on every platform (Windows included),
 # whereas a backslash-separated literal only resolves on Windows.
 file_path = "train/in.tsv.xz"
 # In[4]:
 data = read_xz_file(file_path)
 # In[5]:
 expected = read_tsv_file("train/expected.tsv")
 # In[6]:
 # Columns 6 and 7 of every input row are the text before and after the gap.
 corpus_before = []
 corpus_after = []
 for row in data:
     columns = row.split("\t")  # split once and reuse for both columns
     corpus_before.append(columns[6])
     corpus_after.append(columns[7])
 # In[7]:
 # read_tsv_file returns a list of fields per line. Joining the fields gives the
 # plain word; calling str() on the list would embed "['...']" (brackets and
 # quotes) into the training text.
 expected = [" ".join(fields).lower() for fields in expected]
 # In[8]:
 # Training text for each row: left context, the expected word, right context.
 corpus = [
     before + " " + word + " " + after
     for before, word, after in zip(corpus_before, expected, corpus_after)
 ]
 # In[9]:
 from collections import defaultdict
 from nltk import ngrams
 from nltk.tokenize import word_tokenize
 # model[(three tokens)][neighbouring token] -> count. Both directions are
 # accumulated into this single table: the token that precedes a trigram and
 # the token that follows a trigram.
 model = defaultdict(lambda: defaultdict(float))
 # Every token that occurs in at least one counted 4-gram.
 dictionary = set()
 # Only the first 100000 corpus lines are used.
 for sentence in corpus[:100000]:
     words = word_tokenize(sentence)
     for gram in ngrams(words, n=4, pad_right=True, pad_left=True):
         # Skip windows containing a falsy element (presumably the padding
         # symbols added by pad_left / pad_right -- confirm against nltk).
         if not all(gram):
             continue
         model[gram[1:]][gram[0]] += 1   # last three tokens -> first token
         model[gram[:3]][gram[3]] += 1   # first three tokens -> last token
         dictionary.update(gram)
 # In[10]:
 # NOTE(review): this is a shallow copy -- model2 shares the inner per-context
 # dicts with model, so the normalisation below is visible through model2 too.
 model2 = model.copy()
 # In[ ]:
 len(model)
 # In[11]:
 # Additive smoothing: every seen count gets `smoothing` added, and each
 # context's denominator gets `smoothing` once per dictionary entry.
 smoothing = 0.0001
 vocabulary_mass = smoothing * len(dictionary)
 for counts in model.values():
     denominator = sum(counts.values()) + vocabulary_mass
     # Only existing keys are reassigned, so iterating the dict here is safe.
     for word in counts:
         counts[word] = (counts[word] + smoothing) / denominator
 # In[12]:
 from collections import Counter
 # Fallback line used whenever the model has nothing to say about a context.
 default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"


 def _split_contexts(rows):
     """Return columns 6 and 7 of every tab-separated row as two lists."""
     before, after = [], []
     for row in rows:
         columns = row.split("\t")
         before.append(columns[6])
         after.append(columns[7])
     return before, after


 def _format_prediction(left_context):
     """Build one output line (without the newline) for a left context.

     Returns ``default`` when the context has fewer than three tokens or when
     ``model`` has no entry for its last three tokens; otherwise returns the
     six highest-scoring ``term:probability`` pairs separated by spaces.
     """
     tokens = word_tokenize(left_context)
     if len(tokens) < 3:
         # Not enough tokens to form a lookup key; indexing would fail.
         return default
     # The training corpus places the target word directly after the left
     # context, so the key must be the LAST three tokens, not the first three.
     # .get() is used instead of model[...] so that unseen contexts are not
     # inserted into the defaultdict as empty entries.
     candidates = model.get(tuple(tokens[-3:]))
     if not candidates:
         return default
     # Fixed-point formatting matches `default` and never prints "1e-05".
     prediction = ' '.join(
         f"{term}:{prob:.5f}" for term, prob in Counter(candidates).most_common(6))
     return prediction.replace("\n", "").strip() or default


 def _write_predictions(left_contexts, output_path):
     """Write one prediction line per left context to output_path (UTF-8)."""
     with open(output_path, "w", encoding="utf-8") as output:
         output.writelines(
             _format_prediction(text) + "\n" for text in left_contexts)


 # Forward slashes resolve on every platform; the backslash-separated paths
 # used previously only worked on Windows.
 data = read_xz_file("dev-0/in.tsv.xz")
 corpus_before, corpus_after = _split_contexts(data)
 _write_predictions(corpus_before, "dev-0/out.tsv")
 # In[13]:
 data = read_xz_file("test-A/in.tsv.xz")
 corpus_before, corpus_after = _split_contexts(data)
 _write_predictions(corpus_before, "test-A/out.tsv")
 # In[ ]:
--- a/test-A/out.tsv
+++ b/test-A/out.tsv