Neural bigrams v2

2022-05-02 02:37:51 +02:00 · 2022-05-02 02:37:51 +02:00 · 762dc9dff7
commit 762dc9dff7
parent 37775b359b
3 changed files with 17705 additions and 17916 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -22,9 +22,11 @@ import pandas as pd
 def read_train_data(file):
    data = pd.read_csv(file, sep="\t", error_bad_lines=False, index_col=0, header=None)
    with open('input_train.txt', 'w') as f:
-        for index, row in data[:500000].iterrows():
+        for index, row in data.iterrows():
-            first_part = str(row[6]).replace('\\n', '')
+            first_part = str(row[6]).replace('\\n', " ").replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")
-            sec_part = str(row[7]).replace('\\n', '')
+            first_part = first_part.replace("'", "")
            sec_part = str(row[7]).replace('\\n', " ").replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")
            sec_part = sec_part.replace("'", "")
            if first_part != 'nan':
                f.write(first_part + '\n')
            if sec_part != 'nan':
@ -53,7 +55,7 @@ from itertools import islice
      for line in fh:
         yield get_words_from_line(line)
-  vocab_size = 30000
+  vocab_size = 50000
  vocab = build_vocab_from_iterator(
      get_word_lines_from_file('input_train.txt'),
@ -113,6 +115,10 @@ class Bigrams(IterableDataset):
 train_dataset = Bigrams('input_train.shuf.txt', vocab_size)
 torch.cuda.empty_cache()
 !nvidia-smi
 from torch.utils.data import DataLoader
@ -179,12 +185,13 @@ def predict_word(word):
    to_return += f':{total}'
  return to_return
-!pip install nltk
+!pip install nltk pandas
 from nltk.tokenize import RegexpTokenizer
 tokenizer = RegexpTokenizer(r"\w+")
 import csv
 import pandas as pd
 def generate_outputs(input_file, output_file):
    data = pd.read_csv(input_file, sep='\t', error_bad_lines=False, index_col=0, header=None, quoting=csv.QUOTE_NONE)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv