challenging-america-word-ga.../run.ipynb

VOCABULARY

from itertools import islice
import itertools
import csv
import sys

import regex as re
import pandas as pd
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from nltk import trigrams, word_tokenize
from tqdm import tqdm

DATA

VOCAB_SIZE = 20000   # maximum vocabulary size
EMBED_SIZE = 100     # word-embedding dimensionality
CONTEXT_SIZE = 2     # number of preceding words used as context
# hidden units
H = 100
def get_words_from_line(line):
    # Tokenize a cleaned line into lowercase word/punctuation tokens, wrapped in sentence markers.
    line = clean(line)
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_word_lines_from_file(file_name):
    with open(file_name, 'r') as fh:
        for line in fh:
            yield get_words_from_line(line)

def clean(text):
    # The corpus encodes line breaks as the literal two-character sequence "\n";
    # drop hyphenated breaks, turn the remaining ones into spaces, then strip punctuation.
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train-300k.txt'),
    max_tokens = VOCAB_SIZE,
    specials = ['<unk>'])
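
As a quick sanity check (not part of the original run), the built torchtext vocabulary can be inspected; the exact tokens depend on the corpus, but '<unk>' should sit at index 0 and the size should not exceed VOCAB_SIZE:

# Illustrative inspection of the vocabulary (results depend on the corpus).
print(len(vocab))                            # at most VOCAB_SIZE
print(vocab['<unk>'])                        # index of the special token, expected 0
print(vocab.lookup_tokens(list(range(10))))  # ten most frequent tokens
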
!shuf < train-300k.txt > train-300k.shuf.txt

NETWORK

class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, context_size, h):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.context_size = context_size
        self.embedding_size = embedding_size
        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear1 = nn.Linear(context_size * embedding_size, h)
        self.linear2 = nn.Linear(h, vocabulary_size, bias = False)
        # dim=1 normalizes over the vocabulary axis and avoids the implicit-dim warning
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # x: (batch, context_size) word indices -> concatenated context embeddings
        embeds = self.embeddings(x).view((-1, self.context_size * self.embedding_size))
        out = torch.tanh(self.linear1(embeds))
        out = self.linear2(out)
        probs = self.softmax(out)  # probabilities, not log-probabilities
        return probs
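
The forward pass returns softmax probabilities, and the training loop below applies torch.log before NLLLoss. An equivalent but numerically more stable variant (a sketch only, not what this notebook actually ran) is to train on raw logits with CrossEntropyLoss; forward_logits is a hypothetical helper added here for illustration:

# Sketch: CrossEntropyLoss on logits matches NLLLoss on log-softmax, but avoids log(0).
criterion_alt = nn.CrossEntropyLoss()

def forward_logits(model, x):
    # Same computation as the model's forward, minus the final softmax.
    embeds = model.embeddings(x).view((-1, model.context_size * model.embedding_size))
    return model.linear2(torch.tanh(model.linear1(embeds)))

# loss = criterion_alt(forward_logits(model, x), y)
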


def look_ahead_iterator(gen):
    # Slide a two-word window over the token stream, yielding (w_i-2, w_i-1, w_i) trigrams.
    prev_1 = None
    prev_2 = None
    for item in gen:
        if prev_1 is not None and prev_2 is not None:
            yield (prev_1, prev_2, item)
        prev_1, prev_2 = prev_2, item
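
A quick illustrative check (not in the original run) that the iterator yields sliding trigrams rather than reusing the first two tokens as a fixed context:

# Expected output: [('<s>', 'a', 'b'), ('a', 'b', 'c'), ('b', 'c', '</s>')]
print(list(look_ahead_iterator(['<s>', 'a', 'b', 'c', '</s>'])))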

class Trigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            self.vocab[t]
            for t in itertools.chain.from_iterable(
                get_word_lines_from_file(self.text_file)))
model = SimpleTrigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE, CONTEXT_SIZE, H)
vocab.set_default_index(vocab['<unk>'])
# def decrease_train_set_size(lines_amount):
#     lines = []
#     with open('train.txt', 'r') as fh:
#             for line in fh:
#                 lines.append(line)
#                 lines_amount -= 1
#                 if(lines_amount == 0):
#                     break
#     with open('train-300k.txt', 'w') as fh:
#             for line in lines:
#                 fh.write(line)
#                 fh.write('\n')       
                
# decrease_train_set_size(300000)
train_dataset = Trigrams('train-300k.shuf.txt', VOCAB_SIZE)
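
To confirm that the dataset streams triples of word indices, the first few items can be peeked at with islice (illustrative; the actual indices depend on the vocabulary):

# Peek at the first three (context1, context2, target) index triples.
print(list(islice(train_dataset, 3)))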

FIRST TRAINING

device = 'cpu'
model = SimpleTrigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE, CONTEXT_SIZE, H).to(device)
data = DataLoader(train_dataset, batch_size=5000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x1, x2, y in data:
    # dim=1: each row holds the two context-word indices of one example
    x = torch.stack((x1, x2), dim=1)
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    loss = criterion(torch.log(ypredicted), y)
    if step % 100 == 0:
        print(step, loss)
    step += 1
    loss.backward()
    optimizer.step()

torch.save(model.state_dict(), 'model1.bin')
0 tensor(9.9723, grad_fn=<NllLossBackward0>)
100 tensor(6.7220, grad_fn=<NllLossBackward0>)
200 tensor(6.7379, grad_fn=<NllLossBackward0>)
300 tensor(6.6070, grad_fn=<NllLossBackward0>)
400 tensor(6.8277, grad_fn=<NllLossBackward0>)
500 tensor(6.7266, grad_fn=<NllLossBackward0>)
600 tensor(6.5761, grad_fn=<NllLossBackward0>)
700 tensor(6.7834, grad_fn=<NllLossBackward0>)
800 tensor(6.6698, grad_fn=<NllLossBackward0>)
900 tensor(6.6914, grad_fn=<NllLossBackward0>)
1000 tensor(6.6602, grad_fn=<NllLossBackward0>)
1100 tensor(6.6094, grad_fn=<NllLossBackward0>)
1200 tensor(6.6578, grad_fn=<NllLossBackward0>)
1300 tensor(6.7287, grad_fn=<NllLossBackward0>)
1400 tensor(6.7498, grad_fn=<NllLossBackward0>)
1500 tensor(6.6921, grad_fn=<NllLossBackward0>)
1600 tensor(6.8111, grad_fn=<NllLossBackward0>)
1700 tensor(6.5730, grad_fn=<NllLossBackward0>)
1800 tensor(6.8943, grad_fn=<NllLossBackward0>)
1900 tensor(6.6941, grad_fn=<NllLossBackward0>)
2000 tensor(6.6191, grad_fn=<NllLossBackward0>)
2100 tensor(6.7121, grad_fn=<NllLossBackward0>)
2200 tensor(6.7634, grad_fn=<NllLossBackward0>)
2300 tensor(6.4635, grad_fn=<NllLossBackward0>)
2400 tensor(6.5395, grad_fn=<NllLossBackward0>)
2500 tensor(6.4345, grad_fn=<NllLossBackward0>)
2600 tensor(6.5351, grad_fn=<NllLossBackward0>)
2700 tensor(6.5506, grad_fn=<NllLossBackward0>)
2800 tensor(6.7326, grad_fn=<NllLossBackward0>)
2900 tensor(6.8471, grad_fn=<NllLossBackward0>)
3000 tensor(6.7213, grad_fn=<NllLossBackward0>)
3100 tensor(6.6697, grad_fn=<NllLossBackward0>)
3200 tensor(6.5017, grad_fn=<NllLossBackward0>)
3300 tensor(6.5743, grad_fn=<NllLossBackward0>)
3400 tensor(6.6956, grad_fn=<NllLossBackward0>)
3500 tensor(6.6287, grad_fn=<NllLossBackward0>)
3600 tensor(6.7025, grad_fn=<NllLossBackward0>)
3700 tensor(6.5614, grad_fn=<NllLossBackward0>)
3800 tensor(6.6845, grad_fn=<NllLossBackward0>)
3900 tensor(6.5496, grad_fn=<NllLossBackward0>)
4000 tensor(6.7311, grad_fn=<NllLossBackward0>)
4100 tensor(6.7542, grad_fn=<NllLossBackward0>)
4200 tensor(6.6301, grad_fn=<NllLossBackward0>)
4300 tensor(6.6172, grad_fn=<NllLossBackward0>)
4400 tensor(6.6682, grad_fn=<NllLossBackward0>)
4500 tensor(6.7084, grad_fn=<NllLossBackward0>)
4600 tensor(6.7211, grad_fn=<NllLossBackward0>)
4700 tensor(6.6184, grad_fn=<NllLossBackward0>)
4800 tensor(6.7681, grad_fn=<NllLossBackward0>)
4900 tensor(6.5696, grad_fn=<NllLossBackward0>)
5000 tensor(6.7709, grad_fn=<NllLossBackward0>)
5100 tensor(6.7046, grad_fn=<NllLossBackward0>)
5200 tensor(6.6074, grad_fn=<NllLossBackward0>)
5300 tensor(6.5759, grad_fn=<NllLossBackward0>)
5400 tensor(6.6311, grad_fn=<NllLossBackward0>)
5500 tensor(6.6226, grad_fn=<NllLossBackward0>)
5600 tensor(6.7386, grad_fn=<NllLossBackward0>)
5700 tensor(6.7140, grad_fn=<NllLossBackward0>)
5800 tensor(6.5882, grad_fn=<NllLossBackward0>)
5900 tensor(6.6433, grad_fn=<NllLossBackward0>)
6000 tensor(6.6778, grad_fn=<NllLossBackward0>)
6100 tensor(6.7124, grad_fn=<NllLossBackward0>)
6200 tensor(6.5534, grad_fn=<NllLossBackward0>)
6300 tensor(6.7435, grad_fn=<NllLossBackward0>)
6400 tensor(6.6929, grad_fn=<NllLossBackward0>)
6500 tensor(6.6021, grad_fn=<NllLossBackward0>)
9800 tensor(6.6594, grad_fn=<NllLossBackward0>)
9900 tensor(6.6072, grad_fn=<NllLossBackward0>)
10000 tensor(6.6441, grad_fn=<NllLossBackward0>)
10100 tensor(6.7004, grad_fn=<NllLossBackward0>)
10200 tensor(6.6086, grad_fn=<NllLossBackward0>)
10300 tensor(6.6379, grad_fn=<NllLossBackward0>)
10400 tensor(6.6874, grad_fn=<NllLossBackward0>)
10500 tensor(6.5827, grad_fn=<NllLossBackward0>)
10600 tensor(6.8673, grad_fn=<NllLossBackward0>)
10700 tensor(6.7024, grad_fn=<NllLossBackward0>)
10800 tensor(6.6442, grad_fn=<NllLossBackward0>)
10900 tensor(6.6290, grad_fn=<NllLossBackward0>)
11000 tensor(6.6476, grad_fn=<NllLossBackward0>)
11100 tensor(6.6478, grad_fn=<NllLossBackward0>)
11200 tensor(6.6045, grad_fn=<NllLossBackward0>)
11300 tensor(6.7457, grad_fn=<NllLossBackward0>)
11400 tensor(6.7079, grad_fn=<NllLossBackward0>)
11500 tensor(6.7284, grad_fn=<NllLossBackward0>)
11600 tensor(6.6763, grad_fn=<NllLossBackward0>)
11700 tensor(6.7629, grad_fn=<NllLossBackward0>)
11800 tensor(6.6131, grad_fn=<NllLossBackward0>)
11900 tensor(6.7501, grad_fn=<NllLossBackward0>)
12000 tensor(6.7680, grad_fn=<NllLossBackward0>)
12100 tensor(6.5243, grad_fn=<NllLossBackward0>)
12200 tensor(6.8293, grad_fn=<NllLossBackward0>)
12300 tensor(6.7489, grad_fn=<NllLossBackward0>)
12400 tensor(6.5965, grad_fn=<NllLossBackward0>)
12500 tensor(6.7072, grad_fn=<NllLossBackward0>)
12600 tensor(6.5717, grad_fn=<NllLossBackward0>)
12700 tensor(6.5866, grad_fn=<NllLossBackward0>)
12800 tensor(6.6545, grad_fn=<NllLossBackward0>)
12900 tensor(6.6316, grad_fn=<NllLossBackward0>)
13000 tensor(6.7430, grad_fn=<NllLossBackward0>)
13100 tensor(6.6374, grad_fn=<NllLossBackward0>)
13200 tensor(6.8072, grad_fn=<NllLossBackward0>)
13300 tensor(6.6405, grad_fn=<NllLossBackward0>)
13400 tensor(6.4336, grad_fn=<NllLossBackward0>)
13500 tensor(6.8307, grad_fn=<NllLossBackward0>)
13600 tensor(6.6995, grad_fn=<NllLossBackward0>)
13700 tensor(6.6014, grad_fn=<NllLossBackward0>)
13800 tensor(6.8696, grad_fn=<NllLossBackward0>)
13900 tensor(6.7318, grad_fn=<NllLossBackward0>)
14000 tensor(6.7131, grad_fn=<NllLossBackward0>)
14100 tensor(6.6522, grad_fn=<NllLossBackward0>)
14200 tensor(6.7575, grad_fn=<NllLossBackward0>)
14300 tensor(6.8551, grad_fn=<NllLossBackward0>)
14400 tensor(6.6822, grad_fn=<NllLossBackward0>)
14500 tensor(6.6775, grad_fn=<NllLossBackward0>)
14600 tensor(6.6479, grad_fn=<NllLossBackward0>)
14700 tensor(6.4803, grad_fn=<NllLossBackward0>)
14800 tensor(6.7839, grad_fn=<NllLossBackward0>)
14900 tensor(6.7139, grad_fn=<NllLossBackward0>)
15000 tensor(6.6516, grad_fn=<NllLossBackward0>)
15100 tensor(6.7407, grad_fn=<NllLossBackward0>)
15200 tensor(6.7723, grad_fn=<NllLossBackward0>)
15300 tensor(6.6148, grad_fn=<NllLossBackward0>)
15400 tensor(6.5857, grad_fn=<NllLossBackward0>)
15500 tensor(6.5296, grad_fn=<NllLossBackward0>)
15600 tensor(6.5889, grad_fn=<NllLossBackward0>)
15700 tensor(6.5253, grad_fn=<NllLossBackward0>)
15800 tensor(6.4268, grad_fn=<NllLossBackward0>)
15900 tensor(6.8929, grad_fn=<NllLossBackward0>)
16000 tensor(6.6716, grad_fn=<NllLossBackward0>)
16100 tensor(6.8106, grad_fn=<NllLossBackward0>)
16200 tensor(6.6950, grad_fn=<NllLossBackward0>)
16300 tensor(6.4829, grad_fn=<NllLossBackward0>)
16400 tensor(6.7628, grad_fn=<NllLossBackward0>)
16500 tensor(6.7170, grad_fn=<NllLossBackward0>)
16600 tensor(6.7825, grad_fn=<NllLossBackward0>)
16700 tensor(6.6456, grad_fn=<NllLossBackward0>)
16800 tensor(6.7462, grad_fn=<NllLossBackward0>)
16900 tensor(6.7378, grad_fn=<NllLossBackward0>)
17000 tensor(6.7779, grad_fn=<NllLossBackward0>)
17100 tensor(6.7084, grad_fn=<NllLossBackward0>)
17200 tensor(6.7092, grad_fn=<NllLossBackward0>)
17300 tensor(6.5689, grad_fn=<NllLossBackward0>)
17400 tensor(6.6913, grad_fn=<NllLossBackward0>)
17500 tensor(6.6689, grad_fn=<NllLossBackward0>)
17600 tensor(6.6477, grad_fn=<NllLossBackward0>)
17700 tensor(6.5198, grad_fn=<NllLossBackward0>)
17800 tensor(6.7119, grad_fn=<NllLossBackward0>)
17900 tensor(6.6387, grad_fn=<NllLossBackward0>)
18000 tensor(6.5867, grad_fn=<NllLossBackward0>)
18100 tensor(6.7001, grad_fn=<NllLossBackward0>)
18200 tensor(6.5907, grad_fn=<NllLossBackward0>)
18300 tensor(6.5760, grad_fn=<NllLossBackward0>)
18400 tensor(6.8202, grad_fn=<NllLossBackward0>)
18500 tensor(6.5118, grad_fn=<NllLossBackward0>)
18600 tensor(6.6571, grad_fn=<NllLossBackward0>)
18700 tensor(6.7786, grad_fn=<NllLossBackward0>)
18800 tensor(6.6524, grad_fn=<NllLossBackward0>)
18900 tensor(6.6925, grad_fn=<NllLossBackward0>)
19000 tensor(6.6848, grad_fn=<NllLossBackward0>)
19100 tensor(6.6592, grad_fn=<NllLossBackward0>)
model
SimpleTrigramNeuralLanguageModel(
  (embeddings): Embedding(20000, 100)
  (linear1): Linear(in_features=200, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=20000, bias=False)
  (softmax): Softmax(dim=1)
)
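
The saved weights can be restored later without retraining; a minimal sketch, assuming the same hyperparameters as above:

# Recreate the architecture and load the saved state dict (sketch).
model = SimpleTrigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE, CONTEXT_SIZE, H).to(device)
model.load_state_dict(torch.load('model1.bin', map_location=device))
model.eval()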

Predictions

def predict(words):
    vocab = train_dataset.vocab
    ixs = torch.tensor(vocab.forward(words)).to(device)

    # Inference only, so no gradients are needed.
    with torch.no_grad():
        predictions = model(ixs)
    top_predictions = torch.topk(predictions[0], 5)
    top_indices = top_predictions.indices.tolist()
    top_probs = top_predictions.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    result_list = list(zip(top_words, top_probs))

    total_prob = 0.0
    str_prediction = ""

    for word, prob in result_list:
        total_prob += prob
        str_prediction += f"{word}:{prob} "

    if not total_prob:
        return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    # Reserve the remaining probability mass for the unknown-word bucket.
    if 1 - total_prob >= 0.01:
        str_prediction += f":{1-total_prob}"
    else:
        str_prediction += ":0.01"
    return str_prediction
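
predict can be called directly on a two-word context; the return value is a space-separated list of word:probability pairs plus a trailing :mass entry for the remaining probability (illustrative call, the actual words depend on the trained weights):

# Illustrative usage (not part of the original run); output format:
#   word1:p1 word2:p2 ... :remaining_mass
print(predict(['he', 'said']))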

def predict_data(read_path, save_path):
    data = pd.read_csv(
        read_path, sep="\t", on_bad_lines="skip", header=None, quoting=csv.QUOTE_NONE
    )
    with open(save_path, "w", encoding="utf-8") as file:
        for _, row in tqdm(data.iterrows()):
            words = word_tokenize(clean(row[6]))
            if len(words) < 3:
                prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                # Use the last two tokens of the left context as the trigram context.
                prediction = predict(words[-2:])
            file.write(prediction + "\n")
print("Predicting...")
print("Dev set")
predict_data("dev-0/in.tsv.xz", "dev-0/out.tsv")
Predicting...
Dev set
10519it [00:17, 584.68it/s]
print("Test set")
predict_data("test-A/in.tsv.xz", "test-A/out.tsv")
Test set
7414it [00:11, 640.42it/s]