s478855 - BERT

This commit is contained in:
ulaniuk 2022-06-20 22:11:22 +02:00
parent 4a6e13712b
commit 67f4a46156
4 changed files with 7001 additions and 2275 deletions

File diff suppressed because it is too large Load Diff

4576
run2.ipynb Normal file

File diff suppressed because one or more lines are too long

150
run2.py Normal file
View File

@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""run2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1hC17fLfkUeCrO84M6Hvy8haJF8AdEQ0T
"""
import torch
# Report the CUDA setup. As bare expressions these calls only display a value
# in a notebook cell; in a script their results are silently discarded.
# The device-specific queries are guarded because they raise when no usable
# GPU is present. The former bare `torch.cuda.device(0)` call only built an
# unused context-manager object, so it is dropped.
if torch.cuda.is_available():
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device index: {torch.cuda.current_device()}")
    print(f"CUDA device 0 name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available; running on CPU.")
# Notebook-only setup: run these commands in a Colab cell, then restart the
# runtime before continuing.
# Commented out IPython magic to ensure Python compatibility (a leading "!"
# is a shell escape that plain Python cannot parse).
# !pip install git+https://github.com/joeddav/transformers.git@data-collator-type-fix
# !pip install nlp
# !pip install transformers
# !pip install datasets
from google.colab import drive

# Mount Google Drive so paths under /content/drive become readable.
drive.mount('/content/drive')
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, load_dataset
import re
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the pretrained sequence classifier and the fast tokenizer from the same
# checkpoint name so the two always match.
_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(_checkpoint)
model = BertForSequenceClassification.from_pretrained(_checkpoint)
def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path*, newlines included."""
    with open(path, 'r', encoding='utf8') as handle:
        return handle.readlines()


# Raw input and label lines per split. Only inputs are read for test-A.
X_train = _read_lines('/content/drive/MyDrive/eks/train/in.tsv')
y_train = _read_lines('/content/drive/MyDrive/eks/train/expected.tsv')
X_dev = _read_lines('/content/drive/MyDrive/eks/dev-0/in.tsv')
y_dev = _read_lines('/content/drive/MyDrive/eks/dev-0/expected.tsv')
X_test = _read_lines('/content/drive/MyDrive/eks/test-A/in.tsv')
# Remove every "<tab><digits><newline>" sequence from the input lines.
_tab_digits_newline = re.compile(r'\t[0-9]+\n')
X_train = [_tab_digits_newline.sub('', row) for row in X_train]
X_dev = [_tab_digits_newline.sub('', row) for row in X_dev]
X_test = [_tab_digits_newline.sub('', row) for row in X_test]

# Remove newline characters from the label lines.
y_train = [row.replace('\n', '') for row in y_train]
y_dev = [row.replace('\n', '') for row in y_dev]
# Parse the label strings as integers and pair them with their texts.
y_train = [int(label) for label in y_train]
df = pd.DataFrame({"text": X_train, "label": y_train})

# Keep a random 10% of the rows, then split that subset: a random 80% goes to
# df80 and the remaining rows (selected by index) go to df20.
df = df.sample(frac=0.1)
df80 = df.sample(frac=0.80)
df20 = df.drop(index=df80.index)
def tokenize(batch):
    """Tokenize the ``'text'`` entries of *batch* with the module-level tokenizer.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    encoded = tokenizer(batch['text'], truncation=True, padding=True)
    return encoded
# Wrap the two pandas splits as datasets.
train_dataset = Dataset.from_pandas(df80)
test_dataset = Dataset.from_pandas(df20)

# Tokenize each split as ONE batch sized by its own length. `tokenize` pads
# per batch, so a single batch pads every row of a split to the same length.
# The held-out split previously reused len(train_dataset), a copy-paste slip
# that only worked because that split is the smaller one.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Expose only the listed columns, formatted as torch tensors.
for _split in (train_dataset, test_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a prediction object.

    *pred* must expose ``label_ids`` (reference labels) and ``predictions``;
    the predicted class is the argmax over the last axis of ``predictions``.
    Precision, recall and F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )
    accuracy = accuracy_score(y_true, y_hat)
    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)
# Settings for the fine-tuning run, collected in one mapping.
_training_config = {
    'output_dir': './results',
    'num_train_epochs': 1,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**_training_config)

# The trainer fine-tunes `model` on the training split and scores the held-out
# split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
# Commented out IPython magic to ensure Python compatibility.
# %load_ext tensorboard
# %tensorboard --logdir logs
# Parse the dev labels as integers.
y_dev = [int(label) for label in y_dev]
# No labels are loaded for test-A in this script; zeros are placeholders so
# both frames carry the same "text"/"label" columns.
y_test = [0] * len(X_test)

df_dev = pd.DataFrame({"text": X_dev, "label": y_dev})
df_test = pd.DataFrame({"text": X_test, "label": y_test})
dev_dataset = Dataset.from_pandas(df_dev)
testA_dataset = Dataset.from_pandas(df_test)

# Tokenize each split as ONE batch sized by its own length. `tokenize` pads
# per batch, so reusing len(train_dataset) here could cut a larger split into
# several batches padded to different lengths.
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=len(dev_dataset))
testA_dataset = testA_dataset.map(tokenize, batched=True, batch_size=len(testA_dataset))

# Expose only the listed columns, formatted as torch tensors.
for _split in (dev_dataset, testA_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
# Take the model's predicted class: the argmax over the last axis of
# `predictions`, the same rule `compute_metrics` applies. The previous code
# read `.label_ids`, which holds the labels supplied with the dataset (the
# gold dev labels, and the placeholder zeros for test-A), not model output.
y_pred_dev = trainer.predict(dev_dataset).predictions.argmax(-1)
y_pred_test = trainer.predict(testA_dataset).predictions.argmax(-1)
# Write one prediction per line for each split, dev-0 first and then test-A.
_outputs = (
    ('/content/drive/MyDrive/eks/dev-0/out.tsv', y_pred_dev),
    ('/content/drive/MyDrive/eks/test-A/out.tsv', y_pred_test),
)
for _out_path, _predictions in _outputs:
    with open(_out_path, 'wt') as _out_file:
        _out_file.writelines(str(_value) + '\n' for _value in _predictions)

File diff suppressed because it is too large Load Diff