s478855 - BERT
parent 4a6e13712b
commit 67f4a46156
1860    dev-0/out.tsv
File diff suppressed because it is too large
4576    run2.ipynb (new file)
File diff suppressed because one or more lines are too long
150    run2.py (new file)
@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""run2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1hC17fLfkUeCrO84M6Hvy8haJF8AdEQ0T
"""
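# Sanity-check the Colab GPU runtime before doing anything heavy: these calls
# confirm CUDA is available and report which device will be used.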
import torch

torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.get_device_name(0)
# run this cell, then restart the runtime before continuing
!pip install git+https://github.com/joeddav/transformers.git@data-collator-type-fix
!pip install nlp
!pip install transformers
!pip install datasets
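# `nlp` is the earlier name of the Hugging Face `datasets` library; both get
# installed here, and the imports below use `datasets`.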
from google.colab import drive

drive.mount('/content/drive')
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re
import torch
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
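# Data layout on Drive: train/, dev-0/ and test-A/ splits, each with an in.tsv
# of inputs; train and dev-0 also carry an expected.tsv of binary labels.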
with open('/content/drive/MyDrive/eks/train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('/content/drive/MyDrive/eks/train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()

with open('/content/drive/MyDrive/eks/dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('/content/drive/MyDrive/eks/dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()

with open('/content/drive/MyDrive/eks/test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()
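# Each in.tsv line ends with a tab-separated numeric column; strip it together
# with the newline. Label lines only need the newline removed.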
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)

for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)
y_train = list(map(int, y_train))

df = pd.DataFrame({"text": X_train, "label": y_train})
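# Train on a 10% subsample of the data (presumably to fit a Colab session),
# split 80/20 into training and in-training evaluation sets.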
df = df.sample(frac=0.1)

df80 = df.sample(frac=0.80)
df20 = df.drop(df80.index)
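# Mapping with batch_size=len(dataset) tokenizes each split in a single batch,
# so padding=True pads every example in the split to one common length.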
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset, test_dataset = Dataset.from_pandas(df80), Dataset.from_pandas(df20)
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
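# Metrics for the Trainer's eval loop: class predictions are the argmax over
# the output logits, scored with accuracy and binary precision/recall/F1.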
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
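# One epoch of fine-tuning: learning-rate warmup over the first 500 steps and
# a small weight decay, with logs written to ./logs for TensorBoard.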
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

trainer.evaluate()
# Commented out IPython magic to ensure Python compatibility.
# %load_ext tensorboard
# %tensorboard --logdir logs
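# Prepare dev-0 and test-A exactly like the training split; test-A has no gold
# labels, so a zero placeholder column keeps the dataset schema consistent.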
y_dev = list(map(int, y_dev))
y_test = [0 for _ in X_test]

df_dev = pd.DataFrame({"text": X_dev, "label": y_dev})
df_test = pd.DataFrame({"text": X_test, "label": y_test})

dev_dataset, testA_dataset = Dataset.from_pandas(df_dev), Dataset.from_pandas(df_test)

dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=len(dev_dataset))
testA_dataset = testA_dataset.map(tokenize, batched=True, batch_size=len(testA_dataset))
dev_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
testA_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
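# Trainer.predict() returns (predictions, label_ids, metrics); label_ids merely
# echoes the labels that went in (zeros for test-A), so the predicted classes
# must come from the argmax over the returned logits.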
y_pred_dev = trainer.predict(dev_dataset).predictions.argmax(-1)
y_pred_test = trainer.predict(testA_dataset).predictions.argmax(-1)
with open('/content/drive/MyDrive/eks/dev-0/out.tsv', 'wt') as f:
    for pred in y_pred_dev:
        f.write(str(pred) + '\n')

with open('/content/drive/MyDrive/eks/test-A/out.tsv', 'wt') as f:
    for pred in y_pred_test:
        f.write(str(pred) + '\n')
2690    test-A/out.tsv
File diff suppressed because it is too large