XLM RoBERTa

This commit is contained in:
Karol Kaczmarek 2020-06-14 18:35:22 +02:00
parent ddce23e0d4
commit 8ce9cb5dac
11 changed files with 857431 additions and 0 deletions

9
0-prepare-data.sh Executable file
View File

@ -0,0 +1,9 @@
#!/usr/bin/env bash
# For every data split, join expected.tsv (labels) and in.tsv (inputs)
# line by line into data.tsv. `paste` separates the two files' columns
# with a TAB, giving "<label>\t<text>" rows.
set -e  # abort on the first failing command
set -x  # echo each command as it runs
for dir_name in 'train' 'dev-0' 'dev-1' 'test-A'
do
    paste data/"${dir_name}"/expected.tsv data/"${dir_name}"/in.tsv > data/"${dir_name}"/data.tsv
done

91
1-train-base.py Executable file
View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
from typing import Optional
import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
logger = logging.getLogger(__name__)
def open_file(path, *args):
    """
    Open a file, picking the opener from the end of the path.

    Paths ending in 'gz' are opened with gzip, paths ending in 'xz' with
    lzma, and everything else with the builtin ``open``.

    Args:
        path: file path
        *args: extra positional arguments (e.g. the mode) forwarded
            unchanged to the selected opener

    Returns:
        the file object returned by the selected opener
    """
    # NOTE: the suffix is matched without a leading dot, so any path that
    # merely ends in 'gz' / 'xz' is treated as compressed.
    if path.endswith('gz'):
        return gzip.open(path, *args)
    if path.endswith('xz'):
        return lzma.open(path, *args)
    return open(path, *args)
def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Every usable line has the form ``<label>TAB<text>``. Lines without a
    TAB (after stripping surrounding whitespace) are logged and skipped.

    Args:
        path: file path
        max_lines: optional number of lines to read; skipped lines count
            towards this limit

    Returns:
        loaded data: one row per usable line, text in the first column
        and the integer label in the second

    Raises:
        ValueError: if a label cannot be parsed as an integer
    """
    data = []
    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            # Check the limit before processing so that exactly
            # ``max_lines`` lines are consumed (i is 0-based).
            if max_lines is not None and i >= max_lines:
                break

            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found empty line at position {i + 1}'
                             f' - SKIP THIS LINE')
                continue

            # Split on the first TAB only: the text itself may contain
            # TABs and must stay in one piece.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # LABEL should be string number
            label_id = int(label_name)
            data.append((text, label_id))
    return pd.DataFrame(data)
if __name__ == '__main__':
    # Entry point: fine-tune 'xlm-roberta-base' as a 2-label classifier.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')

    # Run configuration.
    model_name = 'xmlr_base'
    seq = 512
    max_lines = None  # None -> load the whole training file
    output_dir = f'outputs-{model_name}-{seq}/'

    # Training set, plus the dev-0 split used for evaluation during training.
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    args = dict(
        cache_dir=f'cache_dir-{model_name}/',
        output_dir=output_dir,
        best_model_dir=f'{output_dir}best_model',
        max_seq_length=seq,
        train_batch_size=25,
        num_train_epochs=1,
        evaluate_during_training=True,
        save_steps=5000,
        evaluate_during_training_steps=5000,
        use_cached_eval_features=True,
        reprocess_input_data=False,
    )

    model = ClassificationModel(
        'xlmroberta',
        'xlm-roberta-base',
        args=args,
        num_labels=2,
        use_cuda=True,
        cuda_device=0,
    )
    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)

91
1-train-large.py Executable file
View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
from typing import Optional
import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
logger = logging.getLogger(__name__)
def open_file(path, *args):
    """
    Open ``path`` with an opener selected by how the path ends.

    'gz' selects ``gzip.open``, 'xz' selects ``lzma.open``; any other
    path falls back to the builtin ``open``.

    Args:
        path: file path
        *args: positional arguments passed through to the opener
            (for example the mode string)

    Returns:
        whatever the chosen opener returns
    """
    # The first matching suffix wins; suffixes are matched without a dot.
    for suffix, opener in (('gz', gzip.open), ('xz', lzma.open)):
        if path.endswith(suffix):
            return opener(path, *args)
    return open(path, *args)
def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Every usable line has the form ``<label>TAB<text>``. Lines without a
    TAB (after stripping surrounding whitespace) are logged and skipped.

    Args:
        path: file path
        max_lines: optional number of lines to read; skipped lines count
            towards this limit

    Returns:
        loaded data: one row per usable line, text in the first column
        and the integer label in the second

    Raises:
        ValueError: if a label cannot be parsed as an integer
    """
    data = []
    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            # Check the limit before processing so that exactly
            # ``max_lines`` lines are consumed (i is 0-based).
            if max_lines is not None and i >= max_lines:
                break

            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found empty line at position {i + 1}'
                             f' - SKIP THIS LINE')
                continue

            # Split on the first TAB only: the text itself may contain
            # TABs and must stay in one piece.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # LABEL should be string number
            label_id = int(label_name)
            data.append((text, label_id))
    return pd.DataFrame(data)
if __name__ == '__main__':
    # Entry point: fine-tune 'xlm-roberta-large' as a 2-label classifier.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')

    # Run configuration.
    model_name = 'xmlr_large'
    seq = 512
    max_lines = None  # None -> load the whole training file
    output_dir = f'outputs-{model_name}-{seq}/'

    # Training set, plus the dev-0 split used for evaluation during training.
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    args = dict(
        cache_dir=f'cache_dir-{model_name}/',
        output_dir=output_dir,
        best_model_dir=f'{output_dir}best_model',
        max_seq_length=seq,
        train_batch_size=10,
        num_train_epochs=1,
        evaluate_during_training=True,
        save_steps=5000,
        evaluate_during_training_steps=5000,
        use_cached_eval_features=True,
        reprocess_input_data=False,
    )

    model = ClassificationModel(
        'xlmroberta',
        'xlm-roberta-large',
        args=args,
        num_labels=2,
        use_cuda=True,
        cuda_device=0,
    )
    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)

82
2-eval-base.py Executable file
View File

@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
import os
from typing import List
import torch
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
logger = logging.getLogger(__name__)
def open_file(path, *args):
    """
    Open a possibly compressed file.

    The opener is chosen from the end of ``path``: 'gz' -> gzip,
    'xz' -> lzma, anything else -> builtin ``open``.

    Args:
        path: file path
        *args: forwarded as-is to the opener (e.g. the mode)

    Returns:
        the opened file object
    """
    # Suffix comparison has no leading dot, matching any path ending in
    # 'gz' or 'xz'.
    opener = open
    if path.endswith('gz'):
        opener = gzip.open
    elif path.endswith('xz'):
        opener = lzma.open
    return opener(path, *args)
def load_test(path: str) -> List[str]:
    """
    Read a test file into a list of lines.

    Args:
        path: file path (opened through ``open_file`` in text mode)

    Returns:
        every line of the file, in order, with leading and trailing
        whitespace stripped
    """
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        # tqdm only wraps the iteration to show progress.
        return [raw_line.strip() for raw_line in tqdm(f)]
if __name__ == '__main__':
    # Entry point: load each fine-tuned model and write, for every test
    # split, one softmax probability per input line.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    seq_len = 512
    test_names = ['dev-0', 'dev-1', 'test-A']

    for model_name in ['outputs-xmlr_base-512']:
        model_dir = os.path.join(model_name, 'best_model')
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        # Derive the model size from the directory name; 'base' is
        # checked before 'large'.
        model_type = next(
            (kind for kind in ('base', 'large') if kind in model_name),
            None,
        )
        if model_type is None:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = dict(
            max_seq_length=seq_len,
            eval_batch_size=50,
            reprocess_input_data=True,
            sliding_window=False,
        )
        model = ClassificationModel(
            'xlmroberta',
            model_dir,
            num_labels=2,
            args=args,
            use_cuda=True,
            cuda_device=0,
        )
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in test_names:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'

            class_predictions, raw_outputs = model.predict(test_data)
            # Normalise the raw outputs across the class dimension.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                # Write only column 1, i.e. the second class (the
                # original note refers to it as the "M class").
                for prob in softmax_tensor[:, 1].tolist():
                    w.write(f'{prob}\n')

82
2-eval-large.py Executable file
View File

@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
import os
from typing import List
import torch
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
logger = logging.getLogger(__name__)
def open_file(path, *args):
    """
    Open ``path``, transparently handling gzip and xz files.

    The decision is based on how the path ends: 'gz' uses
    ``gzip.open``, 'xz' uses ``lzma.open``, otherwise builtin ``open``.

    Args:
        path: file path
        *args: positional arguments handed to the opener unchanged
            (typically the mode)

    Returns:
        the file object produced by the opener
    """
    # Checked in order: gzip first, then xz; no leading dot is required.
    compressed_openers = {'gz': gzip.open, 'xz': lzma.open}
    for ending, opener in compressed_openers.items():
        if path.endswith(ending):
            return opener(path, *args)
    return open(path, *args)
def load_test(path: str) -> List[str]:
    """
    Load the lines of a test input file.

    Args:
        path: file path (opened through ``open_file`` in text mode)

    Returns:
        the file's lines in their original order, each stripped of
        leading and trailing whitespace
    """
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        # tqdm is used purely as a progress indicator over the lines.
        stripped_lines = [raw_line.strip() for raw_line in tqdm(f)]
    return stripped_lines
if __name__ == '__main__':
    # Entry point: load each fine-tuned model and write, for every test
    # split, one softmax probability per input line.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    seq_len = 512
    test_names = ['dev-0', 'dev-1', 'test-A']

    for model_name in ['outputs-xmlr_large-512']:
        model_dir = os.path.join(model_name, 'best_model')
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        # Derive the model size from the directory name; 'base' is
        # checked before 'large'.
        model_type = next(
            (kind for kind in ('base', 'large') if kind in model_name),
            None,
        )
        if model_type is None:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = dict(
            max_seq_length=seq_len,
            eval_batch_size=35,
            reprocess_input_data=True,
            sliding_window=False,
        )
        model = ClassificationModel(
            'xlmroberta',
            model_dir,
            num_labels=2,
            args=args,
            use_cuda=True,
            cuda_device=0,
        )
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in test_names:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'

            class_predictions, raw_outputs = model.predict(test_data)
            # Normalise the raw outputs across the class dimension.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                # Write only column 1, i.e. the second class (the
                # original note refers to it as the "M class").
                for prob in softmax_tensor[:, 1].tolist():
                    w.write(f'{prob}\n')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff