XLM RoBERTa
This commit is contained in:
parent
ddce23e0d4
commit
8ce9cb5dac
9
0-prepare-data.sh
Executable file
9
0-prepare-data.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env bash

# Build data/<split>/data.tsv for every split by pasting the label file
# (expected.tsv) and the input file (in.tsv) side by side.
# `paste` joins corresponding lines with a TAB by default, so each output
# line has the form "<expected><TAB><in>".

set -e  # stop at the first failing command
set -x  # echo every command as it is executed

for dir_name in 'train' 'dev-0' 'dev-1' 'test-A'
do
    paste data/"${dir_name}"/expected.tsv data/"${dir_name}"/in.tsv > data/"${dir_name}"/data.tsv
done
|
91
1-train-base.py
Executable file
91
1-train-base.py
Executable file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import gzip
|
||||
import logging
|
||||
import lzma
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from simpletransformers.classification import ClassificationModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def open_file(path, *args, **kwargs):
    """Open ``path``, transparently handling gzip and xz compression.

    The opener is chosen from the end of the file name: names ending in
    ``gz`` use ``gzip.open``, names ending in ``xz`` use ``lzma.open`` and
    everything else uses the built-in ``open``.

    Args:
        path: file path; its ending selects the opener.
        *args: positional arguments forwarded to the opener (typically the
            mode, e.g. ``'rt'`` or ``'wt'``).
        **kwargs: keyword arguments forwarded to the opener, e.g.
            ``encoding='utf-8'``.

    Returns:
        The file object produced by the selected opener.
    """
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args, **kwargs)
|
||||
|
||||
|
||||
def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Every usable line has the form ``<label><TAB><text>``. Lines that
    contain no tab after stripping are logged and skipped.

    Args:
        path: file path (opened in text mode through ``open_file``)
        max_lines: optional number of lines to read; ``None`` reads the
            whole file

    Returns:
        DataFrame with one row per kept line: column 0 is the text,
        column 1 is the integer label

    Raises:
        ValueError: if a label cannot be converted with ``int()``
    """
    data = []

    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            # Check the limit before processing so that at most
            # ``max_lines`` input lines are consumed (not max_lines + 1).
            if max_lines is not None and i >= max_lines:
                break

            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found empty line at position {i + 1}'
                             f' - SKIP THIS LINE')
                continue

            # Split on the first tab only: the text may itself contain
            # tabs, and a larger maxsplit would break the 2-name unpacking.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # LABEL should be string number
            label_id = int(label_name)
            data.append((text, label_id))

    return pd.DataFrame(data)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: fine-tune 'xlm-roberta-base' as a 2-label
    # classifier on data/train/train.tsv, evaluating on data/dev-0/data.tsv.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')

    # None means "use the whole training file".
    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_base'
    output_dir = f'outputs-{model_name}-{seq}/'

    # Training options handed to simpletransformers.
    args = dict(
        cache_dir=f'cache_dir-{model_name}/',
        output_dir=output_dir,
        best_model_dir=f'{output_dir}best_model',
        max_seq_length=seq,
        train_batch_size=25,
        num_train_epochs=1,
        evaluate_during_training=True,
        save_steps=5000,
        evaluate_during_training_steps=5000,
        use_cached_eval_features=True,
        reprocess_input_data=False,
    )

    model = ClassificationModel(
        'xlmroberta',
        'xlm-roberta-base',
        args=args,
        num_labels=2,
        use_cuda=True,
        cuda_device=0,
    )

    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)
|
91
1-train-large.py
Executable file
91
1-train-large.py
Executable file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import gzip
|
||||
import logging
|
||||
import lzma
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from simpletransformers.classification import ClassificationModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def open_file(path, *args, **kwargs):
    """Open ``path``, transparently handling gzip and xz compression.

    The opener is chosen from the end of the file name: names ending in
    ``gz`` use ``gzip.open``, names ending in ``xz`` use ``lzma.open`` and
    everything else uses the built-in ``open``.

    Args:
        path: file path; its ending selects the opener.
        *args: positional arguments forwarded to the opener (typically the
            mode, e.g. ``'rt'`` or ``'wt'``).
        **kwargs: keyword arguments forwarded to the opener, e.g.
            ``encoding='utf-8'``.

    Returns:
        The file object produced by the selected opener.
    """
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args, **kwargs)
|
||||
|
||||
|
||||
def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Every usable line has the form ``<label><TAB><text>``. Lines that
    contain no tab after stripping are logged and skipped.

    Args:
        path: file path (opened in text mode through ``open_file``)
        max_lines: optional number of lines to read; ``None`` reads the
            whole file

    Returns:
        DataFrame with one row per kept line: column 0 is the text,
        column 1 is the integer label

    Raises:
        ValueError: if a label cannot be converted with ``int()``
    """
    data = []

    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            # Check the limit before processing so that at most
            # ``max_lines`` input lines are consumed (not max_lines + 1).
            if max_lines is not None and i >= max_lines:
                break

            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found empty line at position {i + 1}'
                             f' - SKIP THIS LINE')
                continue

            # Split on the first tab only: the text may itself contain
            # tabs, and a larger maxsplit would break the 2-name unpacking.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # LABEL should be string number
            label_id = int(label_name)
            data.append((text, label_id))

    return pd.DataFrame(data)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: fine-tune 'xlm-roberta-large' as a 2-label
    # classifier on data/train/train.tsv, evaluating on data/dev-0/data.tsv.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')

    # None means "use the whole training file".
    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_large'
    output_dir = f'outputs-{model_name}-{seq}/'

    # Training options handed to simpletransformers.
    args = dict(
        cache_dir=f'cache_dir-{model_name}/',
        output_dir=output_dir,
        best_model_dir=f'{output_dir}best_model',
        max_seq_length=seq,
        train_batch_size=10,
        num_train_epochs=1,
        evaluate_during_training=True,
        save_steps=5000,
        evaluate_during_training_steps=5000,
        use_cached_eval_features=True,
        reprocess_input_data=False,
    )

    model = ClassificationModel(
        'xlmroberta',
        'xlm-roberta-large',
        args=args,
        num_labels=2,
        use_cuda=True,
        cuda_device=0,
    )

    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)
|
82
2-eval-base.py
Executable file
82
2-eval-base.py
Executable file
@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import gzip
|
||||
import logging
|
||||
import lzma
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from simpletransformers.classification import ClassificationModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def open_file(path, *args, **kwargs):
    """Open ``path``, transparently handling gzip and xz compression.

    The opener is chosen from the end of the file name: names ending in
    ``gz`` use ``gzip.open``, names ending in ``xz`` use ``lzma.open`` and
    everything else uses the built-in ``open``.

    Args:
        path: file path; its ending selects the opener.
        *args: positional arguments forwarded to the opener (typically the
            mode, e.g. ``'rt'`` or ``'wt'``).
        **kwargs: keyword arguments forwarded to the opener, e.g.
            ``encoding='utf-8'``.

    Returns:
        The file object produced by the selected opener.
    """
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args, **kwargs)
|
||||
|
||||
|
||||
def load_test(path: str) -> List[str]:
    """Read ``path`` and return its lines with surrounding whitespace removed.

    Args:
        path: file path, opened in text mode through ``open_file``

    Returns:
        one stripped string per input line, in file order
    """
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as handle:
        return [row.strip() for row in tqdm(handle)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: load the fine-tuned base checkpoint and write,
    # for each evaluation split, one probability per input line.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_base-512']:
        seq_len = 512
        model_dir = os.path.join(model_name, 'best_model')
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        # Derive the size tag from the output directory name; 'base' is
        # checked before 'large', as in an if/elif chain.
        model_type = next(
            (kind for kind in ('base', 'large') if kind in model_name),
            None,
        )
        if model_type is None:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = dict(
            max_seq_length=seq_len,
            eval_batch_size=50,
            reprocess_input_data=True,
            sliding_window=False,
        )

        model = ClassificationModel(
            'xlmroberta',
            model_dir,
            num_labels=2,
            args=args,
            use_cuda=True,
            cuda_device=0,
        )
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            save_path = f'data/{test_name}/out-{output_name}.tsv'
            test_data = load_test(f'data/{test_name}/in.tsv')

            class_predictions, raw_outputs = model.predict(test_data)
            # Turn the raw per-class scores into probabilities row by row.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            # Keep only the probability of the second class (index 1),
            # which the original note calls the "M class".
            second_class_probs = softmax_tensor[:, 1].tolist()
            with open_file(save_path, 'wt') as w:
                for prob in second_class_probs:
                    w.write(f'{prob}\n')
|
82
2-eval-large.py
Executable file
82
2-eval-large.py
Executable file
@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import gzip
|
||||
import logging
|
||||
import lzma
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from simpletransformers.classification import ClassificationModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def open_file(path, *args, **kwargs):
    """Open ``path``, transparently handling gzip and xz compression.

    The opener is chosen from the end of the file name: names ending in
    ``gz`` use ``gzip.open``, names ending in ``xz`` use ``lzma.open`` and
    everything else uses the built-in ``open``.

    Args:
        path: file path; its ending selects the opener.
        *args: positional arguments forwarded to the opener (typically the
            mode, e.g. ``'rt'`` or ``'wt'``).
        **kwargs: keyword arguments forwarded to the opener, e.g.
            ``encoding='utf-8'``.

    Returns:
        The file object produced by the selected opener.
    """
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args, **kwargs)
|
||||
|
||||
|
||||
def load_test(path: str) -> List[str]:
    """Read ``path`` and return its lines with surrounding whitespace removed.

    Args:
        path: file path, opened in text mode through ``open_file``

    Returns:
        one stripped string per input line, in file order
    """
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as handle:
        return [row.strip() for row in tqdm(handle)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: load the fine-tuned large checkpoint and write,
    # for each evaluation split, one probability per input line.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_large-512']:
        seq_len = 512
        model_dir = os.path.join(model_name, 'best_model')
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        # Derive the size tag from the output directory name; 'base' is
        # checked before 'large', as in an if/elif chain.
        model_type = next(
            (kind for kind in ('base', 'large') if kind in model_name),
            None,
        )
        if model_type is None:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = dict(
            max_seq_length=seq_len,
            eval_batch_size=35,
            reprocess_input_data=True,
            sliding_window=False,
        )

        model = ClassificationModel(
            'xlmroberta',
            model_dir,
            num_labels=2,
            args=args,
            use_cuda=True,
            cuda_device=0,
        )
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            save_path = f'data/{test_name}/out-{output_name}.tsv'
            test_data = load_test(f'data/{test_name}/in.tsv')

            class_predictions, raw_outputs = model.predict(test_data)
            # Turn the raw per-class scores into probabilities row by row.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            # Keep only the probability of the second class (index 1),
            # which the original note calls the "M class".
            second_class_probs = softmax_tensor[:, 1].tolist()
            with open_file(save_path, 'wt') as w:
                for prob in second_class_probs:
                    w.write(f'{prob}\n')
|
137314
dev-0/out-model=xlmr_base-seq_len=512.tsv
Normal file
137314
dev-0/out-model=xlmr_base-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/out-model=xlmr_large-seq_len=512.tsv
Normal file
137314
dev-0/out-model=xlmr_large-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out-model=xlmr_base-seq_len=512.tsv
Normal file
156606
dev-1/out-model=xlmr_base-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out-model=xlmr_large-seq_len=512.tsv
Normal file
156606
dev-1/out-model=xlmr_large-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
134618
test-A/out-model=xlmr_base-seq_len=512.tsv
Normal file
134618
test-A/out-model=xlmr_base-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
134618
test-A/out-model=xlmr_large-seq_len=512.tsv
Normal file
134618
test-A/out-model=xlmr_large-seq_len=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user