Compare commits
No commits in common. "xlm_roberta" and "master" have entirely different histories.
xlm_roberta
...
master
@@ -1,9 +0,0 @@
#!/usr/bin/env bash

set -e
set -x

for dir_name in 'train' 'dev-0' 'dev-1' 'test-A'
do
    paste data/"${dir_name}"/expected.tsv data/"${dir_name}"/in.tsv > data/"${dir_name}"/data.tsv
done
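For reference, the merge that `paste` performs here is a line-by-line zip of the label file and the text file, joined with a tab. A minimal pure-Python sketch of the same step (the merge_tsv helper is hypothetical, not part of the repository):

# Hypothetical equivalent of the `paste` call above: zip expected.tsv
# (one label per line) with in.tsv (one text per line) into the
# label<TAB>text layout that data.tsv is expected to have.
def merge_tsv(expected_path: str, in_path: str, out_path: str) -> None:
    with open(expected_path, 'rt') as labels, \
            open(in_path, 'rt') as texts, \
            open(out_path, 'wt') as out:
        for label, text in zip(labels, texts):
            out.write(f'{label.rstrip()}\t{text.rstrip()}\n')

for dir_name in ['train', 'dev-0', 'dev-1', 'test-A']:
    merge_tsv(f'data/{dir_name}/expected.tsv',
              f'data/{dir_name}/in.tsv',
              f'data/{dir_name}/data.tsv')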
@@ -1,91 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gzip
import logging
import lzma
from typing import Optional

import pandas as pd
from tqdm import tqdm

from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Args:
        path: file path
        max_lines: optional number of lines to read

    Returns:
        loaded data

    """
    data = []

    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found a line without a tab separator'
                             f' at position {i + 1} - SKIP THIS LINE')
                continue

            # Split on the first tab only: the text itself may contain tabs
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # The label is stored as a string number - convert it to an int
            label_id = int(label_name)
            data.append((text, label_id))

            if max_lines is not None and i + 1 >= max_lines:
                break

    return pd.DataFrame(data)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_base'
    args = {
        'cache_dir': f'cache_dir-{model_name}/',
        'output_dir': f'outputs-{model_name}-{seq}/',
        'best_model_dir': f'outputs-{model_name}-{seq}/best_model',
        'max_seq_length': seq,
        'train_batch_size': 25,
        'num_train_epochs': 1,
        'evaluate_during_training': True,
        'save_steps': 5000,
        'evaluate_during_training_steps': 5000,
        'use_cached_eval_features': True,
        'reprocess_input_data': False,
    }

    model = ClassificationModel('xlmroberta', 'xlm-roberta-base', args=args,
                                num_labels=2, use_cuda=True, cuda_device=0)

    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)
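load_train() returns a two-column DataFrame - column 0 holds the text, column 1 the integer label - which is the default input layout that simpletransformers' train_model() accepts for binary classification. A tiny sketch of that shape (the sample rows are made up):

import pandas as pd

# Two hypothetical rows in the layout load_train() produces:
# column 0 = text, column 1 = integer class label (0 or 1).
df = pd.DataFrame([('first example text', 0),
                   ('second example text', 1)])
print(df.shape)  # (2, 2)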
@@ -1,91 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gzip
import logging
import lzma
from typing import Optional

import pandas as pd
from tqdm import tqdm

from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load train/validate data.

    Args:
        path: file path
        max_lines: optional number of lines to read

    Returns:
        loaded data

    """
    data = []

    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found a line without a tab separator'
                             f' at position {i + 1} - SKIP THIS LINE')
                continue

            # Split on the first tab only: the text itself may contain tabs
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()

            # The label is stored as a string number - convert it to an int
            label_id = int(label_name)
            data.append((text, label_id))

            if max_lines is not None and i + 1 >= max_lines:
                break

    return pd.DataFrame(data)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_large'
    args = {
        'cache_dir': f'cache_dir-{model_name}/',
        'output_dir': f'outputs-{model_name}-{seq}/',
        'best_model_dir': f'outputs-{model_name}-{seq}/best_model',
        'max_seq_length': seq,
        'train_batch_size': 10,
        'num_train_epochs': 1,
        'evaluate_during_training': True,
        'save_steps': 5000,
        'evaluate_during_training_steps': 5000,
        'use_cached_eval_features': True,
        'reprocess_input_data': False,
    }

    model = ClassificationModel('xlmroberta', 'xlm-roberta-large', args=args,
                                num_labels=2, use_cuda=True, cuda_device=0)

    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)
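This second training script is identical to the first except for three values: the run name (xmlr_large), the checkpoint (xlm-roberta-large), and a smaller train_batch_size (10 instead of 25, presumably so the larger model fits in GPU memory). A hedged sketch of how both runs could share one parameterized builder (build_model is a hypothetical helper; only the args shown above are carried over):

from simpletransformers.classification import ClassificationModel


def build_model(size: str) -> ClassificationModel:
    # 'base' trains with batch 25; 'large' uses batch 10 to fit in GPU memory
    batch_size = {'base': 25, 'large': 10}[size]
    model_name = f'xmlr_{size}'
    seq = 512
    args = {
        'cache_dir': f'cache_dir-{model_name}/',
        'output_dir': f'outputs-{model_name}-{seq}/',
        'best_model_dir': f'outputs-{model_name}-{seq}/best_model',
        'max_seq_length': seq,
        'train_batch_size': batch_size,
        'num_train_epochs': 1,
        'evaluate_during_training': True,
        'save_steps': 5000,
        'evaluate_during_training_steps': 5000,
        'use_cached_eval_features': True,
        'reprocess_input_data': False,
    }
    return ClassificationModel('xlmroberta', f'xlm-roberta-{size}', args=args,
                               num_labels=2, use_cuda=True, cuda_device=0)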
@@ -1,82 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gzip
import logging
import lzma
import os
from typing import List

import torch
from tqdm import tqdm

from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_test(path: str) -> List[str]:
    data = []

    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for line in tqdm(f):
            line = line.strip()
            data.append(line)

    return data


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_base-512']:
        model_dir = os.path.join(model_name, 'best_model')
        seq_len = 512
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        if 'base' in model_name:
            model_type = 'base'
        elif 'large' in model_name:
            model_type = 'large'
        else:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = {
            'max_seq_length': seq_len,
            'eval_batch_size': 50,
            'reprocess_input_data': True,
            'sliding_window': False,
        }

        model = ClassificationModel('xlmroberta', f'{model_dir}',
                                    num_labels=2, args=args,
                                    use_cuda=True, cuda_device=0)
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'

            class_predictions, raw_outputs = model.predict(test_data)
            softmax_tensor = torch.nn.functional.softmax(torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                for line_id in range(softmax_tensor.size(0)):
                    line_probs = softmax_tensor[line_id]
                    # Write the probability of the second class (class 1, the M class)
                    w.write(f'{line_probs[1].item()}\n')
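A tiny numeric illustration of the softmax step above: raw_outputs holds one pair of logits per input line, softmax over dim=1 turns each pair into probabilities that sum to 1, and column 1 is the value written to the output file (the sample logits are made up):

import torch

raw_outputs = torch.tensor([[2.0, -1.0],   # strongly class 0
                            [0.5, 1.5]])   # leaning class 1
probs = torch.nn.functional.softmax(raw_outputs, dim=1)
print(probs[:, 1])  # tensor([0.0474, 0.7311]) - P(class 1) per line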
@@ -1,82 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gzip
import logging
import lzma
import os
from typing import List

import torch
from tqdm import tqdm

from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_test(path: str) -> List[str]:
    data = []

    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for line in tqdm(f):
            line = line.strip()
            data.append(line)

    return data


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_large-512']:
        model_dir = os.path.join(model_name, 'best_model')
        seq_len = 512
        logger.info(f'Processing {model_name} (for sequence length: {seq_len})')

        if 'base' in model_name:
            model_type = 'base'
        elif 'large' in model_name:
            model_type = 'large'
        else:
            raise ValueError(f'Unknown model type in name {model_name}')

        args = {
            'max_seq_length': seq_len,
            'eval_batch_size': 35,
            'reprocess_input_data': True,
            'sliding_window': False,
        }

        model = ClassificationModel('xlmroberta', f'{model_dir}',
                                    num_labels=2, args=args,
                                    use_cuda=True, cuda_device=0)
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'

        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'

            class_predictions, raw_outputs = model.predict(test_data)
            softmax_tensor = torch.nn.functional.softmax(torch.tensor(raw_outputs), dim=1)

            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                for line_id in range(softmax_tensor.size(0)):
                    line_probs = softmax_tensor[line_id]
                    # Write the probability of the second class (class 1, the M class)
                    w.write(f'{line_probs[1].item()}\n')
6 file diffs suppressed because they are too large.