Compare commits

No commits in common. "xlm_roberta" and "master" have entirely different histories.

11 changed files with 0 additions and 857431 deletions

@@ -1,9 +0,0 @@
#!/usr/bin/env bash
set -e
set -x
# Merge gold labels and input texts into a single TSV per data split.
for dir_name in 'train' 'dev-0' 'dev-1' 'test-A'
do
    paste data/"${dir_name}"/expected.tsv data/"${dir_name}"/in.tsv > data/"${dir_name}"/data.tsv
done
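The paste call joins expected.tsv and in.tsv line by line with a tab, so every line of the merged data.tsv has the form label<TAB>text, which is the layout the load_train() helper in the training scripts below expects. A minimal sanity check for a merged file might look like this (a hypothetical helper, not part of this diff; the path is only an example):

import sys

# Verify that each merged line splits into a numeric label and a text field.
with open('data/dev-0/data.tsv', 'rt') as f:
    for i, line in enumerate(f, start=1):
        fields = line.rstrip('\n').split('\t', maxsplit=1)
        if len(fields) != 2 or not fields[0].isdigit():
            sys.exit(f'Malformed line {i}: {line!r}')
print('data.tsv looks well-formed')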

@@ -1,91 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
from typing import Optional

import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    """Open a plain, gzip-compressed or xz-compressed file transparently."""
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load training/validation data.

    Args:
        path: file path
        max_lines: optional maximum number of lines to read

    Returns:
        loaded data as a two-column DataFrame (text, label)
    """
    data = []
    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found a line without a tab separator'
                             f' at position {i + 1} - skipping it')
                continue
            # Split only on the first tab so tabs inside the text survive.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()
            # Labels are stored as numeric strings; convert them to ints.
            label_id = int(label_name)
            data.append((text, label_id))
            if max_lines is not None and i + 1 >= max_lines:
                break
    return pd.DataFrame(data)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_base'
    args = {
        'cache_dir': f'cache_dir-{model_name}/',
        'output_dir': f'outputs-{model_name}-{seq}/',
        'best_model_dir': f'outputs-{model_name}-{seq}/best_model',
        'max_seq_length': seq,
        'train_batch_size': 25,
        'num_train_epochs': 1,
        'evaluate_during_training': True,
        'save_steps': 5000,
        'evaluate_during_training_steps': 5000,
        'use_cached_eval_features': True,
        'reprocess_input_data': False,
    }
    model = ClassificationModel('xlmroberta', 'xlm-roberta-base', args=args,
                                num_labels=2, use_cuda=True, cuda_device=0)
    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)
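With evaluate_during_training enabled, simpletransformers keeps the best-scoring checkpoint seen during evaluation in best_model_dir. After training finishes, that checkpoint can be re-scored on the dev set with eval_model; a minimal sketch, assuming the load_train() helper and the directory layout configured above:

# Sketch: re-evaluate the saved best checkpoint on dev-0.
best_dir = f'outputs-{model_name}-{seq}/best_model'
best_model = ClassificationModel('xlmroberta', best_dir, num_labels=2,
                                 use_cuda=True, cuda_device=0)
# eval_model returns (metrics dict, raw model outputs, wrong predictions).
result, model_outputs, wrong_predictions = best_model.eval_model(eval_df)
logger.info(f'EVAL RESULT: {result}')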

@@ -1,91 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
from typing import Optional

import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    """Open a plain, gzip-compressed or xz-compressed file transparently."""
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_train(path: str, max_lines: Optional[int] = None) -> pd.DataFrame:
    """
    Load training/validation data.

    Args:
        path: file path
        max_lines: optional maximum number of lines to read

    Returns:
        loaded data as a two-column DataFrame (text, label)
    """
    data = []
    logger.info(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if '\t' not in line:
                logger.error(f'Found a line without a tab separator'
                             f' at position {i + 1} - skipping it')
                continue
            # Split only on the first tab so tabs inside the text survive.
            label_name, text = line.split('\t', maxsplit=1)
            text = text.strip()
            # Labels are stored as numeric strings; convert them to ints.
            label_id = int(label_name)
            data.append((text, label_id))
            if max_lines is not None and i + 1 >= max_lines:
                break
    return pd.DataFrame(data)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    max_lines = None
    train_df = load_train('data/train/train.tsv', max_lines=max_lines)
    eval_df = load_train('data/dev-0/data.tsv')

    seq = 512
    model_name = 'xmlr_large'
    args = {
        'cache_dir': f'cache_dir-{model_name}/',
        'output_dir': f'outputs-{model_name}-{seq}/',
        'best_model_dir': f'outputs-{model_name}-{seq}/best_model',
        'max_seq_length': seq,
        'train_batch_size': 10,
        'num_train_epochs': 1,
        'evaluate_during_training': True,
        'save_steps': 5000,
        'evaluate_during_training_steps': 5000,
        'use_cached_eval_features': True,
        'reprocess_input_data': False,
    }
    model = ClassificationModel('xlmroberta', 'xlm-roberta-large', args=args,
                                num_labels=2, use_cuda=True, cuda_device=0)
    logger.info(f'START TRAINING | ARGS: {model.args}')
    model.train_model(train_df, eval_df=eval_df)

@@ -1,82 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
import os
from typing import List

import torch
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    """Open a plain, gzip-compressed or xz-compressed file transparently."""
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_test(path: str) -> List[str]:
    """Load test texts, one document per line."""
    data = []
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for line in tqdm(f):
            data.append(line.strip())
    return data


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_base-512']:
        model_dir = os.path.join(model_name, 'best_model')
        seq_len = 512
        logger.info(f'Processing {model_name} (sequence length: {seq_len})')
        if 'base' in model_name:
            model_type = 'base'
        elif 'large' in model_name:
            model_type = 'large'
        else:
            raise ValueError(f'Unknown model type in name {model_name}')
        args = {
            'max_seq_length': seq_len,
            'eval_batch_size': 50,
            'reprocess_input_data': True,
            'sliding_window': False,
        }
        model = ClassificationModel('xlmroberta', model_dir,
                                    num_labels=2, args=args,
                                    use_cuda=True, cuda_device=0)
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'
        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'
            class_predictions, raw_outputs = model.predict(test_data)
            # Turn the raw logits into class probabilities.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)
            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                for line_id in range(softmax_tensor.size(0)):
                    line_probs = softmax_tensor[line_id]
                    # Write the probability of the second class (the 'M' class).
                    w.write(f'{line_probs[1].item()}\n')

@@ -1,82 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gzip
import logging
import lzma
import os
from typing import List

import torch
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel

logger = logging.getLogger(__name__)


def open_file(path, *args):
    """Open a plain, gzip-compressed or xz-compressed file transparently."""
    if path.endswith('gz'):
        fopen = gzip.open
    elif path.endswith('xz'):
        fopen = lzma.open
    else:
        fopen = open
    return fopen(path, *args)


def load_test(path: str) -> List[str]:
    """Load test texts, one document per line."""
    data = []
    logger.debug(f'Loading {path}')
    with open_file(path, 'rt') as f:
        for line in tqdm(f):
            data.append(line.strip())
    return data


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    for model_name in ['outputs-xmlr_large-512']:
        model_dir = os.path.join(model_name, 'best_model')
        seq_len = 512
        logger.info(f'Processing {model_name} (sequence length: {seq_len})')
        if 'base' in model_name:
            model_type = 'base'
        elif 'large' in model_name:
            model_type = 'large'
        else:
            raise ValueError(f'Unknown model type in name {model_name}')
        args = {
            'max_seq_length': seq_len,
            'eval_batch_size': 35,
            'reprocess_input_data': True,
            'sliding_window': False,
        }
        model = ClassificationModel('xlmroberta', model_dir,
                                    num_labels=2, args=args,
                                    use_cuda=True, cuda_device=0)
        output_name = f'model=xlmr_{model_type}-seq_len={seq_len}'
        for test_name in ['dev-0', 'dev-1', 'test-A']:
            logger.info(f'Processing {test_name}')
            test_data = load_test(f'data/{test_name}/in.tsv')
            save_path = f'data/{test_name}/out-{output_name}.tsv'
            class_predictions, raw_outputs = model.predict(test_data)
            # Turn the raw logits into class probabilities.
            softmax_tensor = torch.nn.functional.softmax(
                torch.tensor(raw_outputs), dim=1)
            logger.info(f'Saving predictions into {save_path}')
            with open_file(save_path, 'wt') as w:
                for line_id in range(softmax_tensor.size(0)):
                    line_probs = softmax_tensor[line_id]
                    # Write the probability of the second class (the 'M' class).
                    w.write(f'{line_probs[1].item()}\n')
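The out-*.tsv files hold one probability per input line (for the second class), so the dev splits, whose gold labels live in expected.tsv, can be scored directly. A sketch using ROC AUC as an illustrative metric (the challenge's own metric is not visible in this diff):

from sklearn.metrics import roc_auc_score

with open('data/dev-0/expected.tsv', 'rt') as f:
    y_true = [int(line.strip()) for line in f]
with open('data/dev-0/out-model=xlmr_large-seq_len=512.tsv', 'rt') as f:
    y_score = [float(line.strip()) for line in f]
print('dev-0 ROC AUC:', roc_auc_score(y_true, y_score))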

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large