msc-smolak/bibliography.bib



% Paper on the Gonito.net platform, from the 4REAL Workshop proceedings (2016).
% Typed as a proceedings paper: the container is a "Proceedings of ..." volume
% and no publisher is recorded, which incollection would require.
@inproceedings{gonito2016,
  author    = {Grali{\'n}ski, Filip and Jaworski, Rafa{\l} and Borchmann, {\L}ukasz and Wierzcho{\'n}, Piotr},
  title     = {Gonito.net -- Open Platform for Research Competition, Cooperation and Reproducibility},
  editor    = {Branco, Ant{\'o}nio and Calzolari, Nicoletta and Choukri, Khalid},
  booktitle = {Proceedings of the 4REAL Workshop: Workshop on Research Results Reproducibility and Resources Citation in Science and Technology of Language},
  year      = {2016},
  pages     = {13--20},
}

% CoNLL 2019 paper on error analysis of NER models (ACL Anthology record).
@inproceedings{stanislawek-etal-2019-named,
  author    = {Stanislawek, Tomasz and Wr{\'o}blewska, Anna and W{\'o}jcicka, Alicja and Ziembicki, Daniel and Biecek, Przemyslaw},
  title     = {Named Entity Recognition - Is There a Glass Ceiling?},
  booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong, China},
  month     = nov,
  year      = {2019},
  pages     = {624--633},
  doi       = {10.18653/v1/K19-1058},
  url       = {https://www.aclweb.org/anthology/K19-1058},
  abstract  = {Recent developments in Named Entity Recognition (NER) have resulted in better and better models. However, is there a glass ceiling? Do we know which types of errors are still hard or even impossible to correct? In this paper, we present a detailed analysis of the types of errors in state-of-the-art machine learning (ML) methods. Our study illustrates weak and strong points of the Stanford, CMU, FLAIR, ELMO and BERT models, as well as their shared limitations. We also introduce new techniques for improving annotation, training process, and for checking model quality and stability.},
}

% arXiv preprint (cs.CL). Author names use LaTeX escapes and "Last, First"
% form so they sort and render correctly under 8-bit BibTeX as well.
@misc{borchmann2019searching,
  author        = {Borchmann, {\L}ukasz and Wi{\'s}niewski, Dawid and Gretkowski, Andrzej and Kosmala, Izabela and Jurkiewicz, Dawid and Sza{\l}kiewicz, {\L}ukasz and Pa{\l}ka, Gabriela and Kaczmarek, Karol and Kaliska, Agnieszka and Grali{\'n}ski, Filip},
  title         = {Searching for Legal Clauses by Analogy. Few-shot Semantic Retrieval Shared Task},
  year          = {2019},
  eprint        = {1911.03911},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% BlackboxNLP 2019 workshop paper on the GEval tool (ACL Anthology record).
@inproceedings{gralinski-etal-2019-geval,
  author    = {Grali{\'n}ski, Filip and Wr{\'o}blewska, Anna and Stanis{\l}awek, Tomasz and Grabowski, Kamil and G{\'o}recki, Tomasz},
  title     = {{GE}val: Tool for Debugging {NLP} Datasets and Models},
  booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = aug,
  year      = {2019},
  pages     = {254--262},
  url       = {https://www.aclweb.org/anthology/W19-4826},
  abstract  = {This paper presents a simple but general and effective method to debug the output of machine learning (ML) supervised models, including neural networks. The algorithm looks for features that lower the evaluation metric in such a way that it cannot be ascribed to chance (as measured by their p-values). Using this method {--} implemented as MLEval tool {--} you can find: (1) anomalies in test sets, (2) issues in preprocessing, (3) problems in the ML model itself. It can give you an insight into what can be improved in the datasets and/or the model. The same method can be used to compare ML models or different versions of the same model. We present the tool, the theory behind it and use cases for text-based models of various types.},
}

% PolEval 2018 workshop paper on nested named entity recognition.
% Names are LaTeX-escaped, the acronym in the title is brace-protected, and
% the page range uses an en dash.
@inproceedings{Borchmann2018,
  author     = {Borchmann, {\L}ukasz and Gretkowski, Andrzej and Grali{\'n}ski, Filip},
  title      = {Approaching Nested Named Entity Recognition with Parallel {LSTM-CRFs}},
  editor     = {Ogrodniczuk, Maciej and Kobyli{\'n}ski, {\L}ukasz},
  booktitle  = {Proceedings of the {PolEval} 2018 Workshop},
  publisher  = {Institute of Computer Science, Polish Academy of Sciences},
  address    = {Warszawa},
  year       = {2018},
  date       = {2018-10-19},
  pages      = {63--73},
  url        = {http://www.borchmann.pl/wp-content/uploads/2018/10/borchmann-lukasz.pdf},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}


% dblp record of an arXiv preprint (CoRR). Only the case-sensitive words in
% the title are brace-protected, so the bibliography style can apply its own
% title casing.
@article{DBLP:journals/corr/HewlettLJPFHKB16,
  author        = {Daniel Hewlett and Alexandre Lacoste and Llion Jones and Illia Polosukhin and Andrew Fandrianto and Jay Han and Matthew Kelcey and David Berthelot},
  title         = {{WikiReading}: {A} Novel Large-scale Language Understanding Task over {Wikipedia}},
  journal       = {CoRR},
  volume        = {abs/1608.03542},
  year          = {2016},
  url           = {http://arxiv.org/abs/1608.03542},
  archivePrefix = {arXiv},
  eprint        = {1608.03542},
  timestamp     = {Mon, 13 Aug 2018 16:46:41 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/HewlettLJPFHKB16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% AAAI-18 conference paper. The booktitle previously read "of the The".
@inproceedings{byte-level2018kenter,
  author    = {Kenter, Tom and Jones, Llion and Hewlett, Daniel},
  title     = {Byte-level Machine Reading across Morphologically Varied Languages},
  booktitle = {Proceedings of the Thirty-Second {AAAI} Conference on Artificial Intelligence ({AAAI-18})},
  year      = {2018},
}

% dblp record of an arXiv preprint (CoRR). The title is no longer wrapped in
% an extra brace pair, so the bibliography style controls its casing.
@article{DBLP:journals/corr/SutskeverVL14,
  author        = {Ilya Sutskever and Oriol Vinyals and Quoc V. Le},
  title         = {Sequence to Sequence Learning with Neural Networks},
  journal       = {CoRR},
  volume        = {abs/1409.3215},
  year          = {2014},
  url           = {http://arxiv.org/abs/1409.3215},
  archivePrefix = {arXiv},
  eprint        = {1409.3215},
  timestamp     = {Mon, 13 Aug 2018 16:48:06 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/SutskeverVL14},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% dblp record of an arXiv preprint (CoRR). The title is no longer wrapped in
% an extra brace pair, so the bibliography style controls its casing.
@article{DBLP:journals/corr/abs-1811-04284,
  author        = {Hainan Xu and Shuoyang Ding and Shinji Watanabe},
  title         = {Improving End-to-end Speech Recognition with Pronunciation-assisted Sub-word Modeling},
  journal       = {CoRR},
  volume        = {abs/1811.04284},
  year          = {2018},
  url           = {http://arxiv.org/abs/1811.04284},
  archivePrefix = {arXiv},
  eprint        = {1811.04284},
  timestamp     = {Fri, 23 Nov 2018 12:43:51 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1811-04284},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% ALTA 2018 workshop paper (ACL Anthology record). The percent signs in the
% abstract were exported with a spurious \textbackslash before each one,
% which would print a literal backslash; they are now plain escaped percents.
@inproceedings{holt-chisholm-2018-extracting,
  author    = {Holt, Xavier and Chisholm, Andrew},
  title     = {Extracting structured data from invoices},
  booktitle = {Proceedings of the Australasian Language Technology Association Workshop 2018},
  address   = {Dunedin, New Zealand},
  month     = dec,
  year      = {2018},
  pages     = {53--59},
  url       = {https://www.aclweb.org/anthology/U18-1006},
  abstract  = {Business documents encode a wealth of information in a format tailored to human consumption {--} i.e. aesthetically disbursed natural language text, graphics and tables. We address the task of extracting key fields (e.g. the amount due on an invoice) from a wide-variety of potentially unseen document formats. In contrast to traditional template driven extraction systems, we introduce a content-driven machine-learning approach which is both robust to noise and generalises to unseen document formats. In a comparison of our approach with alternative invoice extraction systems, we observe an absolute accuracy gain of 20{\%} across compared fields, and a 25{\%}{--}94{\%} reduction in extraction latency.},
}

% dblp record of an arXiv preprint (CoRR). Only the model names in the title
% are brace-protected; the rest is left to the bibliography style's casing.
@article{DBLP:journals/corr/abs-1907-11692,
  author        = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
  title         = {{RoBERTa}: {A} Robustly Optimized {BERT} Pretraining Approach},
  journal       = {CoRR},
  volume        = {abs/1907.11692},
  year          = {2019},
  url           = {http://arxiv.org/abs/1907.11692},
  archivePrefix = {arXiv},
  eprint        = {1907.11692},
  timestamp     = {Thu, 01 Aug 2019 08:59:33 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1907-11692},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}


% arXiv preprint 1806.04342. The acronym RNNs was exported in lowercase; it
% is restored and brace-protected so title casing cannot lowercase it again.
@article{ke2018focused,
  author  = {Ke, Nan Rosemary and Zolna, Konrad and Sordoni, Alessandro and Lin, Zhouhan and Trischler, Adam and Bengio, Yoshua and Pineau, Joelle and Charlin, Laurent and Pal, Chris},
  title   = {Focused Hierarchical {RNNs} for Conditional Sequence Processing},
  journal = {arXiv preprint arXiv:1806.04342},
  year    = {2018},
}


% dblp record of an arXiv preprint (CoRR). Only the model name in the title
% is brace-protected; the rest is left to the bibliography style's casing.
@article{DBLP:journals/corr/abs-1901-02860,
  author        = {Zihang Dai and Zhilin Yang and Yiming Yang and Jaime G. Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
  title         = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
  journal       = {CoRR},
  volume        = {abs/1901.02860},
  year          = {2019},
  url           = {http://arxiv.org/abs/1901.02860},
  archivePrefix = {arXiv},
  eprint        = {1901.02860},
  timestamp     = {Fri, 01 Feb 2019 13:39:59 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1901-02860},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% dblp record of an arXiv preprint (CoRR). Only the model name in the title
% is brace-protected; the rest is left to the bibliography style's casing.
@article{DBLP:journals/corr/abs-1906-08237,
  author        = {Zhilin Yang and Zihang Dai and Yiming Yang and Jaime G. Carbonell and Ruslan Salakhutdinov and Quoc V. Le},
  title         = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  journal       = {CoRR},
  volume        = {abs/1906.08237},
  year          = {2019},
  url           = {http://arxiv.org/abs/1906.08237},
  archivePrefix = {arXiv},
  eprint        = {1906.08237},
  timestamp     = {Mon, 24 Jun 2019 17:28:45 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1906-08237},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% dblp record of an arXiv preprint (CoRR). Only the product name and the
% acronym in the title are brace-protected; the rest follows style casing.
@article{DBLP:journals/corr/abs-1907-06170,
  author        = {Marcin Junczys{-}Dowmunt},
  title         = {{Microsoft Translator} at {WMT} 2019: Towards Large-Scale Document-Level Neural Machine Translation},
  journal       = {CoRR},
  volume        = {abs/1907.06170},
  year          = {2019},
  url           = {http://arxiv.org/abs/1907.06170},
  archivePrefix = {arXiv},
  eprint        = {1907.06170},
  timestamp     = {Wed, 17 Jul 2019 10:27:36 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1907-06170},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% COLING 2018 conference paper.
@inproceedings{akbik2018coling,
  author    = {Akbik, Alan and Blythe, Duncan and Vollgraf, Roland},
  title     = {Contextual String Embeddings for Sequence Labeling},
  booktitle = {{COLING} 2018, 27th International Conference on Computational Linguistics},
  year      = {2018},
  pages     = {1638--1649},
}


% arXiv preprint 1909.00164.
@article{luo2019named,
  author  = {Luo, Ying and Zhao, Hai and Zhan, Junlang},
  title   = {Named Entity Recognition Only from Word Embeddings},
  journal = {arXiv preprint arXiv:1909.00164},
  year    = {2019},
}

% Journal article in Transactions of the Association for Computational
% Linguistics, volume 6.
@article{tu2018learning,
  author    = {Tu, Zhaopeng and Liu, Yang and Shi, Shuming and Zhang, Tong},
  title     = {Learning to remember translation history with a continuous cache},
  journal   = {Transactions of the Association for Computational Linguistics},
  publisher = {MIT Press},
  volume    = {6},
  year      = {2018},
  pages     = {407--420},
}

% arXiv preprint 1809.01576.
@article{miculicich2018document,
  author  = {Miculicich, Lesly and Ram, Dhananjay and Pappas, Nikolaos and Henderson, James},
  title   = {Document-level neural machine translation with hierarchical attention networks},
  journal = {arXiv preprint arXiv:1809.01576},
  year    = {2018},
}

% dblp record of an arXiv preprint (CoRR). The title is no longer wrapped in
% an extra brace pair, so the bibliography style controls its casing.
@article{DBLP:journals/corr/abs-1907-05242,
  author        = {Guillaume Lample and Alexandre Sablayrolles and Marc'Aurelio Ranzato and Ludovic Denoyer and Herv{\'{e}} J{\'{e}}gou},
  title         = {Large Memory Layers with Product Keys},
  journal       = {CoRR},
  volume        = {abs/1907.05242},
  year          = {2019},
  url           = {http://arxiv.org/abs/1907.05242},
  archivePrefix = {arXiv},
  eprint        = {1907.05242},
  timestamp     = {Wed, 17 Jul 2019 10:27:36 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1907-05242},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}


% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1907-01686,
  title = {Machine Reading Comprehension: a Literature Review},
  author = {Xin Zhang and An Yang and Sujian Li and Yizhong Wang},
  year = {2019},
  journal = {CoRR},
  volume = {abs/1907.01686},
  eprint = {1907.01686},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1907.01686},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1907-01686},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 08 Jul 2019 14:12:33 +0200},
}

% dblp record of an arXiv preprint (CoRR). Only the model name in the title
% is brace-protected; the rest is left to the bibliography style's casing.
@article{DBLP:journals/corr/abs-1810-04805,
  author        = {Jacob Devlin and Ming{-}Wei Chang and Kenton Lee and Kristina Toutanova},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  journal       = {CoRR},
  volume        = {abs/1810.04805},
  year          = {2018},
  url           = {http://arxiv.org/abs/1810.04805},
  archivePrefix = {arXiv},
  eprint        = {1810.04805},
  timestamp     = {Tue, 30 Oct 2018 20:39:56 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1810-04805},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}


% dblp record of an arXiv preprint (CoRR). "Chargrid" and "2D" are
% brace-protected so sentence-casing styles do not turn "2D" into "2d".
@article{DBLP:journals/corr/abs-1809-08799,
  author        = {Anoop R. Katti and Christian Reisswig and Cordula Guder and Sebastian Brarda and Steffen Bickel and Johannes H{\"{o}}hne and Jean Baptiste Faddoul},
  title         = {{Chargrid}: Towards Understanding {2D} Documents},
  journal       = {CoRR},
  volume        = {abs/1809.08799},
  year          = {2018},
  url           = {http://arxiv.org/abs/1809.08799},
  archivePrefix = {arXiv},
  eprint        = {1809.08799},
  timestamp     = {Fri, 05 Oct 2018 11:34:52 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1809-08799},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/HamiltonYL17,
  title = {Inductive Representation Learning on Large Graphs},
  author = {William L. Hamilton and Rex Ying and Jure Leskovec},
  year = {2017},
  journal = {CoRR},
  volume = {abs/1706.02216},
  eprint = {1706.02216},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1706.02216},
  biburl = {https://dblp.org/rec/bib/journals/corr/HamiltonYL17},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Aug 2018 16:46:12 +0200},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1810-00826,
  title = {How Powerful are Graph Neural Networks?},
  author = {Keyulu Xu and Weihua Hu and Jure Leskovec and Stefanie Jegelka},
  year = {2018},
  journal = {CoRR},
  volume = {abs/1810.00826},
  eprint = {1810.00826},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1810.00826},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1810-00826},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Tue, 30 Oct 2018 10:49:09 +0100},
}

% Conference paper from a 2008 symposium on database engineering and
% applications. The conference name is capitalised as a proper name and ACM
% is recorded as the publisher rather than as a sponsoring organization.
@inproceedings{matuschek2008measuring,
  author    = {Matuschek, Michael and Schl{\"u}ter, Tim and Conrad, Stefan},
  title     = {Measuring Text Similarity with Dynamic Time Warping},
  booktitle = {Proceedings of the 2008 International Symposium on Database Engineering \& Applications},
  publisher = {ACM},
  year      = {2008},
  pages     = {263--267},
}

% arXiv preprint 1409.1556.
@article{simonyan2014very,
  author  = {Simonyan, Karen and Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  journal = {arXiv preprint arXiv:1409.1556},
  year    = {2014},
}

% Web resource, not a journal article: the export had placed the URL (with
% stray spaces) in the journal field and attached volume/page numbers that
% do not apply to a web page. The address now lives in the url field.
% NOTE(review): styles that ignore the url field will no longer print the
% address; switch to howpublished if the thesis style needs it.
@misc{lecun2015lenet,
  author = {LeCun, Yann and others},
  title  = {{LeNet-5}, Convolutional Neural Networks},
  year   = {2015},
  url    = {http://yann.lecun.com/exdb/lenet},
}

% Journal of Machine Learning Research article, volume 13.
@article{bergstra2012random,
  author  = {Bergstra, James and Bengio, Yoshua},
  title   = {Random Search for Hyper-Parameter Optimization},
  journal = {Journal of Machine Learning Research},
  volume  = {13},
  year    = {2012},
  pages   = {281--305},
}

% Chapter in the Springer collection "Neural Networks: Tricks of the Trade".
% The proper noun Boltzmann is brace-protected against title lowercasing.
@incollection{hinton2012practical,
  author    = {Hinton, Geoffrey E.},
  title     = {A Practical Guide to Training Restricted {Boltzmann} Machines},
  booktitle = {Neural Networks: Tricks of the Trade},
  publisher = {Springer},
  year      = {2012},
  pages     = {599--619},
}

% Book. Publisher name capitalised and middle initial punctuated.
@book{hedges2014statistical,
  author    = {Hedges, Larry V. and Olkin, Ingram},
  title     = {Statistical Methods for Meta-Analysis},
  publisher = {Academic Press},
  year      = {2014},
}

% EMNLP 2004 conference paper. The conference name is capitalised as a
% proper name (booktitle is not recased by standard styles).
@inproceedings{koehn2004statistical,
  author    = {Koehn, Philipp},
  title     = {Statistical Significance Tests for Machine Translation Evaluation},
  booktitle = {Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing},
  year      = {2004},
  pages     = {388--395},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1904-01685,
  title = {Measuring Calibration in Deep Learning},
  author = {Jeremy Nixon and Mike Dusenberry and Linchuan Zhang and Ghassen Jerfel and Dustin Tran},
  year = {2019},
  journal = {CoRR},
  volume = {abs/1904.01685},
  eprint = {1904.01685},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1904.01685},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1904-01685},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Wed, 24 Apr 2019 12:21:25 +0200},
}

% Monograph-length article in the Synthesis Lectures series, volume 6(1).
@article{settles2012active,
  author    = {Settles, Burr},
  title     = {Active learning},
  journal   = {Synthesis Lectures on Artificial Intelligence and Machine Learning},
  publisher = {Morgan \& Claypool Publishers},
  volume    = {6},
  number    = {1},
  year      = {2012},
  pages     = {1--114},
}

% Paper in Advances in Neural Information Processing Systems 30.
% The first author's middle initial now carries its period.
@incollection{NIPS2017_7062,
  author    = {Lundberg, Scott M. and Lee, Su-In},
  title     = {A Unified Approach to Interpreting Model Predictions},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor    = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  publisher = {Curran Associates, Inc.},
  year      = {2017},
  pages     = {4765--4774},
  url       = {http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf},
}

% Journal article, Statistics in Medicine 33(3). Journal name capitalised as
% a proper name; author initials punctuated.
@article{austin2014graphical,
  author    = {Austin, Peter C. and Steyerberg, Ewout W.},
  title     = {Graphical assessment of internal and external calibration of logistic regression models by using loess smoothers},
  journal   = {Statistics in Medicine},
  publisher = {Wiley Online Library},
  volume    = {33},
  number    = {3},
  year      = {2014},
  pages     = {517--535},
}

% arXiv preprint 1909.11942. Model acronyms are brace-protected so that
% sentence-casing styles do not lowercase them.
@article{lan2019albert,
  author  = {Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
  title   = {{ALBERT}: A Lite {BERT} for Self-supervised Learning of Language Representations},
  journal = {arXiv preprint arXiv:1909.11942},
  year    = {2019},
}

% arXiv preprint 1909.10351. Model names are brace-protected so that
% sentence-casing styles do not lowercase them.
@article{jiao2019tinybert,
  author  = {Jiao, Xiaoqi and Yin, Yichun and Shang, Lifeng and Jiang, Xin and Chen, Xiao and Li, Linlin and Wang, Fang and Liu, Qun},
  title   = {{TinyBERT}: Distilling {BERT} for Natural Language Understanding},
  journal = {arXiv preprint arXiv:1909.10351},
  year    = {2019},
}

% arXiv preprint 1906.02568.
@article{wiewel2019localizing,
  author  = {Wiewel, Felix and Yang, Bin},
  title   = {Localizing Catastrophic Forgetting in Neural Networks},
  journal = {arXiv preprint arXiv:1906.02568},
  year    = {2019},
}

% arXiv preprint 1503.02531.
@article{hinton2015distilling,
  author  = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  title   = {Distilling the knowledge in a neural network},
  journal = {arXiv preprint arXiv:1503.02531},
  year    = {2015},
}

% Journal of Machine Learning Research article, volume 18(1). The journal
% name now matches the spelling used by the other JMLR entry in this file,
% and the stray space in the publisher name is removed.
@article{hubara2017quantized,
  author    = {Hubara, Itay and Courbariaux, Matthieu and Soudry, Daniel and El-Yaniv, Ran and Bengio, Yoshua},
  title     = {Quantized Neural Networks: Training Neural Networks with Low Precision Weights and Activations},
  journal   = {Journal of Machine Learning Research},
  publisher = {JMLR.org},
  volume    = {18},
  number    = {1},
  year      = {2017},
  pages     = {6869--6898},
}

% arXiv preprint 1908.06039.
@article{bao2019few,
  author  = {Bao, Yujia and Wu, Menghua and Chang, Shiyu and Barzilay, Regina},
  title   = {Few-shot Text Classification with Distributional Signatures},
  journal = {arXiv preprint arXiv:1908.06039},
  year    = {2019},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/NarayanGCS17,
  title = {Split and Rephrase},
  author = {Shashi Narayan and Claire Gardent and Shay B. Cohen and Anastasia Shimorina},
  year = {2017},
  journal = {CoRR},
  volume = {abs/1707.06971},
  eprint = {1707.06971},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1707.06971},
  biburl = {https://dblp.org/rec/bib/journals/corr/NarayanGCS17},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Aug 2018 16:48:49 +0200},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1906-01038,
  title = {Transforming Complex Sentences into a Semantic Hierarchy},
  author = {Christina Niklaus and Matthias Cetto and Andr{\'{e}} Freitas and Siegfried Handschuh},
  year = {2019},
  journal = {CoRR},
  volume = {abs/1906.01038},
  eprint = {1906.01038},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1906.01038},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1906-01038},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Thu, 13 Jun 2019 13:36:00 +0200},
}

% NIPS paper (2005). The proceedings title is capitalised to match the other
% "Advances in Neural Information Processing Systems" entries in this file,
% and author initials are punctuated.
@inproceedings{goldberger2005neighbourhood,
  author    = {Goldberger, Jacob and Hinton, Geoffrey E. and Roweis, Sam T. and Salakhutdinov, Ruslan R.},
  title     = {Neighbourhood Components Analysis},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2005},
  pages     = {513--520},
}

% Journal article, Neural Networks 13(4-5). Journal name capitalised as a
% proper name; the issue range uses an en dash like the page range.
@article{hyvarinen2000independent,
  author    = {Hyv{\"a}rinen, Aapo and Oja, Erkki},
  title     = {Independent Component Analysis: Algorithms and Applications},
  journal   = {Neural Networks},
  publisher = {Elsevier},
  volume    = {13},
  number    = {4--5},
  year      = {2000},
  pages     = {411--430},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1804-00079,
  title = {Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning},
  author = {Sandeep Subramanian and Adam Trischler and Yoshua Bengio and Christopher J. Pal},
  year = {2018},
  journal = {CoRR},
  volume = {abs/1804.00079},
  eprint = {1804.00079},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1804.00079},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1804-00079},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Aug 2018 16:47:55 +0200},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/ConneauKSBB17,
  title = {Supervised Learning of Universal Sentence Representations from Natural Language Inference Data},
  author = {Alexis Conneau and Douwe Kiela and Holger Schwenk and Lo{\"{\i}}c Barrault and Antoine Bordes},
  year = {2017},
  journal = {CoRR},
  volume = {abs/1705.02364},
  eprint = {1705.02364},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1705.02364},
  biburl = {https://dblp.org/rec/bib/journals/corr/ConneauKSBB17},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Aug 2018 16:48:46 +0200},
}


% Whole-proceedings record for ICAIL '19 (ACM). The acronym is
% brace-protected so sentence-casing styles do not render it as "Icail".
% NOTE(review): the key field holds an empty-looking math expression,
% presumably to suppress the label that styles derive for entries without
% an author or editor; confirm before changing it.
@proceedings{2019:3322640,
  title     = {{ICAIL} '19: Proceedings of the Seventeenth International Conference on Artificial Intelligence and Law},
  year      = {2019},
  isbn      = {978-1-4503-6754-7},
  location  = {Montreal, QC, Canada},
  publisher = {ACM},
  address   = {New York, NY, USA},
  key       = {{$\!\!$}},
}


% Journal article, Computer Speech and Language 15(4). Page range now uses
% an en dash. The OPTurl field is an inactive (ignored) field and is kept
% as found.
@article{GOODMAN2001403,
  author  = {Joshua T. Goodman},
  title   = {A bit of progress in language modeling},
  journal = {Computer Speech \& Language},
  volume  = {15},
  number  = {4},
  pages   = {403--434},
  year    = {2001},
  issn    = {0885-2308},
  doi     = {10.1006/csla.2001.0174},
  OPTurl  = {http://www.sciencedirect.com/science/article/pii/S0885230801901743},
}

% dblp record of an arXiv preprint (CoRR), version 1.
@article{DBLP:journals/corr/cs-CL-9905001,
  title = {Supervised Grammar Induction Using Training Data with Limited Constituent Information},
  author = {Rebecca Hwa},
  year = {1999},
  journal = {CoRR},
  volume = {cs.CL/9905001},
  note = {Version 1},
  url = {http://arxiv.org/abs/cs.CL/9905001},
  biburl = {https://dblp.org/rec/bib/journals/corr/cs-CL-9905001},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Wed, 07 Jun 2017 14:41:01 +0200},
}

% Textbook, second edition.
@book{Jurafsky+Martin:2009a,
  title = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
  author = {Jurafsky, Daniel and Martin, James H.},
  edition = {Second},
  publisher = {Pearson Prentice Hall},
  year = {2009},
}

% JURIX 2008 conference paper.
@inproceedings{Maxwell2008ConceptAC,
  author    = {K. Tamsin Maxwell and Burkhard Schafer},
  title     = {Concept and Context in Legal Information Retrieval},
  booktitle = {JURIX},
  year      = {2008},
}

% Preprint hosted on arXiv (see url).
@misc{41224,
  author = {Tomas Mikolov and Kai Chen and Greg S. Corrado and Jeffrey Dean},
  title  = {Efficient Estimation of Word Representations in Vector Space},
  year   = {2013},
  url    = {http://arxiv.org/abs/1301.3781},
}

% arXiv preprint (cs.CL).
@misc{wieting2019training,
  author        = {John Wieting and Douwe Kiela},
  title         = {No Training Required: Exploring Random Encoders for Sentence Classification},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1901.10444},
  primaryClass  = {cs.CL},
}

% COLING 2018 paper on CNN-based sentence embeddings (ACL Anthology record).
@inproceedings{jiao-etal-2018-convolutional,
  author    = {Jiao, Xiaoqi and Wang, Fang and Feng, Dan},
  title     = {Convolutional Neural Network for Universal Sentence Embeddings},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Santa Fe, New Mexico, USA},
  month     = aug,
  year      = {2018},
  pages     = {2470--2481},
  url       = {https://www.aclweb.org/anthology/C18-1209},
  abstract  = {This paper proposes a simple CNN model for creating general-purpose sentence embeddings that can transfer easily across domains and can also act as effective initialization for downstream tasks. Recently, averaging the embeddings of words in a sentence has proven to be a surprisingly successful and efficient way of obtaining sentence embeddings. However, these models represent a sentence, only in terms of features of words or uni-grams in it. In contrast, our model (CSE) utilizes both features of words and n-grams to encode sentences, which is actually a generalization of these bag-of-words models. The extensive experiments demonstrate that CSE performs better than average models in transfer learning setting and exceeds the state of the art in supervised learning setting by initializing the parameters with the pre-trained sentence embeddings.},
}

% arXiv preprint (cs.CL).
@misc{zhang2018learning,
  author        = {Minghua Zhang and Yunfang Wu and Weikang Li and Wei Li},
  title         = {Learning Universal Sentence Representations with Mean-Max Attention Autoencoder},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1809.06590},
  primaryClass  = {cs.CL},
}

% arXiv preprint (cs.CL). The fourth author's given name now carries its
% diaeresis, matching the spelling of the same author elsewhere in this file.
% NOTE(review): arXiv 1705.02364 is also recorded in this file under a dblp
% key; consider citing only one of the two.
@misc{conneau2017supervised,
  author        = {Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Lo{\"{\i}}c and Bordes, Antoine},
  title         = {Supervised Learning of Universal Sentence Representations from Natural Language Inference Data},
  year          = {2017},
  eprint        = {1705.02364},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Second definition of DBLP:journals/corr/abs-1804-00079 removed here: an
% identical entry with the same citation key appears earlier in this file,
% and defining a key twice makes BibTeX report a "repeated entry" error.

% Paper in Advances in Neural Information Processing Systems 28.
% The third author's middle initial now carries its period.
@incollection{NIPS2015_5950,
  author    = {Kiros, Ryan and Zhu, Yukun and Salakhutdinov, Ruslan R. and Zemel, Richard and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
  title     = {Skip-Thought Vectors},
  booktitle = {Advances in Neural Information Processing Systems 28},
  editor    = {C. Cortes and N. D. Lawrence and D. D. Lee and M. Sugiyama and R. Garnett},
  publisher = {Curran Associates, Inc.},
  year      = {2015},
  pages     = {3294--3302},
  url       = {http://papers.nips.cc/paper/5950-skip-thought-vectors.pdf},
}

% arXiv preprint (cs.CL). Only the acronym in the title is brace-protected,
% so the bibliography style controls the casing of the remaining words.
@misc{ionescu2019vector,
  author        = {Ionescu, Radu Tudor and Butnaru, Andrei M.},
  title         = {Vector of Locally-Aggregated Word Embeddings ({VLAWE}): A Novel Document-level Representation},
  year          = {2019},
  eprint        = {1902.08850},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% arXiv preprint, recorded with the arXiv identifier in the volume field.
@article{Yang2018ZerotrainingSE,
  author  = {Ziyi Yang and Chenguang Zhu and Weizhu Chen},
  title   = {Zero-training Sentence Embedding via Orthogonal Basis},
  journal = {ArXiv},
  volume  = {abs/1810.00438},
  year    = {2018},
}

% arXiv preprint (cs.CL).
@misc{shen2018baseline,
  author        = {Dinghan Shen and Guoyin Wang and Wenlin Wang and Martin Renqiang Min and Qinliang Su and Yizhe Zhang and Chunyuan Li and Ricardo Henao and Lawrence Carin},
  title         = {Baseline Needs More Love: On Simple Word-Embedding-Based Models and Associated Pooling Mechanisms},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1805.09843},
  primaryClass  = {cs.CL},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1902-06423,
  title = {{CBOW} Is Not All You Need: Combining {CBOW} with the Compositional Matrix Space Model},
  author = {Florian Mai and Lukas Galke and Ansgar Scherp},
  year = {2019},
  journal = {CoRR},
  volume = {abs/1902.06423},
  eprint = {1902.06423},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1902.06423},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1902-06423},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Tue, 21 May 2019 18:03:37 +0200},
}

% dblp record of an arXiv preprint (CoRR).
@article{DBLP:journals/corr/abs-1809-04262,
  title = {Extracting Fairness Policies from Legal Documents},
  author = {Rashmi Nagpal and Chetna Wadhwa and Mallika Gupta and Samiulla Shaikh and Sameep Mehta and Vikram Goyal},
  year = {2018},
  journal = {CoRR},
  volume = {abs/1809.04262},
  eprint = {1809.04262},
  archivePrefix = {arXiv},
  url = {http://arxiv.org/abs/1809.04262},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1809-04262},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Fri, 05 Oct 2018 11:34:52 +0200},
}

% Journal article, WORD 10(2-3), 1954. The issue and page ranges now use an
% en dash instead of a single hyphen.
@article{doi:10.1080/00437956.1954.11659520,
  author    = {Zellig S. Harris},
  title     = {Distributional Structure},
  journal   = {WORD},
  year      = {1954},
  volume    = {10},
  number    = {2--3},
  pages     = {146--162},
  publisher = {Routledge},
  doi       = {10.1080/00437956.1954.11659520},
}


@article{Halko:2011:FSR:2078879.2078881,
  author     = {Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A.},
  title      = {Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions},
  journal    = {SIAM Review},
  volume     = {53},
  number     = {2},
  pages      = {217--288},
  month      = may,
  year       = {2011},
  issue_date = {May 2011},
  issn       = {0036-1445},
  numpages   = {72},
  doi        = {10.1137/090771806},
  url        = {http://dx.doi.org/10.1137/090771806},
  acmid      = {2078881},
  publisher  = {Society for Industrial and Applied Mathematics},
  address    = {Philadelphia, PA, USA},
  keywords   = {Johnson-Lindenstrauss lemma, dimension reduction, eigenvalue decomposition, interpolative decomposition, matrix approximation, parallel algorithm, pass-efficient algorithm, principal component analysis, random matrix, randomized algorithm, rank-revealing QR factorization, singular value decomposition, streaming algorithm},
}

@book{books/daglib/0031897,
  author    = {B{\"u}ttcher, Stefan and Clarke, Charles L. A. and Cormack, Gordon V.},
  title     = {Information Retrieval: Implementing and Evaluating Search Engines},
  publisher = {MIT Press},
  year      = {2010},
  isbn      = {978-0-262-02651-2},
  pages     = {I--XXIV, 1--606},
  ee        = {http://mitpress.mit.edu/books/information-retrieval},
  keywords  = {dblp},
  added-at  = {2013-10-06T00:00:00.000+0200},
  timestamp = {2013-10-08T11:35:46.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/2e679957b4a1bdf252c3a33296397f434/dblp},
  interhash = {8aba6a7cd5e81a4c68dc1d6c9102fe7b},
  intrahash = {e679957b4a1bdf252c3a33296397f434},
}

@article{Wolf2019HuggingFacesTS,
  author        = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, R{\'e}mi and Funtowicz, Morgan and Brew, Jamie},
  title         = {{HuggingFace's} {Transformers}: State-of-the-art Natural Language Processing},
  journal       = {CoRR},
  volume        = {abs/1910.03771},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1910.03771},
  primaryClass  = {cs.CL},
}

@misc{gillick2018endtoend,
  author        = {Gillick, Daniel and Presta, Alessandro and Tomar, Gaurav Singh},
  title         = {End-to-End Retrieval in Continuous Space},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1811.08008},
  primaryClass  = {cs.IR},
}

@misc{almarwani2019efficient,
  author        = {Almarwani, Nada and Aldarmaki, Hanan and Diab, Mona},
  title         = {Efficient Sentence Embedding using Discrete Cosine Transform},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1909.03104},
  primaryClass  = {cs.CL},
}

@misc{wu2018word,
  author        = {Wu, Lingfei and Yen, Ian E. H. and Xu, Kun and Xu, Fangli and Balakrishnan, Avinash and Chen, Pin-Yu and Ravikumar, Pradeep and Witbrock, Michael J.},
  title         = {{Word Mover's Embedding}: From {Word2Vec} to Document Embedding},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1811.01713},
  primaryClass  = {cs.CL},
}

@misc{yang2019xlnet,
  author        = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V.},
  title         = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1906.08237},
  primaryClass  = {cs.CL},
  url           = {http://arxiv.org/abs/1906.08237},
  note          = {Pretrained models and code are available at https://github.com/zihangdai/xlnet},
  abstract = {With the capability of modeling bidirectional contexts, denoising
autoencoding based pretraining like BERT achieves better performance than
pretraining approaches based on autoregressive language modeling. However,
relying on corrupting the input with masks, BERT neglects dependency between
the masked positions and suffers from a pretrain-finetune discrepancy. In light
of these pros and cons, we propose XLNet, a generalized autoregressive
pretraining method that (1) enables learning bidirectional contexts by
maximizing the expected likelihood over all permutations of the factorization
order and (2) overcomes the limitations of BERT thanks to its autoregressive
formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the
state-of-the-art autoregressive model, into pretraining. Empirically, XLNet
outperforms BERT on 20 tasks, often by a large margin, and achieves
state-of-the-art results on 18 tasks including question answering, natural
language inference, sentiment analysis, and document ranking.},
  description   = {[1906.08237] XLNet: Generalized Autoregressive Pretraining for Language Understanding},
  keywords      = {language_modeling nlp tpu transfer_learning},
  added-at      = {2019-07-03T19:07:32.000+0200},
  timestamp     = {2019-07-03T19:07:32.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2b758258da935db4bc1a57b5f6c9d94c6/deepforce},
  interhash     = {cd85caa3241071a53ea5c86eadae8de8},
  intrahash     = {b758258da935db4bc1a57b5f6c9d94c6},
}


@inproceedings{conneau2018xnli,
  author    = {Conneau, Alexis and Rinott, Ruty and Lample, Guillaume and Williams, Adina and Bowman, Samuel R. and Schwenk, Holger and Stoyanov, Veselin},
  title     = {{XNLI}: Evaluating Cross-lingual Sentence Representations},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  location  = {Brussels, Belgium},
}

% OpenAI technical report; it did not appear in conference proceedings.
@techreport{Radford2018ImprovingLU,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
  title       = {Improving Language Understanding by Generative Pre-Training},
  institution = {OpenAI},
  year        = {2018},
}

@inproceedings{reimers-2019-sentence-bert,
  author    = {Reimers, Nils and Gurevych, Iryna},
  title     = {{Sentence-BERT}: Sentence Embeddings using {Siamese} {BERT-Networks}},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  url       = {http://arxiv.org/abs/1908.10084},
}

@inproceedings{Pennington14glove:global,
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.},
  title     = {{GloVe}: Global Vectors for Word Representation},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  pages     = {1532--1543},
  year      = {2014},
}

@inproceedings{Kano2017OverviewOC,
  author    = {Kano, Yoshinobu and Kim, Mi Young and Goebel, Randy and Satoh, Ken},
  title     = {Overview of {COLIEE} 2017},
  booktitle = {COLIEE@ICAIL},
  year      = {2017},
}

@inproceedings{snli:emnlp2015,
  author    = {Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.},
  title     = {A large annotated corpus for learning natural language inference},
  booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  publisher = {Association for Computational Linguistics},
  year      = {2015},
}

% Key kept for existing citations; the paper appeared at NAACL-HLT 2018 (2017 was the preprint year).
@inproceedings{Williams2017ABC,
  author    = {Williams, Adina and Nangia, Nikita and Bowman, Samuel R.},
  title     = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  year      = {2018},
}

@inproceedings{Peters:2018,
  author    = {Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  title     = {Deep Contextualized Word Representations},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  year      = {2018},
}

% arXiv preprint in 2019, so it is recorded like the file's other arXiv-only entries.
@misc{Wang2019GeneralizingFA,
  author        = {Wang, Yaqing and Yao, Quanming and Kwok, James and Ni, Lionel M.},
  title         = {Generalizing from a Few Examples: A Survey on Few-Shot Learning},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1904.05046},
  primaryClass  = {cs.LG},
}

@article{DBLP:journals/corr/VaswaniSPUJGKP17,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  journal       = {CoRR},
  volume        = {abs/1706.03762},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1706.03762},
  url           = {http://arxiv.org/abs/1706.03762},
  timestamp     = {Mon, 13 Aug 2018 16:48:37 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/VaswaniSPUJGKP17},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

@inproceedings{arora2017asimple,
  author    = {Arora, Sanjeev and Liang, Yingyu and Ma, Tengyu},
  title     = {A Simple but Tough-to-Beat Baseline for Sentence Embeddings},
  booktitle = {International Conference on Learning Representations},
  year      = {2017},
}


@inproceedings{zhao2019moverscore,
  author    = {Zhao, Wei and Peyrard, Maxime and Liu, Fei and Gao, Yang and Meyer, Christian M. and Eger, Steffen},
  title     = {{MoverScore}: Text Generation Evaluating with Contextualized Embeddings and {Earth Mover} Distance},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
}


% OpenAI technical report released in February 2019; it has no journal.
@techreport{gpt2,
  author      = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
  url         = {https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf},
  description = {Language Models are Unsupervised Multitask Learners},
  keywords    = {learning multitask},
  added-at    = {2019-02-27T03:35:25.000+0100},
  timestamp   = {2019-02-27T03:35:25.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2b30710316a8cfbae687672ea1f85c193/kirk86},
  interhash   = {ce8168300081d74707849ed488e2a458},
  intrahash   = {b30710316a8cfbae687672ea1f85c193},
}

@misc{lample2019cross,
  author        = {Lample, Guillaume and Conneau, Alexis},
  title         = {Cross-lingual Language Model Pretraining},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1901.07291},
  primaryClass  = {cs.CL},
}

@inproceedings{pmlr-v37-kusnerb15,
  author    = {Kusner, Matt and Sun, Yu and Kolkin, Nicholas and Weinberger, Kilian},
  title     = {From Word Embeddings To Document Distances},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning},
  editor    = {Bach, Francis and Blei, David},
  series    = {Proceedings of Machine Learning Research},
  volume    = {37},
  pages     = {957--966},
  month     = jul,
  year      = {2015},
  address   = {Lille, France},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v37/kusnerb15.pdf},
  url       = {http://proceedings.mlr.press/v37/kusnerb15.html},
  abstract  = {We present the Word Mover’s Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local co-occurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover’s Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.}
}

@inproceedings{Rabelo:2019:CST:3322640.3326741,
  author    = {Rabelo, Juliano and Kim, Mi-Young and Goebel, Randy},
  title     = {Combining Similarity and Transformer Methods for Case Law Entailment},
  booktitle = {Proceedings of the Seventeenth International Conference on Artificial Intelligence and Law},
  series    = {ICAIL '19},
  pages     = {290--296},
  numpages  = {7},
  year      = {2019},
  location  = {Montreal, QC, Canada},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-6754-7},
  doi       = {10.1145/3322640.3326741},
  url       = {http://doi.acm.org/10.1145/3322640.3326741},
  acmid     = {3326741},
  keywords  = {binary classification, document similarity, imbalanced datasets, legal textual entailment},
}

@article{DBLP:journals/corr/abs-1803-11175,
  author        = {Cer, Daniel and
                   Yang, Yinfei and
                   Kong, Sheng{-}yi and
                   Hua, Nan and
                   Limtiaco, Nicole and
                   {St. John}, Rhomni and
                   Constant, Noah and
                   Guajardo{-}Cespedes, Mario and
                   Yuan, Steve and
                   Tar, Chris and
                   Sung, Yun{-}Hsuan and
                   Strope, Brian and
                   Kurzweil, Ray},
  title         = {Universal Sentence Encoder},
  journal       = {CoRR},
  volume        = {abs/1803.11175},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1803.11175},
  url           = {http://arxiv.org/abs/1803.11175},
  timestamp     = {Mon, 13 Aug 2018 16:46:40 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1803-11175},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Technical report, not a journal article.
@techreport{senin2008dynamic,
  author      = {Senin, Pavel},
  title       = {Dynamic Time Warping Algorithm Review},
  institution = {Information and Computer Science Department, University of Hawaii at Manoa},
  address     = {Honolulu, USA},
  year        = {2008},
  publisher   = {Citeseer},
}

@incollection{Sakoe:1990:DPA:108235.108244,
  author    = {Sakoe, Hiroaki and Chiba, Seibi},
  title     = {Dynamic Programming Algorithm Optimization for Spoken Word Recognition},
  booktitle = {Readings in Speech Recognition},
  editor    = {Waibel, Alex and Lee, Kai-Fu},
  pages     = {159--165},
  numpages  = {7},
  year      = {1990},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address   = {San Francisco, CA, USA},
  isbn      = {1-55860-124-4},
  url       = {http://dl.acm.org/citation.cfm?id=108235.108244},
  acmid     = {108244},
}

% Chapter of a Springer book, not a journal article.
@incollection{muller2007dynamic,
  author    = {M{\"u}ller, Meinard},
  title     = {Dynamic Time Warping},
  booktitle = {Information Retrieval for Music and Motion},
  pages     = {69--84},
  year      = {2007},
  publisher = {Springer},
}