kleister-nda/heSaidEdgar.ipynb

169 lines
3.5 KiB
Plaintext
Raw Normal View History

2022-05-03 20:10:12 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"\n",
"NDAs = []\n",
"with lzma.open('train/in.tsv.xz') as f:\n",
" for line in f:\n",
" NDAs.append(line.decode('utf-8'))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"from spacy import displacy\n",
"\n",
"nlp = spacy.load('NER')\n",
"\n",
"text = NDAs[9]\n",
"doc = nlp(text)\n",
"\n",
"effective_date = []\n",
"jurisdiction = []\n",
"party = []\n",
"term = []\n",
"\n",
"for word in doc.ents:\n",
" if word.label_ == 'effective_date':\n",
" effective_date.append(word.text)\n",
" elif word.label_ == 'jurisdiction':\n",
" jurisdiction.append(word.text)\n",
" elif word.label_ == 'party':\n",
" party.append(word.text)\n",
" else:\n",
" term.append(word.text)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jurisdiction.count('New York')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"juris = { j : jurisdiction.count(j) for j in jurisdiction}"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'New York': 12}"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"juris"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CompuDyne Corporation --> party\n",
"two years --> term\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"CompuDyne Corporation --> party\n",
"two years --> term\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"CompuDyne Corporation --> party\n",
"two years --> term\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"CompuDyne Corporation --> party\n",
"two years --> term\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n",
"New York --> jurisdiction\n"
]
}
],
"source": [
"text = NDAs[9]\n",
"doc = nlp(text)\n",
"for word in doc.ents:\n",
" print(word.text, '-->', word.label_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}