169 lines
3.5 KiB
Plaintext
169 lines
3.5 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
 "cell_type": "code",
 "execution_count": 23,
 "metadata": {},
 "outputs": [],
 "source": [
  "import lzma\n",
  "\n",
  "# Read the xz-compressed training TSV line by line.\n",
  "# lzma.open() defaults to binary mode, so each line is decoded explicitly;\n",
  "# every element of NDAs is one raw line of the file (trailing newline kept).\n",
  "NDAs = []\n",
  "with lzma.open('train/in.tsv.xz') as f:\n",
  "    for line in f:\n",
  "        NDAs.append(line.decode('utf-8'))"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 40,
 "metadata": {},
 "outputs": [],
 "source": [
  "import spacy\n",
  "from spacy import displacy\n",
  "\n",
  "# Load the pipeline that spaCy resolves for the name/path 'NER'.\n",
  "nlp = spacy.load('NER')\n",
  "\n",
  "# Run the pipeline over a single entry of NDAs (index 9).\n",
  "text = NDAs[9]\n",
  "doc = nlp(text)\n",
  "\n",
  "# One list of entity texts per label; later cells read these names.\n",
  "effective_date = []\n",
  "jurisdiction = []\n",
  "party = []\n",
  "term = []\n",
  "\n",
  "# Route every recognised entity into the list matching its label.\n",
  "for word in doc.ents:\n",
  "    if word.label_ == 'effective_date':\n",
  "        effective_date.append(word.text)\n",
  "    elif word.label_ == 'jurisdiction':\n",
  "        jurisdiction.append(word.text)\n",
  "    elif word.label_ == 'party':\n",
  "        party.append(word.text)\n",
  "    else:\n",
  "        # Catch-all: any label other than the three above is stored as a term.\n",
  "        term.append(word.text)"
 ]
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 44,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"12"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 44,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"jurisdiction.count('New York')"
|
||
|
]
|
||
|
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 45,
 "metadata": {},
 "outputs": [],
 "source": [
  "from collections import Counter\n",
  "\n",
  "# Count occurrences of each extracted jurisdiction string in a single pass.\n",
  "# dict(...) keeps juris a plain dict, with keys in first-seen order.\n",
  "juris = dict(Counter(jurisdiction))"
 ]
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 47,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"{'New York': 12}"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 47,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"juris"
|
||
|
]
|
||
|
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 39,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "CompuDyne Corporation --> party\n",
    "two years --> term\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "CompuDyne Corporation --> party\n",
    "two years --> term\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "CompuDyne Corporation --> party\n",
    "two years --> term\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "CompuDyne Corporation --> party\n",
    "two years --> term\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n",
    "New York --> jurisdiction\n"
   ]
  }
 ],
 "source": [
  "# Print every entity found in NDAs[9] together with its label.\n",
  "text = NDAs[9]\n",
  "doc = nlp(text)\n",
  "for word in doc.ents:\n",
  "    print(word.text, '-->', word.label_)"
 ]
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"interpreter": {
|
||
|
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
|
||
|
},
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3.8.10 64-bit",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.10"
|
||
|
},
|
||
|
"orig_nbformat": 4
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|