kleister-nda/run.ipynb
Iwona Christop 7e24387a46 No NER
2022-05-03 23:45:59 +02:00

6.0 KiB

import lzma


# Names of the 50 U.S. states, in alphabetical order.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Two-digit month number (as a string) -> English month name.
months = {
    '%02d' % number: name
    for number, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1,
    )
}

# NOTE: despite its name, this maps an integer to its English word
# (1..20, then the tens 30..90).
wordToNumber = dict(enumerate(
    ('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
     'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
     'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty'),
    start=1,
))
wordToNumber.update(zip(
    range(30, 100, 10),
    ('thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'),
))

# Units of time, plural form.
terms = 'days months years'.split()


def readInput(dir):
    """Read an xz-compressed text file and return its lines as a list.

    Each line is decoded as UTF-8 (the trailing newline is kept) and every
    occurrence of the escape pattern below is replaced by a single space.

    NOTE(review): the pattern is two backslash characters followed by ``n``;
    confirm this is the escaping actually used in the input file.
    """
    with lzma.open(dir) as archive:
        return [
            raw.decode('utf-8').replace('\\\\n', ' ')
            for raw in archive
        ]
# Entry point: only when executed as the main module, read the documents
# from 'train/in.tsv.xz' into NDAs and create an empty `predicted` list.
if __name__ == '__main__':
    NDAs = readInput('train/in.tsv.xz')
    predicted = []
import re

# Characters deleted from a document before it is searched for dates.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\\\\\]^_`{|}~'
# Filler tokens that sit between the day and the month in phrases such as
# "6th day of January, 2012".
stopWords = ['day ', 'of ', 'st', 'nd', 'rd', 'th', 'date ', 'Date ',
     'as ', 'on ', 'from ', 'dated ']


def _normalizeDateText(text):
    """Return `text` reduced so that dates read "<day> <Month> <year>".

    Punctuation is deleted, ordinal suffixes are stripped from numbers and
    the filler words in `stopWords` are removed.
    """
    text = text.translate(str.maketrans('', '', punctuation))
    # Strip ordinal suffixes only when they directly follow a digit.
    # Removing 'st' everywhere would turn "August" into "Augu" and make
    # every August date unmatchable.
    text = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', text)
    for word in stopWords:
        if word not in ('st', 'nd', 'rd', 'th'):
            text = text.replace(word, '')
    return text


def _toIsoDate(candidate):
    """Convert "<number> <Month> <number>" to 'YYYY-MM-DD', or '' on failure.

    A number from 1 to 31 is taken as the day; a number strictly between
    1900 and 2030 is taken as the year.
    """
    monthNumbers = {name: number for number, name in months.items()}
    day = month = year = None
    for token in candidate.split():
        if token in monthNumbers:
            month = monthNumbers[token]
            continue
        try:
            value = int(token)
        except ValueError:
            return ''
        if 1 <= value <= 31:
            # zfill avoids the '006' produced by blindly prefixing '0'.
            day = str(value).zfill(2)
        elif 1900 < value < 2030:
            year = str(value)
    if day is None or month is None or year is None:
        return ''
    return '-'.join((year, month, day))


def _mostFrequentDate(text):
    """Return the most frequent date in normalized `text` as 'YYYY-MM-DD'.

    Returns '' when no date is found or the winning candidate lacks a
    plausible day or year. Ties go to the candidate found first.
    """
    candidates = []
    for month in months.values():
        candidates.extend(re.findall(r'\d+ ' + month + r' \d+', text))
    if not candidates:
        return ''
    counts = {}
    for candidate in candidates:
        counts[candidate] = counts.get(candidate, 0) + 1
    return _toIsoDate(max(counts, key=counts.get))


text = _normalizeDateText(NDAs[2])
effectiveDate = _mostFrequentDate(text)
effectiveDate
'2012-01-06'
text = NDAs[1]

# List every capitalised word directly followed by ", Inc.". The final dot
# is escaped so it matches a literal period instead of any character.
re.findall(r'[A-Z]+[a-z]+, Inc\.', text)
['Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.']
# Scratch check: str.capitalize() upper-cases the first character and
# lower-cases the rest.
'NIKE'.capitalize()
'Nike'