kleister-nda/run.ipynb at no-ner

Iwona Christop 7e24387a46 No NER

2022-05-03 23:45:59 +02:00

6.0 KiB

Raw Permalink Blame History

import lzma


# Names of the 50 US states, in alphabetical order.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# Two-digit month number (as a string) -> English month name.
months = {'01': 'January', '02': 'February', '03': 'March', 
    '04': 'April', '05': 'May', '06': 'June',
    '07': 'July', '08': 'August', '09': 'September',
    '10': 'October', '11': 'November', '12': 'December'}

# NOTE: despite its name, this maps an integer to its English word
# (1 -> 'one'), covering 1-20 and the tens from 30 to 90.
wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }

# Time-unit words; presumably used when parsing contract durations
# (not referenced in the visible part of the notebook) -- TODO confirm.
terms = ['days', 'months', 'years']


def readInput(dir):
    """Load an xz-compressed text file as a list of lines.

    Each line is read as bytes, decoded as UTF-8, and has every occurrence
    of the escape sequence given in the ``replace`` call below turned into
    a single space. Line terminators are left in place.

    Args:
        dir: Path of the ``.xz`` file to read (a file path, despite the name).

    Returns:
        A list with one string per line of the decompressed file.
    """
    # NOTE(review): the literal below is two backslashes followed by 'n';
    # if the data encodes line breaks as a single backslash + 'n', this
    # pattern would not match -- confirm against the input file.
    with lzma.open(dir) as archive:
        return [raw.decode('utf-8').replace('\\\\n', ' ') for raw in archive]

# Entry point when run as a script or notebook cell: load the training
# documents into NDAs (one string per line of the input file).
if __name__ == '__main__':
    NDAs = readInput('train/in.tsv.xz')
    # Starts empty; nothing in the visible cells appends to it.
    predicted = []

import re

# Extract the most frequently mentioned "<day> <Month> <year>" date from one
# document and normalise it to 'YYYY-MM-DD'; effectiveDate ends up as '' when
# no complete date is found.

# Characters stripped from the document before looking for dates.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\\\\\]^_`{|}~'
# Filler tokens removed so that e.g. "6th day of January 2012" collapses to
# "6 January 2012".
stopWords = ['day ', 'of ', 'st', 'nd', 'rd', 'th', 'date ', 'Date ',
     'as ', 'on ', 'from ', 'dated ']
# Ordinal endings are only stripped directly after a digit; removing them
# everywhere would turn "August" into "Augu" and hide every August date.
ordinalSuffixes = ('st', 'nd', 'rd', 'th')

text = NDAs[2]

# Drop all punctuation characters in a single pass.
text = text.translate(str.maketrans('', '', punctuation))

for word in stopWords:
    if word in ordinalSuffixes:
        text = re.sub(r'(?<=\d)' + word + r'\b', '', text)
    else:
        text = text.replace(word, '')

# Every "<number> <Month> <number>" occurrence, for each month name present.
dateCandidates = []
for month in months.values():
    if month in text:
        dateCandidates.extend(re.findall(r'\d+ ' + month + r' \d+', text))

# Candidate -> number of occurrences; insertion order is kept, so ties are
# resolved in favour of the candidate found first.
dateCounts = {date: dateCandidates.count(date) for date in dateCandidates}

# Reverse lookup: month name -> two-digit month number.
monthNumbers = {name: number for number, name in months.items()}

effectiveDate = ''
if dateCounts:
    # Reset per run so a missing day/year cannot be filled in by a stale
    # value left over from a previous execution of this cell.
    day = year = None
    for d in max(dateCounts, key=dateCounts.get).split():
        if d in monthNumbers:
            month = monthNumbers[d]
        elif int(d) < 32:
            # Zero-pad via int() so both '6' and '06' become '06'.
            day = '{:02d}'.format(int(d))
        elif 1900 < int(d) < 2030:
            year = str(int(d))
    # Only emit a date when both remaining parts were recognised.
    if day is not None and year is not None:
        effectiveDate = year + '-' + month + '-' + day

effectiveDate

'2012-01-06'

# Exploratory cell: look for company names of the form "<Word>, Inc." in the
# second loaded document.
text = NDAs[1]

# The final dot is escaped so that only a literal period matches; an
# unescaped '.' would accept any character after "Inc".
re.findall(r'[A-Z]+[a-z]+, Inc\.', text)

['Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.',
 'Pharma, Inc.',
 'Sciences, Inc.']

# Scratch check: str.capitalize() upper-cases the first character and
# lower-cases the rest ('NIKE' -> 'Nike').
'NIKE'.capitalize()

'Nike'

6.0 KiB Raw Permalink Blame History

6.0 KiB

Raw Permalink Blame History