# 6.0 KiB  (file-size residue from the notebook export; not code)
import lzma
# Names of the 50 U.S. states in alphabetical order, one initial letter
# (or part of one) per row.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico",
    "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]
# Maps a zero-padded two-digit month number ('01'..'12') to the English
# month name, in calendar order.
months = {
    '%02d' % number: name
    for number, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1,
    )
}
# Despite its name, this maps an integer to its English word:
# every value from 1 to 20, then the round tens from 30 to 90.
wordToNumber = {
    # 1-10
    1: 'one',
    2: 'two',
    3: 'three',
    4: 'four',
    5: 'five',
    6: 'six',
    7: 'seven',
    8: 'eight',
    9: 'nine',
    10: 'ten',
    # 11-20
    11: 'eleven',
    12: 'twelve',
    13: 'thirteen',
    14: 'fourteen',
    15: 'fifteen',
    16: 'sixteen',
    17: 'seventeen',
    18: 'eighteen',
    19: 'nineteen',
    20: 'twenty',
    # round tens
    30: 'thirty',
    40: 'forty',
    50: 'fifty',
    60: 'sixty',
    70: 'seventy',
    80: 'eighty',
    90: 'ninety',
}

# Units of time, as plural words.
terms = ['days', 'months', 'years']
def readInput(dir):
    """Read an xz-compressed UTF-8 text file and return its lines.

    Args:
        dir: Path of the ``.xz`` file to read.  (The name shadows the
            ``dir`` builtin; it is kept so existing callers keep working.)

    Returns:
        A list with one string per line of the file.  Each line keeps its
        trailing line terminator, and every occurrence of the escaped
        line-break marker is replaced by a single space.

    Raises:
        OSError, lzma.LZMAError: If the file cannot be opened or
            decompressed.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # Text mode decodes once at the I/O boundary.  newline='\n' disables
    # newline translation, so lines are split only on '\n' and returned
    # untranslated -- the same lines a binary read plus decode would yield.
    with lzma.open(dir, 'rt', encoding='utf-8', newline='\n') as f:
        # NOTE(review): this literal is two backslashes followed by 'n';
        # confirm that this is how line breaks are escaped in the data.
        return [line.replace('\\\\n', ' ') for line in f]
import re
from collections import Counter

# Characters deleted from a document before it is searched for dates.
# (The literal contains the backslash several times; duplicates are harmless.)
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\\\\\]^_`{|}~'

# Filler fragments deleted so that e.g. "6 day of January 2012" collapses to
# "6 January 2012".  They are removed as plain substrings, in this order.
stopWords = ['day ', 'of ', 'date ', 'Date ',
             'as ', 'on ', 'from ', 'dated ']

# Ordinal suffixes are stripped only when they directly follow a digit.
# Removing 'st' as a plain substring would turn "August" into "Augu", and
# August dates could then never be matched.
ordinalSuffix = re.compile(r'(?<=\d)(?:st|nd|rd|th)\b')


def extractEffectiveDate(text, monthNames=None):
    """Return the most frequent "<day> <Month> <year>" date found in *text*.

    The text is stripped of punctuation, ordinal suffixes and filler words,
    then searched for ``<digits> <Month> <digits>``.  The match that occurs
    most often wins; on a tie the first one counted wins (months are
    scanned in the order of *monthNames*).

    Args:
        text: The document to search.
        monthNames: Mapping of two-digit month number to month name, in the
            same shape as the module-level ``months``.  Defaults to
            ``months``.

    Returns:
        The date as ``'YYYY-MM-DD'``, or ``''`` when no date is found or the
        winning match lacks a valid day (1-31) or year (1901-2029).
    """
    if monthNames is None:
        monthNames = months
    numberByName = {name: number for number, name in monthNames.items()}

    cleaned = text.translate(str.maketrans('', '', punctuation))
    cleaned = ordinalSuffix.sub('', cleaned)
    for word in stopWords:
        cleaned = cleaned.replace(word, '')

    # Count every candidate, scanning month by month.
    candidates = Counter()
    for name in numberByName:
        if name in cleaned:
            candidates.update(
                re.findall(r'\d+ ' + re.escape(name) + r' \d+', cleaned))
    if not candidates:
        return ''
    best = max(candidates, key=candidates.get)

    # The numeric tokens are classified by value, not by position.
    day = month = year = None
    for token in best.split():
        if token in numberByName:
            month = numberByName[token]
            continue
        value = int(token)
        if 1 <= value <= 31:
            # Format from the integer so that "06" does not become "006".
            day = '%02d' % value
        elif 1900 < value < 2030:
            # Hard-coded plausibility window for the year.
            year = str(value)
    if day is None or month is None or year is None:
        return ''
    return year + '-' + month + '-' + day


if __name__ == '__main__':
    NDAs = readInput('train/in.tsv.xz')
    predicted = []

    text = NDAs[2]
    effectiveDate = extractEffectiveDate(text)
    # The recorded notebook run displayed '2012-01-06' at this point.
    print(effectiveDate)
def findIncCompanies(text):
    """Return every "<Word>, Inc." occurrence in *text*.

    Only the capitalised word directly before ", Inc." is captured (one or
    more uppercase letters followed by one or more lowercase letters).  The
    final dot is escaped so that it matches a literal period only.

    Args:
        text: The document to search.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    return re.findall(r'[A-Z]+[a-z]+, Inc\.', text)


if __name__ == '__main__':
    # Exploratory checks.  They are guarded because NDAs only exists when
    # the file is run as a script.
    text = NDAs[1]
    # Recorded notebook output for NDAs[1]:
    # ['Pharma, Inc.', 'Sciences, Inc.', 'Pharma, Inc.', 'Sciences, Inc.',
    #  'Pharma, Inc.', 'Sciences, Inc.', 'Pharma, Inc.', 'Sciences, Inc.']
    print(findIncCompanies(text))
    # str.capitalize() upper-cases the first letter and lower-cases the rest.
    print('NIKE'.capitalize())  # recorded output: 'Nike'