161 lines
5.0 KiB
Python
161 lines
5.0 KiB
Python
|
# Sebastian Raschka 2014-2020
|
||
|
# mlxtend Machine Learning Library Extensions
|
||
|
#
|
||
|
# Function for processing names.
|
||
|
# Author: Sebastian Raschka <sebastianraschka.com>
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
|
||
|
import unicodedata
|
||
|
import string
|
||
|
import re
|
||
|
import sys
|
||
|
from pandas import __version__ as pandas_version
|
||
|
from distutils.version import LooseVersion as Version
|
||
|
|
||
|
|
||
|
# Refuse to load under Python 2: the name handling in this module relies on
# Python 3 text (unicode) strings.
if sys.version_info[0] < 3:
    raise ImportError(
        "Sorry, the text.names module is incompatible"
        " with Python 2.x,"
        " due to its unicode intricacies"
    )
|
||
|
|
||
|
|
||
|
def generalize_names(name, output_sep=' ', firstname_output_letters=1):
    """Reduce a person's name to a lowercase "<last><sep><first initial(s)>" key.

    Description : Accepts names written as "First Last",
    "First Middle Last" or "Last, First" and returns
    `<last_name><separator><firstname letter(s)>` in lowercase, with
    accents and non-ASCII-letter characters removed.

    Parameters
    ----------
    name : `str`
        The name to generalize.
    output_sep : `str` (default: ' ')
        String placed between the last name and the abbreviated
        first name in the output.
    firstname_output_letters : `int` (default: 1)
        Number of leading first-name letters to keep.

    Returns
    ----------
    gen_name : `str`
        The generalized name. If no first/last pair can be recognized
        (e.g. a single word), the cleaned input is returned as is.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/generalize_names/

    """
    cleaned = name.lower()

    # Fuse a leading name particle (mostly Dutch) with the word after it,
    # e.g. "van der sar" -> "vandersar", so it counts as one surname token.
    for prefix in ('van der ', 'de ', 'van ', 'von ', 'di '):
        if cleaned.startswith(prefix):
            remainder = cleaned[len(prefix) - 1:].strip()
            cleaned = prefix.replace(' ', '') + remainder

    # Do the same for particles that appear after another word or a comma.
    inner_particles = (' van der ', ' de ', ' van ', ' von ', ' di ',
                       ', van der ', ', de', ', van ', ', von ', ', di ')
    for particle in inner_particles:
        cleaned = cleaned.replace(particle, ' ' + particle.replace(' ', ''))

    # A comma signals the "Last, First" layout: the surname comes first and
    # the given name is the second token rather than the final one.
    comma_form = ',' in cleaned
    if comma_form:
        cleaned = cleaned.replace(',', '')

    tokens = cleaned.split()
    if len(tokens) > 2:
        # Drop middle names: keep the first token plus the relevant other one.
        partner = tokens[1] if comma_form else tokens[-1]
        cleaned = '%s %s' % (tokens[0], partner)

    # Python 2 byte strings would need decoding first; the module refuses to
    # import on Python 2, so this branch is not expected to run.
    if sys.version_info.major == 2:
        cleaned = cleaned.decode('utf-8')

    # Strip accents: NFKD splits accented letters into base letter plus
    # combining mark, then everything except ASCII letters/spaces is dropped.
    allowed = string.ascii_letters + ' '
    decomposed = unicodedata.normalize('NFKD', cleaned)
    cleaned = ''.join(ch for ch in decomposed if ch in allowed)

    parsed = re.match(r'(?P<first>\w+)\W+(?P<last>\w+)', cleaned)
    if parsed is None:
        return cleaned.strip()

    # The regex groups are positional; map them to surname / given name
    # according to the detected layout.
    if comma_form:
        surname, given = parsed.group('first'), parsed.group('last')
    else:
        surname, given = parsed.group('last'), parsed.group('first')

    combined = '%s%s%s' % (surname,
                           output_sep,
                           given[:firstname_output_letters])
    return combined.strip()
|
||
|
|
||
|
|
||
|
def generalize_names_duplcheck(df, col_name):
    """Generalize names and remove duplicates.

    Description : Applies `generalize_names` to a DataFrame column
    with 1 first name letter by default
    and uses more first name letters for rows whose generalized
    names collide.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame that contains a column where
        generalize_names should be applied. It is not modified.
    col_name : `str`
        Name of the DataFrame column where `generalize_names`
        function should be applied to.

    Returns
    ----------
    df_new : `pandas.DataFrame`
        Copy of `df` in which `col_name` holds generalized names
        without duplicates. Rows whose original name is an exact
        duplicate of an earlier row are dropped. Rows whose names
        stay identical no matter how many first name letters are used
        (e.g. they differ only in case or accents) are likewise
        reduced to their first occurrence.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/generalize_names_duplcheck/

    """
    df_new = df.copy()
    df_new.drop_duplicates(subset=[col_name], inplace=True)
    df_new[col_name] = df_new[col_name].apply(generalize_names)

    def colliding_rows():
        # keep=False flags every member of a duplicate group, so all rows
        # involved in a collision get their first name lengthened.
        mask = df_new.duplicated(subset=col_name, keep=False)
        return list(df_new[mask].index)

    # NOTE(review): `df.loc[idx, col_name]` must yield a single string, so
    # index labels are assumed to be unique -- confirm against callers.
    firstname_letters = 2
    dupl = colliding_rows()
    while dupl:
        changed = False
        for idx in dupl:
            longer = generalize_names(
                df.loc[idx, col_name],
                firstname_output_letters=firstname_letters)
            if longer != df_new.loc[idx, col_name]:
                df_new.loc[idx, col_name] = longer
                changed = True

        if not changed:
            # Every colliding first name is already used in full, so adding
            # letters can never separate these rows. Stop instead of looping
            # forever and treat the leftovers as duplicates.
            df_new.drop_duplicates(subset=[col_name], inplace=True)
            break

        dupl = colliding_rows()
        firstname_letters += 1

    return df_new
|