added csv_split

This commit is contained in:
Michael Herman 2014-05-30 09:24:44 -05:00
parent 8b65a7c03d
commit 9828e28fc5

View File

@ -1,167 +1,131 @@
### WIP
import sys import sys
import os import os
import getopt
import csv import csv
import argparse
""" """
Splits a CSV file into multiple pieces based on command line arguments.
Splits a CSV file into multiple pieces based on command line arguments.
Arguments: Arguments:
`-h`: help file of usage of the script `-h`: help file of usage of the script
`-i`: input file name `-i`: input file name
`-o`: output file, A %s-style template for the numbered output files. `-o`: output file name
`-r`: row limit to split `-r`: row limit to split
`-c`: A %s-style template for the numbered output files.
Default settings: Default settings:
`output_path` is the current directory `output_path` is the current directory
`keep_headers` is on (headers will be kept) headers are displayed on each split file
`delimiter` is , the default delimiter is a comma
Example usage: Example usage:
# split by every 10000 rows
>> python 12_csv_split.py -i input.csv -o rownumber -r 10000 ```
# split by unique items in column 0 # split csv by every 100 rows
>> python 12_csv_split.py -i input.csv -o userid -c 0 >> python csv_split.py -i input.csv -o output -r 100
# access help ```
>> python 12_csv_split.py -h for help
""" """
def main(argv):
argument_dict = grab_command_line_arguments(argv) def get_arguments():
parse_file(argument_dict) """Grab user supplied arguments using the argparse library."""
# Use argparse to get command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", required=True,
help="csv input file (with extension)", type=str)
parser.add_argument("-o", "--output_file", required=True,
help="csv output file (without extension)", type=str)
parser.add_argument("-r", "--row_limit", required=True,
help="row limit to split csv at", type=int)
args = parser.parse_args()
# Check if the input_file exists
is_valid_file(parser, args.input_file)
# Check if the input_file is valid
is_valid_csv(parser, args.input_file, args.row_limit)
return args.input_file, args.output_file, args.row_limit
def grab_command_line_arguments(argv): def is_valid_file(parser, file_name):
"""Ensure that the input_file exists."""
# global variables if not os.path.exists(file_name):
inputfile = '' parser.error("The file '{}' does not exist!".format(file_name))
outputfile = '' sys.exit(1)
rowlimit = ''
columnindex = ''
argument_dict = {}
# grab arguments
opts, args = getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])
# end if no arguments provided
if not opts:
print "No options provided. Try again. Use `-h` for help."
sys.exit()
# grab arguments
for opt, arg in opts:
if opt == '-h':
print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
sys.exit()
elif opt in ("-i", "--ifile"):
inputfile = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
elif opt in ("-r", "--rowlimit"):
rowlimit = arg
elif opt in ("-c", "--columnindex"):
columnindex = arg
# Output arguments
print "\nArguments:"
if inputfile:
argument_dict["input_file"] = inputfile
print "Input file is '{}'".format(inputfile)
else:
"Please enter an input file."
if outputfile:
argument_dict["output_file"] = outputfile
print "Output file is '{}'".format(outputfile)
else:
print "Please enter an output file."
if rowlimit:
argument_dict["rowlimit"] = rowlimit
print "Rowlimit is '{}'".format(rowlimit)
if columnindex:
argument_dict["columnindex"] = columnindex
print "Columnindex is '{}'".format(columnindex)
if rowlimit and columnindex:
print "Please use either a rowlimit or columnlimit, not both."
sys.exit()
if not rowlimit or columnindex:
print "Please enter either a rowlimit or columnlimit."
sys.exit()
# to do - check to make sure file, rowlimit, and columnlimit exist
print argument_dict
return argument_dict
def parse_file(argument_dict): def is_valid_csv(parser, file_name, row_limit):
"""
#split csv file by certain rownumber Ensure that the # of rows in the input_file
if argument_dict["rowlimit"]: is greater than the row_limit.
rowlimit = int(argument_dict["rowlimit"]) """
output_name_file = "{}.csv".format(argument_dict["output_file"]) row_count = 0
output_path='.' for row in csv.reader(open(file_name)):
keep_headers=True row_count += 1
delimiter=',' # Note: You could also use a generator expression
filehandler = open(argument_dict["input_file"],'r') # and the sum() function to count the rows:
reader = csv.reader(filehandler, delimiter=delimiter) # row_count = sum(1 for row in csv.reader(open(file_name)))
current_piece = 1 if row_limit > row_count:
current_out_path = os.path.join( parser.error(
output_path, "The 'row_count' of '{}' is > the number of rows in '{}'!"
output_name_file .format(row_limit, file_name)
) )
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter) sys.exit(1)
current_limit = rowlimit
if keep_headers:
headers = reader.next() def parse_file(arguments):
current_out_writer.writerow(headers) """
for i, row in enumerate(reader): Splits the CSV into multiple files or chunks based on the row_limit.
if i + 1 > current_limit: Then create new CSV files.
current_piece += 1 """
current_limit = rowlimit * current_piece input_file = arguments[0]
current_out_path = os.path.join( output_file = arguments[1]
row_limit = arguments[2]
output_path = '.' # Current directory
# Read CSV, split into list of lists
with open(input_file, 'r') as input_csv:
datareader = csv.reader(input_csv)
all_rows = []
for row in datareader:
all_rows.append(row)
# Remove header
header = all_rows.pop(0)
# Split list of list into chunks
current_chunk = 0
for i in range(0, len(all_rows), row_limit): # Loop through list
chunk = all_rows[i:i + row_limit] # Create single chunk
current_output = os.path.join( # Create new output file
output_path, output_path,
output_name_file "{}-{}.csv".format(output_file, current_chunk)
) )
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
# elif columnindex: #split csv file according to unique values of certain column,it's like filter only certain item in excel # Add header
# itemlist = [] chunk.insert(0, header)
# columnindex = int(columnindex)
# output_name_template= outputfile+'_%s.csv'
# output_path='.'
# keep_headers=True
# delimiter=','
# filehandler = open(inputfile,'r')
# reader = csv.reader(filehandler, delimiter=delimiter)
# if keep_headers:
# headers = reader.next()
# for i, row in enumerate(reader): # Write chunk to output file
with open(current_output, 'w') as output_csv:
writer = csv.writer(output_csv)
writer = writer.writerows(chunk)
# current_out_path = os.path.join( # Output info
# output_path, print ""
# output_name_template % row[columnindex] ) print "Chunk # {}:".format(current_chunk)
# if row[columnindex] not in itemlist: print "Filepath: {}".format(current_output)
# try: print "# of rows: {}".format(len(chunk))
# current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
# except IOError: # Create new chunk
# continue current_chunk += 1
# else:
# itemlist.append(row[columnindex])
# if keep_headers:
# current_out_writer.writerow(headers)
# current_out_writer.writerow(row)
# else:
# current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
# current_out_writer.writerow(row)
# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
# else:
# print "oops, please check instruction of script by >>./csvsplit.py -h"
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) arguments = get_arguments()
parse_file(arguments)