python-scripts/12_csv_split.py

import sys
import os
import csv
import argparse

"""

Splits a CSV file into multiple pieces based on command line arguments.

    Arguments:

    `-h`: show the script's usage help
    `-i`: input file name (with extension)
    `-o`: output file name (without extension)
    `-r`: row limit to split at

    Default settings:

    `output_path` is the current directory
    headers are written to each split file
    the default delimiter is a comma

    Example usage:

    ```
    # split csv by every 100 rows
    >> python csv_split.py -i input.csv -o output -r 100
    ```

"""


def get_arguments():
    """
    Parse and validate the command line options.

    Returns a 3-tuple ``(input_file, output_file, row_limit)`` once the
    input file has passed both validation helpers.
    """
    cli = argparse.ArgumentParser()

    # (short flag, long flag, help text, value type) for each required option
    option_specs = (
        ("-i", "--input_file", "csv input file (with extension)", str),
        ("-o", "--output_file", "csv output file (without extension)", str),
        ("-r", "--row_limit", "row limit to split csv at", int),
    )
    for short_flag, long_flag, help_text, value_type in option_specs:
        cli.add_argument(short_flag, long_flag, required=True,
                         help=help_text, type=value_type)

    parsed = cli.parse_args()
    source_path = parsed.input_file

    # The input file has to exist ...
    is_valid_file(cli, source_path)

    # ... and has to hold enough rows for the requested row limit.
    is_valid_csv(cli, source_path, parsed.row_limit)

    return source_path, parsed.output_file, parsed.row_limit


def is_valid_file(parser, file_name):
    """
    Ensure that the input_file exists and is a regular file.

    Arguments:
        parser: the argument parser whose ``error()`` method is used to
            report the problem to the user.
        file_name: path of the input file to check.

    Returns None when the path is usable. Otherwise the problem is reported
    through ``parser.error()`` and the process exits.
    """
    if os.path.isfile(file_name):
        return

    if not os.path.exists(file_name):
        problem = "The file '{}' does not exist!"
    else:
        # e.g. a directory: it exists, but it cannot be opened as a CSV file
        problem = "'{}' is not a regular file!"

    parser.error(problem.format(file_name))
    # argparse's stock error() already terminates the process; this is only
    # a fallback for parser objects whose error() returns.
    sys.exit(1)


def is_valid_csv(parser, file_name, row_limit):
    """
    Ensure that the row_limit can be used to split the input_file.

    The row_limit must be at least 1 and must not exceed the number of
    rows in the input_file. Every row the csv reader yields is counted,
    so the header row is part of that number.

    Arguments:
        parser: the argument parser whose ``error()`` method is used to
            report the problem to the user.
        file_name: path of the CSV file to count.
        row_limit: requested number of rows per split file.

    Returns None when the row_limit is acceptable. Otherwise the problem is
    reported through ``parser.error()`` and the process exits.
    """
    if row_limit < 1:
        # A zero or negative limit cannot be used as a chunk size.
        parser.error(
            "The 'row_limit' of '{}' must be at least 1!".format(row_limit)
        )
        sys.exit(1)

    # Count the rows inside a `with` block so the file handle is closed.
    with open(file_name) as input_csv:
        row_count = sum(1 for _ in csv.reader(input_csv))

    if row_limit > row_count:
        parser.error(
            "The 'row_limit' of '{}' is > the number of rows in '{}'!"
            .format(row_limit, file_name)
        )
        # Fallback for parser objects whose error() returns.
        sys.exit(1)


def _open_csv(path, mode):
    """Open `path` in the file mode the csv module expects on this Python."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline translation switched off, so the
        # csv module controls line endings itself.
        return open(path, mode, newline='')
    # Python 2: the csv module reads and writes binary-mode files.
    return open(path, mode + 'b')


def parse_file(arguments):
    """
    Splits the CSV into multiple files or chunks based on the row_limit.
    Then create new CSV files.

    Arguments:
        arguments: sequence of ``(input_file, output_file, row_limit)``.
            `output_file` is the name prefix of the generated files; each
            chunk is written to ``<output_file>-<chunk number>.csv`` in the
            current directory, with chunk numbers starting at 0.

    The first row of the input is treated as the header and is repeated at
    the top of every chunk. A short summary of each chunk is printed. An
    empty input, or an input holding only a header, produces no files.

    Raises:
        ValueError: if `row_limit` is smaller than 1.
    """
    input_file = arguments[0]
    output_file = arguments[1]
    row_limit = arguments[2]
    output_path = '.'  # Current directory

    if row_limit < 1:
        raise ValueError("row_limit must be a positive integer")

    # Read CSV, split into list of lists
    with _open_csv(input_file, 'r') as input_csv:
        all_rows = list(csv.reader(input_csv))

    if not all_rows:
        return  # Empty input: no header, nothing to split

    header = all_rows[0]
    data_rows = all_rows[1:]

    # Walk the data rows in steps of row_limit, one output file per step
    chunk_starts = range(0, len(data_rows), row_limit)
    for current_chunk, start in enumerate(chunk_starts):
        # Header first, then this chunk's slice of the data rows
        chunk = [header] + data_rows[start:start + row_limit]

        current_output = os.path.join(
            output_path,
            "{}-{}.csv".format(output_file, current_chunk)
        )

        # Write chunk to output file
        with _open_csv(current_output, 'w') as output_csv:
            csv.writer(output_csv).writerows(chunk)

        # Output info (single-argument print() works on Python 2 and 3).
        # The reported row count includes the header row.
        print("")
        print("Chunk # {}:".format(current_chunk))
        print("Filepath: {}".format(current_output))
        print("# of rows: {}".format(len(chunk)))


if __name__ == "__main__":
    # Validate the command line first, then split the file it names.
    parse_file(get_arguments())
csv splitter 2014-05-29 18:06:17 +02:00			`import sys`
			`import os`
			`import csv`
added csv_split 2014-05-30 16:24:44 +02:00			`import argparse`
csv splitter 2014-05-29 18:06:17 +02:00
			`"""`
added csv_split 2014-05-30 16:24:44 +02:00
			`Splits a CSV file into multiple pieces based on command line arguments.`
csv splitter 2014-05-29 18:06:17 +02:00
			`Arguments:`
added csv_split 2014-05-30 16:24:44 +02:00
			`-h`: help file of usage of the script
			`-i`: input file name
			`-o`: output file name
			`-r`: row limit to split
csv splitter 2014-05-29 18:06:17 +02:00
			`Default settings:`
added csv_split 2014-05-30 16:24:44 +02:00
			`output_path` is the current directory
			`headers are displayed on each split file`
			`the default delimeter is a comma`
csv splitter 2014-05-29 18:06:17 +02:00
			`Example usage:`
added csv_split 2014-05-30 16:24:44 +02:00
			```
			`# split csv by every 100 rows`
			`>> python csv_split.py -i input.csv -o output -r 100`
			```

csv splitter 2014-05-29 18:06:17 +02:00			`"""`

added csv_split 2014-05-30 16:24:44 +02:00
			`def get_arguments():`
			`"""Grab user supplied arguments using the argparse library."""`

			`# Use arparse to get command line arguments`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("-i", "--input_file", required=True,`
			`help="csv input file (with extension)", type=str)`
			`parser.add_argument("-o", "--output_file", required=True,`
			`help="csv output file (without extension)", type=str)`
			`parser.add_argument("-r", "--row_limit", required=True,`
			`help="row limit to split csv at", type=int)`
			`args = parser.parse_args()`

			`# Check if the input_file exits`
			`is_valid_file(parser, args.input_file)`

			`# Check if the input_file is valid`
			`is_valid_csv(parser, args.input_file, args.row_limit)`

			`return args.input_file, args.output_file, args.row_limit`


			`def is_valid_file(parser, file_name):`
			`"""Ensure that the input_file exists."""`
			`if not os.path.exists(file_name):`
			`parser.error("The file '{}' does not exist!".format(file_name))`
			`sys.exit(1)`


			`def is_valid_csv(parser, file_name, row_limit):`
			`"""`
			`Ensure that the # of rows in the input_file`
			`is greater than the row_limit.`
			`"""`
			`row_count = 0`
			`for row in csv.reader(open(file_name)):`
			`row_count += 1`
			`# Note: You could also use a generator expression`
			`# and the sum() function to count the rows:`
			`# row_count = sum(1 for row in csv.reader(open(file_name)))`
			`if row_limit > row_count:`
			`parser.error(`
			`"The 'row_count' of '{}' is > the number of rows in '{}'!"`
			`.format(row_limit, file_name)`
csv splitter 2014-05-29 18:06:17 +02:00			`)`
added csv_split 2014-05-30 16:24:44 +02:00			`sys.exit(1)`


			`def parse_file(arguments):`
			`"""`
			`Splits the CSV into multiple files or chunks based on the row_limit.`
			`Then create new CSV files.`
			`"""`
			`input_file = arguments[0]`
			`output_file = arguments[1]`
			`row_limit = arguments[2]`
			`output_path = '.' # Current directory`

			`# Read CSV, split into list of lists`
			`with open(input_file, 'r') as input_csv:`
			`datareader = csv.reader(input_csv)`
			`all_rows = []`
			`for row in datareader:`
			`all_rows.append(row)`

			`# Remove header`
			`header = all_rows.pop(0)`

			`# Split list of list into chunks`
			`current_chunk = 0`
			`for i in range(0, len(all_rows), row_limit): # Loop through list`
			`chunk = all_rows[i:i + row_limit] # Create single chunk`

			`current_output = os.path.join( # Create new output file`
			`output_path,`
			`"{}-{}.csv".format(output_file, current_chunk)`
			`)`

			`# Add header`
			`chunk.insert(0, header)`

			`# Write chunk to output file`
			`with open(current_output, 'w') as output_csv:`
			`writer = csv.writer(output_csv)`
			`writer = writer.writerows(chunk)`

			`# Output info`
			`print ""`
			`print "Chunk # {}:".format(current_chunk)`
			`print "Filepath: {}".format(current_output)`
			`print "# of rows: {}".format(len(chunk))`

			`# Create new chunk`
			`current_chunk += 1`
csv splitter 2014-05-29 18:06:17 +02:00

			`if __name__ == "__main__":`
added csv_split 2014-05-30 16:24:44 +02:00			`arguments = get_arguments()`
			`parse_file(arguments)`