concordia-library/utf8case/generate_case_tables.pl

#!/usr/bin/perl

# Based on ideas from re2 library.

use strict;
use LWP::Simple;
use String::Util qw(hascontent);
use Data::Dumper;
use Clone qw(clone);

my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/};
my $OUTPUT_CPP_FILE = 'case_tables.cpp';

my @lower_case_ranges;
my @upper_case_ranges;
my @title_case_ranges;

my @lower_special_casing;
my @upper_special_casing;
my @title_special_casing;

open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE;
generate_intro();
generate_standard_case_tables();
generate_special_casing_tables();

sub generate_intro {
    print $output_cpp_fh <<'END_OF_INTRO';
// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.

#include "case_tables.hpp"

END_OF_INTRO
}

sub generate_standard_case_tables {
    my @unicode_data_lines = download_unidata_file('UnicodeData.txt');

    for my $line (@unicode_data_lines) {
        append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]);
        append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]);
        append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]);
    }

    @lower_case_ranges = compactify(\@lower_case_ranges);
    @upper_case_ranges = compactify(\@upper_case_ranges);
    @title_case_ranges = compactify(\@title_case_ranges);

    write_case_table('lower_case_ranges', \@lower_case_ranges);
    print "\n";

    write_case_table('upper_case_ranges', \@upper_case_ranges);
    print "\n";

    write_case_table('title_case_ranges', \@title_case_ranges);
    print "\n";
}

sub generate_special_casing_tables {
    my @special_casing_lines = download_unidata_file('SpecialCasing.txt');

    for my $line (@special_casing_lines) {
        if (hascontent($line->[4])) {
            print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n";
        } else {
            append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]);
            append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]);
            append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]);
        }
    }

    write_special_casing_table('lower_special_casing', \@lower_special_casing);
    print "\n";

    write_special_casing_table('title_special_casing', \@title_special_casing);
    print "\n";

    write_special_casing_table('upper_special_casing', \@upper_special_casing);
}

sub download_unidata_file {
    my ($file_name) = @_;

    my $url = $UNIDATA_PREFIX . $file_name;

    print STDERR "Downloading ${url}...\n";

    my $contents = get($url);

    return map { [ split/\s*;\s*/ ] }
           grep { /\S/ }
           map{ s/\#.*\Z//; $_}
           split/\r?\n/, $contents;
}

sub append_to_case_ranges {
    my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_;

    if (!hascontent($hex_modified_code_point)) {
        return;
    }

    my $code_point = hex($hex_code_point);
    my $modified_code_point = hex($hex_modified_code_point);

    push @{$case_ranges_ref},
        [ $code_point, $code_point, delta($code_point, $modified_code_point) ];
}

sub compactify {
    my ($case_ranges_ref) = @_;

    my @new_table;

    my $current_compact_range;

    for my $range (@{$case_ranges_ref}) {
        if (!defined($current_compact_range)) {
            $current_compact_range = clone($range);
        } elsif ($range->[2] eq $current_compact_range->[2]
            && $range->[0] == $current_compact_range->[1] + 1) {
            ++$current_compact_range->[1];
        } elsif ($range->[2] eq de_skip($current_compact_range->[2])
            && $range->[0] == $current_compact_range->[1] + 2) {
            $current_compact_range->[1] += 2;
            $current_compact_range->[2] = add_skip($current_compact_range->[2]);
        } else {
            push @new_table, $current_compact_range;
            $current_compact_range = clone($range);
        }
    }

    push @new_table, $current_compact_range;

    return @new_table;
}

sub write_case_table {
    my ($name, $case_ranges_ref) = @_;

    my $table_name = uc($name);
    my $size_constant_name = $table_name . "_SIZE";
    my $table_size = $#{$case_ranges_ref} + 1;


    print $output_cpp_fh <<"END_OF_INTRO";
const size_t $size_constant_name = $table_size;
const CaseConversionRecord ${table_name}[$size_constant_name] = {
END_OF_INTRO

    my $string_to_prepend = '';

    for my $range (@{$case_ranges_ref}) {
        my $from = $range->[0];
        my $to = $range->[1];
        my $delta = $range->[2];

        print $output_cpp_fh "${string_to_prepend}    {$from, $to, $delta}";

        $string_to_prepend = ",\n";
    }


    print $output_cpp_fh "\n};\n";
}

sub append_to_special_casing_table {
    my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_;

    if (!hascontent($hex_code_point_vector)) {
        return;
    }

    my $code_point = hex($hex_code_point);
    my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector;

    push $special_casing_table_ref, [$code_point, cpp_encode(@code_point_vector)];
}

sub write_special_casing_table {
    my ($name, $special_casing_table_ref) = @_;

    my $table_name = uc($name);
    my $size_constant_name = $table_name . "_SIZE";
    my $table_size = $#{$special_casing_table_ref} + 1;

    print $output_cpp_fh <<"END_OF_INTRO";
const size_t $size_constant_name = $table_size;
const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = {
END_OF_INTRO

    my $string_to_prepend = '';

    for my $item (@{$special_casing_table_ref}) {
        my $code_point = $item->[0];
        my $replacement = $item->[1];

        print $output_cpp_fh "${string_to_prepend}    {$code_point, \"$replacement\"}";

        $string_to_prepend = ",\n";
    }

    print $output_cpp_fh "\n};\n";
}

sub cpp_encode {
    my (@v) = @_;

    my $s = join('', map{ chr($_) } @v);

    return join('', map { "\\x$_" } unpack("U0(H2)*", $s));
}

sub de_skip {
    my ($delta) = @_;

    if ($delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/) {
        return $1;
    }

    return 'CANNOT_BE_SKIPPED';
}

sub add_skip {
    my ($delta) = @_;

    return de_skip($delta) . '_SKIP';
}


sub delta {
    my ($a, $b) = @_;

    if ($a + 1 == $b) {
        if ($a % 2 == 0) {
            return 'EVEN_ODD'
        }
        else {
            return 'ODD_EVEN';
        }
    } elsif ($a == $b + 1) {
        if ($a % 2 == 0) {
            return 'ODD_EVEN';
        }
        else {
            return 'EVEN_ODD';
        }
    }

    return $b - $a;
}
utf8 Former-commit-id: fa7407621e839f87613476596c6589aeceb9d796 2014-04-24 11:51:04 +02:00			`#!/usr/bin/perl`

			`# Based on ideas from re2 library.`

			`use strict;`
			`use LWP::Simple;`
			`use String::Util qw(hascontent);`
			`use Data::Dumper;`
			`use Clone qw(clone);`

			`my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/};`
			`my $OUTPUT_CPP_FILE = 'case_tables.cpp';`

			`my @lower_case_ranges;`
			`my @upper_case_ranges;`
			`my @title_case_ranges;`

			`my @lower_special_casing;`
			`my @upper_special_casing;`
			`my @title_special_casing;`

			`open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE;`
			`generate_intro();`
			`generate_standard_case_tables();`
			`generate_special_casing_tables();`

			`sub generate_intro {`
			`print $output_cpp_fh <<'END_OF_INTRO';`
			`// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.`

			`#include "case_tables.hpp"`

			`END_OF_INTRO`
			`}`

			`sub generate_standard_case_tables {`
			`my @unicode_data_lines = download_unidata_file('UnicodeData.txt');`

			`for my $line (@unicode_data_lines) {`
			`append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]);`
			`append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]);`
			`append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]);`
			`}`

			`@lower_case_ranges = compactify(\@lower_case_ranges);`
			`@upper_case_ranges = compactify(\@upper_case_ranges);`
			`@title_case_ranges = compactify(\@title_case_ranges);`

			`write_case_table('lower_case_ranges', \@lower_case_ranges);`
			`print "\n";`

			`write_case_table('upper_case_ranges', \@upper_case_ranges);`
			`print "\n";`

			`write_case_table('title_case_ranges', \@title_case_ranges);`
			`print "\n";`
			`}`

			`sub generate_special_casing_tables {`
			`my @special_casing_lines = download_unidata_file('SpecialCasing.txt');`

			`for my $line (@special_casing_lines) {`
			`if (hascontent($line->[4])) {`
			`print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n";`
			`} else {`
			`append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]);`
			`append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]);`
			`append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]);`
			`}`
			`}`

			`write_special_casing_table('lower_special_casing', \@lower_special_casing);`
			`print "\n";`

			`write_special_casing_table('title_special_casing', \@title_special_casing);`
			`print "\n";`

			`write_special_casing_table('upper_special_casing', \@upper_special_casing);`
			`}`

			`sub download_unidata_file {`
			`my ($file_name) = @_;`

			`my $url = $UNIDATA_PREFIX . $file_name;`

			`print STDERR "Downloading ${url}...\n";`

			`my $contents = get($url);`

			`return map { [ split/\s;\s/ ] }`
			`grep { /\S/ }`
			`map{ s/\#.*\Z//; $_}`
			`split/\r?\n/, $contents;`
			`}`

			`sub append_to_case_ranges {`
			`my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_;`

			`if (!hascontent($hex_modified_code_point)) {`
			`return;`
			`}`

			`my $code_point = hex($hex_code_point);`
			`my $modified_code_point = hex($hex_modified_code_point);`

			`push @{$case_ranges_ref},`
			`[ $code_point, $code_point, delta($code_point, $modified_code_point) ];`
			`}`

			`sub compactify {`
			`my ($case_ranges_ref) = @_;`

			`my @new_table;`

			`my $current_compact_range;`

			`for my $range (@{$case_ranges_ref}) {`
			`if (!defined($current_compact_range)) {`
			`$current_compact_range = clone($range);`
			`} elsif ($range->[2] eq $current_compact_range->[2]`
			`&& $range->[0] == $current_compact_range->[1] + 1) {`
			`++$current_compact_range->[1];`
			`} elsif ($range->[2] eq de_skip($current_compact_range->[2])`
			`&& $range->[0] == $current_compact_range->[1] + 2) {`
			`$current_compact_range->[1] += 2;`
			`$current_compact_range->[2] = add_skip($current_compact_range->[2]);`
			`} else {`
			`push @new_table, $current_compact_range;`
			`$current_compact_range = clone($range);`
			`}`
			`}`

			`push @new_table, $current_compact_range;`

			`return @new_table;`
			`}`

			`sub write_case_table {`
			`my ($name, $case_ranges_ref) = @_;`

			`my $table_name = uc($name);`
			`my $size_constant_name = $table_name . "_SIZE";`
			`my $table_size = $#{$case_ranges_ref} + 1;`


			`print $output_cpp_fh <<"END_OF_INTRO";`
			`const size_t $size_constant_name = $table_size;`
			`const CaseConversionRecord ${table_name}[$size_constant_name] = {`
			`END_OF_INTRO`

			`my $string_to_prepend = '';`

			`for my $range (@{$case_ranges_ref}) {`
			`my $from = $range->[0];`
			`my $to = $range->[1];`
			`my $delta = $range->[2];`

			`print $output_cpp_fh "${string_to_prepend} {$from, $to, $delta}";`

			`$string_to_prepend = ",\n";`
			`}`


			`print $output_cpp_fh "\n};\n";`
			`}`

			`sub append_to_special_casing_table {`
			`my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_;`

			`if (!hascontent($hex_code_point_vector)) {`
			`return;`
			`}`

			`my $code_point = hex($hex_code_point);`
			`my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector;`

			`push $special_casing_table_ref, [$code_point, cpp_encode(@code_point_vector)];`
			`}`

			`sub write_special_casing_table {`
			`my ($name, $special_casing_table_ref) = @_;`

			`my $table_name = uc($name);`
			`my $size_constant_name = $table_name . "_SIZE";`
			`my $table_size = $#{$special_casing_table_ref} + 1;`

			`print $output_cpp_fh <<"END_OF_INTRO";`
			`const size_t $size_constant_name = $table_size;`
			`const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = {`
			`END_OF_INTRO`

			`my $string_to_prepend = '';`

			`for my $item (@{$special_casing_table_ref}) {`
			`my $code_point = $item->[0];`
			`my $replacement = $item->[1];`

			`print $output_cpp_fh "${string_to_prepend} {$code_point, \"$replacement\"}";`

			`$string_to_prepend = ",\n";`
			`}`

			`print $output_cpp_fh "\n};\n";`
			`}`

			`sub cpp_encode {`
			`my (@v) = @_;`

			`my $s = join('', map{ chr($_) } @v);`

			`return join('', map { "\\x$_" } unpack("U0(H2)*", $s));`
			`}`

			`sub de_skip {`
			`my ($delta) = @_;`

			`if ($delta =~ /^(EVEN_ODD\|ODD_EVEN)(?:_SKIP)?$/) {`
			`return $1;`
			`}`

			`return 'CANNOT_BE_SKIPPED';`
			`}`

			`sub add_skip {`
			`my ($delta) = @_;`

			`return de_skip($delta) . '_SKIP';`
			`}`


			`sub delta {`
			`my ($a, $b) = @_;`

			`if ($a + 1 == $b) {`
			`if ($a % 2 == 0) {`
			`return 'EVEN_ODD'`
			`}`
			`else {`
			`return 'ODD_EVEN';`
			`}`
			`} elsif ($a == $b + 1) {`
			`if ($a % 2 == 0) {`
			`return 'ODD_EVEN';`
			`}`
			`else {`
			`return 'EVEN_ODD';`
			`}`
			`}`

			`return $b - $a;`
			`}`