#!/usr/bin/perl # Based on ideas from re2 library. use strict; use LWP::Simple; use String::Util qw(hascontent); use Data::Dumper; use Clone qw(clone); my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/}; my $OUTPUT_CPP_FILE = 'case_tables.cpp'; my @lower_case_ranges; my @upper_case_ranges; my @title_case_ranges; my @lower_special_casing; my @upper_special_casing; my @title_special_casing; open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE; generate_intro(); generate_standard_case_tables(); generate_special_casing_tables(); sub generate_intro { print $output_cpp_fh <<'END_OF_INTRO'; // GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT. #include "case_tables.hpp" END_OF_INTRO } sub generate_standard_case_tables { my @unicode_data_lines = download_unidata_file('UnicodeData.txt'); for my $line (@unicode_data_lines) { append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]); append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]); append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]); } @lower_case_ranges = compactify(\@lower_case_ranges); @upper_case_ranges = compactify(\@upper_case_ranges); @title_case_ranges = compactify(\@title_case_ranges); write_case_table('lower_case_ranges', \@lower_case_ranges); print "\n"; write_case_table('upper_case_ranges', \@upper_case_ranges); print "\n"; write_case_table('title_case_ranges', \@title_case_ranges); print "\n"; } sub generate_special_casing_tables { my @special_casing_lines = download_unidata_file('SpecialCasing.txt'); for my $line (@special_casing_lines) { if (hascontent($line->[4])) { print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n"; } else { append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]); append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]); append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]); } } write_special_casing_table('lower_special_casing', \@lower_special_casing); print "\n"; write_special_casing_table('title_special_casing', \@title_special_casing); print "\n"; write_special_casing_table('upper_special_casing', \@upper_special_casing); } sub download_unidata_file { my ($file_name) = @_; my $url = $UNIDATA_PREFIX . $file_name; print STDERR "Downloading ${url}...\n"; my $contents = get($url); return map { [ split/\s*;\s*/ ] } grep { /\S/ } map{ s/\#.*\Z//; $_} split/\r?\n/, $contents; } sub append_to_case_ranges { my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_; if (!hascontent($hex_modified_code_point)) { return; } my $code_point = hex($hex_code_point); my $modified_code_point = hex($hex_modified_code_point); push @{$case_ranges_ref}, [ $code_point, $code_point, delta($code_point, $modified_code_point) ]; } sub compactify { my ($case_ranges_ref) = @_; my @new_table; my $current_compact_range; for my $range (@{$case_ranges_ref}) { if (!defined($current_compact_range)) { $current_compact_range = clone($range); } elsif ($range->[2] eq $current_compact_range->[2] && $range->[0] == $current_compact_range->[1] + 1) { ++$current_compact_range->[1]; } elsif ($range->[2] eq de_skip($current_compact_range->[2]) && $range->[0] == $current_compact_range->[1] + 2) { $current_compact_range->[1] += 2; $current_compact_range->[2] = add_skip($current_compact_range->[2]); } else { push @new_table, $current_compact_range; $current_compact_range = clone($range); } } push @new_table, $current_compact_range; return @new_table; } sub write_case_table { my ($name, $case_ranges_ref) = @_; my $table_name = uc($name); my $size_constant_name = $table_name . "_SIZE"; my $table_size = $#{$case_ranges_ref} + 1; print $output_cpp_fh <<"END_OF_INTRO"; const size_t $size_constant_name = $table_size; const CaseConversionRecord ${table_name}[$size_constant_name] = { END_OF_INTRO my $string_to_prepend = ''; for my $range (@{$case_ranges_ref}) { my $from = $range->[0]; my $to = $range->[1]; my $delta = $range->[2]; print $output_cpp_fh "${string_to_prepend} {$from, $to, $delta}"; $string_to_prepend = ",\n"; } print $output_cpp_fh "\n};\n"; } sub append_to_special_casing_table { my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_; if (!hascontent($hex_code_point_vector)) { return; } my $code_point = hex($hex_code_point); my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector; push $special_casing_table_ref, [$code_point, cpp_encode(@code_point_vector)]; } sub write_special_casing_table { my ($name, $special_casing_table_ref) = @_; my $table_name = uc($name); my $size_constant_name = $table_name . "_SIZE"; my $table_size = $#{$special_casing_table_ref} + 1; print $output_cpp_fh <<"END_OF_INTRO"; const size_t $size_constant_name = $table_size; const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = { END_OF_INTRO my $string_to_prepend = ''; for my $item (@{$special_casing_table_ref}) { my $code_point = $item->[0]; my $replacement = $item->[1]; print $output_cpp_fh "${string_to_prepend} {$code_point, \"$replacement\"}"; $string_to_prepend = ",\n"; } print $output_cpp_fh "\n};\n"; } sub cpp_encode { my (@v) = @_; my $s = join('', map{ chr($_) } @v); return join('', map { "\\x$_" } unpack("U0(H2)*", $s)); } sub de_skip { my ($delta) = @_; if ($delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/) { return $1; } return 'CANNOT_BE_SKIPPED'; } sub add_skip { my ($delta) = @_; return de_skip($delta) . '_SKIP'; } sub delta { my ($a, $b) = @_; if ($a + 1 == $b) { if ($a % 2 == 0) { return 'EVEN_ODD' } else { return 'ODD_EVEN'; } } elsif ($a == $b + 1) { if ($a % 2 == 0) { return 'ODD_EVEN'; } else { return 'EVEN_ODD'; } } return $b - $a; }