#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Normalize.
#
# Usage:
#    <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input files:
#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
#
# Output files:
#    unfcan.h
#    unfcpt.h
#    unfcmb.h
#    unfcmp.h
#    unfexc.h
#

use warnings;
use File::Spec;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

our $PACKAGE = 'Unicode::Normalize, mkheader';

our $Combin = do "unicore/CombiningClass.pl"
    || do "unicode/CombiningClass.pl"
    || die "$PACKAGE: CombiningClass.pl not found";

our $Decomp = do "unicore/Decomposition.pl"
    || do "unicode/Decomposition.pl"
    || die "$PACKAGE: Decomposition.pl not found";

our %Combin;	# $codepoint => $number    : combination class
our %Canon;	# $codepoint => \@codepoints : canonical decomp.
our %Compat;	# $codepoint => \@codepoints : compat. decomp.
# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
our %Exclus;	# $codepoint => 1          : composition exclusions
our %Single;	# $codepoint => 1          : singletons
our %NonStD;	# $codepoint => 1          : non-starter decompositions

our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
our %CompList;	# $listname,$2nd  => $codepoint : composite

our $prefix = "UNF_";
our $structname = "$($prefix)complist";

########## definition of Hangul constants ##########
use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount =>  11172; # LCount * NCount
use constant NCount =>    588; # VCount * TCount
use constant LBase  => 0x1100;
use constant LFinal => 0x1112;
use constant LCount =>     19;
use constant VBase  => 0x1161;
use constant VFinal => 0x1175;
use constant VCount =>     21;
use constant TBase  => 0x11A7;
use constant TFinal => 0x11C2;
use constant TCount =>     28;

sub decomposeHangul {
    my $SIndex = @_[0] - SBase;
    my $LIndex = int( $SIndex / NCount);
    my $VIndex = int(($SIndex % NCount) / TCount);
    my $TIndex =      $SIndex % TCount;
    my @ret = @(
       LBase() + $LIndex,
       VBase() + $VIndex,
      $TIndex ?? (TBase() + $TIndex) !! (),
    );
    return @ret;
}

########## getting full decomposion ##########
do {
    my($f, $fh);
    foreach my $d ( $^INCLUDE_PATH) {
	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
	last if open($fh, "<", $f);
	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
	last if open($fh, "<", $f);
	$f = undef;
    }
    die "$PACKAGE: neither unicore/CompositionExclusions.txt "
	. "nor unicode/CompExcl.txt is found in $(join ' ',$^INCLUDE_PATH)" unless defined $f;

    while ( ~< $fh) {
	next if m/^#/ or m/^$/;
	s/#.*//;
	%Exclus{+hex($1) } = 1 if m/([0-9A-Fa-f]+)/;
    }
    close $fh;
};

##
## converts string "hhhh hhhh hhhh" to a numeric list
##
sub _getHexArray { return map { hex }, @( @_[0] =~ m/([0-9A-Fa-f]+)/g) }

while ($Combin =~ m/(.+)/g) {
    my @tab = split m/\t/, $1;
    my $ini = hex @tab[0];
    if (@tab[1] eq '') {
	%Combin{+$ini } = @tab[2];
    } else {
	%Combin{+$_ } = @tab[2] foreach $ini .. hex(@tab[1]);
    }
}

while ($Decomp =~ m/(.+)/g) {
    my @tab = split m/\t/, $1;
    my $compat = @tab[2] =~ s/<[^>]+>//;
    my $dec = \ _getHexArray(@tab[2]); # decomposition
    my $ini = hex(@tab[0]); # initial decomposable character

    my $listname =
	(nelems @$dec) == 2 ?? sprintf("$($structname)_\%06x", $dec->[0]) !! 'USELESS';
		# %04x is bad since it'd place _3046 after _1d157.

    if (@tab[1] eq '') {
	%Compat{+$ini } = $dec;

	if (! $compat) {
	    %Canon{+$ini } = $dec;

	    if ((nelems @$dec) == 2) {
		if (%Combin{?$dec->[0] }) {
		    %NonStD{+$ini } = 1;
		} else {
		    %CompList{ + $listname }->{+$dec->[1] } = $ini;
		    %Comp1st{+$dec->[0] } = $listname;
		    %Comp2nd{+$dec->[1] } = 1 if ! %Exclus{?$ini};
		}
	    } elsif ((nelems @$dec) == 1) {
		%Single{+$ini } = 1;
	    } else {
		die("Weird Canonical Decomposition of U+@tab[0]");
	    }
	}
    } else {
	foreach my $u ($ini .. hex(@tab[1])) {
	    %Compat{+$u } = $dec;

	    if (! $compat) {
		%Canon{+$u } = $dec;

		if ((nelems @$dec) == 2) {
		    if (%Combin{?$dec->[0] }) {
			%NonStD{+$u } = 1;
		    } else {
			%CompList{ $listname }->{+$dec->[1] } = $u;
			%Comp1st{+$dec->[0] } = $listname;
			%Comp2nd{+$dec->[1] } = 1 if ! %Exclus{?$u};
		    }
		} elsif ((nelems @$dec) == 1) {
		    %Single{+$u } = 1;
		} else {
		    die("Weird Canonical Decomposition of U+@tab[0]");
		}
	    }
	}
    }
}

# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
foreach my $j ( @(<0x1161..0x1175, < 0x11A8..0x11C2)) {
    %Comp2nd{+$j} = 1;
}

sub getCanonList {
    my @src = @_;
    my @dec = @+: map {
	(SBase +<= $_ && $_ +<= SFinal) ?? decomposeHangul($_)
	    !! %Canon{?$_} ?? @{ %Canon{?$_} } !! @($_)
		}, @src;
    return join(" ", @src) eq join(" ", @dec) ?? @dec !! getCanonList(< @dec);
    # condition @src == @dec is not ok.
}

sub getCompatList {
    my @src = @_;
    my @dec = @+: map {
	(SBase +<= $_ && $_ +<= SFinal) ?? decomposeHangul($_)
	    !! %Compat{?$_} ?? @{ %Compat{?$_} } !! @($_)
		}, @src;
    return join(" ", @src) eq join(" ", @dec) ?? @dec !! getCompatList(< @dec);
    # condition @src == @dec is not ok.
}

# exhaustive decomposition
foreach my $key (keys %Canon) {
    %Canon{+$key}  = \ getCanonList($key);
}

# exhaustive decomposition
foreach my $key (keys %Compat) {
    %Compat{+$key} = \ getCompatList($key);
}

sub _pack_U {
    return pack('U*', < @_);
}

sub split_into_char {
    use bytes;
    my $uni = shift;
    my $len = length($uni);
    my @ary;
    for my $i (0..$len-1) {
	push @ary, ord(substr($uni,$i,1));
    }
    return @ary;
}

sub _U_stringify {
    sprintf '"%s"', join '', map { sprintf("\\x\%02x", $_) }, split_into_char( _pack_U(< @_));
}

foreach my $hash (@(\%Canon, \%Compat)) {
    foreach my $key (keys %$hash) {
	$hash->{+$key} = _U_stringify( < @{ $hash->{$key} } );
    }
}

########## writing header files ##########

my @boolfunc = @(
    \%(
	name => "Exclusion",
	type => "bool",
	hash => \%Exclus,
    ),
    \%(
	name => "Singleton",
	type => "bool",
	hash => \%Single,
    ),
    \%(
	name => "NonStDecomp",
	type => "bool",
	hash => \%NonStD,
    ),
    \%(
	name => "Comp2nd",
	type => "bool",
	hash => \%Comp2nd,
    ),
);

my $file = "unfexc.h";
open my $fh, ">", "$file" or die "$PACKAGE: $file can't be made";
binmode $fh;

    print $fh, << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

foreach my $tbl ( @boolfunc) {
    my @temp = sort {$a <+> $b}, keys %{$tbl->{?hash}};
    my $type = $tbl->{?type};
    my $name = $tbl->{?name};
    print $fh, "$type is$name (UV uv)\n\{\nreturn\n\t";

    while ((nelems @temp)) {
	my $cur = shift @temp;
	if ((nelems @temp) && $cur + 1 == @temp[0]) {
	    print $fh, "($cur <= uv && uv <= ";
	    while ((nelems @temp) && $cur + 1 == @temp[0]) {
		$cur = shift @temp;
	    }
	    print $fh, "$cur)";
	    print $fh, "\n\t|| " if (nelems @temp);
	} else {
	    print $fh, "uv == $cur";
	    print $fh, "\n\t|| " if (nelems @temp);
	}
    }
    print $fh, "\n\t? TRUE : FALSE;\n\}\n\n";
}

close $fh;

####################################

my $compinit =
    "typedef struct \{ UV nextchar; UV composite; \} $structname;\n\n";

foreach my $i (sort keys %CompList) {
    $compinit .= "$structname $i [] = \{\n";
    $compinit .= join ",\n", map { sprintf("\t\{ \%d, \%d \}", $_, %CompList{$i}->{?$_}) },
	    sort { $a <+> $b }, keys %{ %CompList{?$i} };
    $compinit .= ",\n\{0,0\}\n\};\n\n"; # with sentinel
}

my @tripletable = @(
    \%(
	file => "unfcmb",
	name => "combin",
	type => "STDCHAR",
	hash => \%Combin,
	null =>  0,
    ),
    \%(
	file => "unfcan",
	name => "canon",
	type => "char*",
	hash => \%Canon,
	null => "NULL",
    ),
    \%(
	file => "unfcpt",
	name => "compat",
	type => "char*",
	hash => \%Compat,
	null => "NULL",
    ),
    \%(
	file => "unfcmp",
	name => "compos",
	type => "$structname *",
	hash => \%Comp1st,
	null => "NULL",
	init => $compinit,
    ),
);

foreach my $tbl ( @tripletable) {
    my $file = "$tbl->{?file}.h";
    my $head = "$($prefix)$tbl->{?name}";
    my $type = $tbl->{?type};
    my $hash = $tbl->{?hash};
    my $null = $tbl->{?null};
    my $init = $tbl->{?init};

    open my $fh, ">", "$file" or die "$PACKAGE: $file can't be made";
    binmode $fh;
    my %val;

    print $fh ,<< 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

    print $fh, $init if defined $init;

    foreach my $uv (keys %$hash) {
	die sprintf("a Unicode code point 0x\%04X over 0x10FFFF.", $uv)
	    unless $uv +<= 0x10FFFF;
	my @c = @( unpack 'CCCC', pack 'N', $uv );
	%val{ + @c[1] }->{ + @c[2] }->{+@c[3] } = $hash->{?$uv};
    }

    foreach my $p (sort { $a <+> $b }, keys %val) {
	next if ! %val{?$p };
	for my $r (0..255) {
	    next if ! %val{ $p }->{?$r };
	    printf $fh, "static $type $($head)_\%02x_\%02x [256] = \{\n", $p, $r;
	    for my $c (0..255) {
		print $fh, "\t", defined %val{$p}->{$r}->{?$c}
		    ?? "($type)".%val{$p}->{$r}->{?$c}
		    !! $null;
		print $fh, ','  if $c != 255;
		print $fh, "\n" if $c % 8 == 7;
	    }
	    print $fh, "\};\n\n";
	}
    }
    foreach my $p (sort { $a <+> $b }, keys %val) {
	next if ! %val{?$p };
	printf $fh, "static $type* $($head)_\%02x [256] = \{\n", $p;
	for my $r (0..255) {
	    print $fh, %val{ $p }->{?$r }
		?? sprintf("$($head)_\%02x_\%02x", $p, $r)
		!! "NULL";
	    print $fh, ','  if $r != 255;
	    print $fh, "\n" if %val{ $p }->{?$r } || ($r+1) % 8 == 0;
	}
	print $fh, "\};\n\n";
    }
    print $fh, "static $type** $head [] = \{\n";
    for my $p (0..0x10) {
	print $fh, %val{?$p } ?? sprintf("$($head)_\%02x", $p) !! "NULL";
	print $fh, ','  if $p != 0x10;
	print $fh, "\n";
    }
    print $fh, "\};\n\n";
    close $fh;
}

1;
__END__
