use warnings; use strict; use Spreadsheet::ParseExcel; # perl -CS kanji_plus.pl ..\babymetal_lyrics_2003.xls BABYMETAL > full.html # perl -CS kanji_plus.pl ..\nemophila_lyrics_2003.xls NEMOPHILA > full.html my $filename = $ARGV[0] or die "Must specify filename to parse.\n"; my $ARTIST = $ARGV[1] or die "must specify artist name.\n"; my $parser = Spreadsheet::ParseExcel->new(); my $workbook = $parser->parse( $filename ); my ($kanjicol, %KANJI, %KANJI_text, %KANJI_html, %FURI_html); my ($romajicol, %ROMAJI, %ROMAJI_text); my ($englishcol, %ENGLISH, %ENGLISH_text); my ($SONG, $LINE, @wds, $wd); my (%full_title); my ($song, $line, $max); my (%ROtable, %HGtable, %GHtable, %KKtable,%KJtable, %RKtable); my ($i,$j,$tmp); my $old_fh = select(STDERR); $| = 1; select($old_fh); $old_fh = select(STDOUT); $| = 1; select($old_fh); if ( !defined $workbook ) { die "Parsing error: ", $parser->error(), ".\n"; } open (IN, "<:encoding(UTF-8)", "hiragana_table.txt") or die "Hiragana table"; while() { #print; m/^(\S+) (\S+)/; if (defined($HGtable{$1})) {print "hiragana table duplicate $2 \n";} $HGtable{$1}=uc($2); if (defined($GHtable{uc($2)})) {print "ganahira table duplicate $1 \n";} $GHtable{uc($2)}=$1; } close(IN); open (IN, "<:encoding(UTF-8)", "katakana_table.txt") or die "Katakana table"; while() { m/^(\S+) (\S+)/; if (defined ($KKtable{$1}) ) {print "katakana table duplicate $1 $2 \n";} $KKtable{$1}=uc($2); } close(IN); open (IN, "<:encoding(UTF-8)", "rendaku_table.txt") or die "Rendaku table"; while() { m/^(\S+) (\S+)/; if (defined ($RKtable{"$1$2"}) ) {print "rendaku table duplicate $1 $2\n";} $RKtable{"$1$2"}=uc($2); } close(IN); open (IN, "<:encoding(UTF-8)", "roman_list.txt") or die "Roman table"; while() { s/[\r\n]//g; @wds = split (/\t/, $_); if (defined ($ROtable{$wds[0]}) ) {print "roman table duplicate $wds[0] \n";} #processing for romaji matching; so that larger substrings match first my @rds = split (/,/, uc($wds[1])); for ( $i=0; $i<=$#rds; $i++) { $rds[$i] =~ s/\-\w+$//; $rds[$i]=~s/\s//g; } @{$ROtable{$wds[0]}{"reads"}} = sort { length($b) <=> length($a) || $b cmp $a } @rds; } close(IN); open (IN, "<:encoding(UTF-8)", "kanji_list.txt") or die "Kanji table"; while() { s/[\r\n]//g; @wds = split (/\t/, $_); if (defined($KJtable{$wds[0]})) {print "kanji table duplicate $wds[0] \n";} $KJtable{$wds[0]}{"grade"}=$wds[1]; $KJtable{$wds[0]}{"english"}=$wds[2]; $KJtable{$wds[0]}{"readings"}=$wds[4]; # preserving wikipedia listing #processing for romaji matching; so that larger substrings match first my @rds = split (/,/, uc($wds[4])); for ( $i=0; $i<=$#rds; $i++) { $rds[$i] =~ s/\-\w+$//; $rds[$i]=~s/\s//g; } @{$KJtable{$wds[0]}{"reads"}} = sort { length($b) <=> length($a) || $b cmp $a } @rds; # sorting the list of readings by size, and later stopping at the first match, has the effect of # making the matching greedy. FIXME But there are cases where a short match and a following character # are a better choice than a longer match. There can be duplicate syllables in the ruby. #printf "%s ;%s;%s;",$wds[0], $wds[1], $wds[2], $wds[4]; print @{$KJtable{$wds[0]}{"reads"}},"::::\n"; } close(IN); sub ruby_to_furi { # Textbooks usually write on-readings with katakana and kun-readings with hiragana. # We don't have that information. # An OO will be shown as OU my $furi = $_[0]; my ($wd, @wds); # split after vowels, between N and consonant $furi =~ s/OO/OU/g; $furi =~ s/([AEUIO])/$1 /g; $furi =~ s/(N)[^AEUIO]/$1 /g; #print "R2F $_[0] $furi\n"; @wds = split ( / /, $furi ); $furi = ""; foreach $wd ( @wds ) { $furi .= $GHtable{$wd}; } #print " >>$furi<<\n"; return $furi; } for my $worksheet ( $workbook->worksheets() ) { my $cell; print "Worksheet name: ", $worksheet->get_name(), "\n\n"; $SONG=$worksheet->get_name(); my ( $row_min, $row_max ) = $worksheet->row_range(); my ( $col_min, $col_max ) = $worksheet->col_range(); $kanjicol=-1; $romajicol=-1; $englishcol=-1; for my $col ( $col_min .. $col_max ) { $cell = $worksheet->get_cell( 2, $col ); next unless $cell; #print "row2: ", $cell->value(), "\n"; if ($cell->value() =~ /kanji/) { $kanjicol=$col; } if ($cell->value() =~ /romaji/) { $romajicol=$col; } if ($cell->value() =~ /english/) { $englishcol=$col; } } if (-1==$kanjicol or -1==$romajicol or -1==$englishcol) { if (-1==$kanjicol) {print "NO KANJI COLUMN DEFINED\n"; } if (-1==$romajicol) {print "NO ROMAJI COLUMN DEFINED\n"; } if (-1==$englishcol) {print "NO ENGLISH COLUMN DEFINED\n"; } next; # worksheet } $cell = $worksheet->get_cell(0,0); $full_title{$SONG} = $cell->value(); # quickly scan the kanji column to find the end tag my $rowm; # grr for my $row ( 3 .. $row_max ) { my $cell = $worksheet->get_cell( $row, $kanjicol ); next unless $cell; $_=$cell->value(); last if /__END__/; $rowm = $row; } $row_max = $rowm; # which may or may not have changed # romaji for my $row ( 3 .. $row_max ) { my $cell = $worksheet->get_cell( $row, $romajicol ); next unless $cell; $_=$cell->value(); $LINE=$_; ${ROMAJI_text}{$SONG}[$row] = $LINE; s/[^\-\sa-zA-Z]//g ; # drop most punctuation my %linewds; @wds=split; foreach $wd ( @wds ) { # if the word is all caps, keep caps. else lower case unless ( $wd =~ /^[A-Z]+$/ ) { $wd = lc($wd); } $linewds{$wd}=1; } foreach $wd ( keys %linewds ) { # record unique words per line, not every occurence on a line #push @{$ROMAJI{$wd}{$SONG}}, $LINE push @{$ROMAJI{$wd}{$SONG}}, $row } #print "$LINE\n"; #print " Row, Col = ($row, $col)\n"; #print " Value = ", $cell->value(), "\n"; #print " Unformatted = ", $cell->unformatted(), "\n"; #print "\n"; } # row (LINE) # english for my $row ( 3 .. $row_max ) { my $cell = $worksheet->get_cell( $row, $englishcol ); next unless $cell; $_=$cell->value(); $LINE=$_; if (/[\x{0080}-\x{FFFF}]/ ) { print "non-ascii ", $_,"\n"; } #print $_,"\n"; ${ENGLISH_text}{$SONG}[$row] = $LINE; s/[^\-\s\'a-zA-Z]//g ; # drop most punctuation # keeping apostrophe for contractions #print $_,"\n"; my %linewds; @wds=split; foreach $wd ( @wds ) { # if the word is all caps, keep caps. else lower case unless ( $wd =~ /^[A-Z]+$/ ) { $wd = lc($wd); } #print "setting linewds >$wd<\n"; $linewds{$wd}=1; } foreach $wd ( keys %linewds ) { # record unique words per line, not every occurence on a line #push @{$ENGLISH{$wd}{$SONG}}, $LINE push @{$ENGLISH{$wd}{$SONG}}, $row } #print "$LINE\n"; } # row (LINE) # kanji # This is where the magic happens, because we want to annotate the Japanese characters with the RUBY mechanism # at first, this is to print romaji for each kanji or kana; # this might become a way to handle furigana on the kanji for my $row ( 3 .. $row_max ) { my ($row_html, $ruby, $row_furi); my $cell = $worksheet->get_cell( $row, $kanjicol ); next unless $cell; $_=$cell->value(); $LINE=$_; ${KANJI_text}{$SONG}[$row] = $LINE; # keeping apostrophe for contractions my $romaji_line = ${ROMAJI_text}{$SONG}[$row]; #printf "raw: $romaji_line\n"; $romaji_line =~ s/<.*?>//g; # drop text not in official lyrics $romaji_line =~ s/[^a-zA-Z]//g ; # drop punctuation and spaces $romaji_line = uc($romaji_line); #printf "fix: $romaji_line\n"; $row_html=""; $row_furi=""; # dual-ruby for romaji and english is not all that pretty, except maybe specific browsers # Special cases like wo/o wa/ha oo/ou # geminate consonants in kanji are handled by listing the compound pair. Should there be a different mechanism? # Between Amore and Shine, there are Kanji that work one at a time in Amore, but must be paired in Shine # And Rondo only matches as a group of three... This is not allowed to fail to match, as of yet. # If it's not a group, the next character is H or K, and the matched reading equals a shorter reading plus next kana... non-greedy short match is better @wds = split(//, $_); # really, each character for ($i=0; $i<=$#wds; $i++) { #print "$ruby "; $wd = $wds[$i]; my $group = 0; my $repeating = ""; if ( $wd =~ /[\x{3005}]/ ) { $repeating = $wd; $wd = $wds[$i-1]; } # repeat kanji character if ( $wd =~ /[\x{3400}-\x{4DB5}\x{4E00}-\x{9FCB}\x{F900}-\x{FA6A}]/ ) { # Kanji $j=0; $ruby=""; if (($i+2)<=$#wds && defined($KJtable{"$wds[$i]"."$wds[$i+1]"."$wds[$i+2]"}) ) { #three $wd = $wds[$i] . $wds[$i+1] . $wds[$i+2]; $i++; $i++; $group=3; } elsif (($i+1)<=$#wds && defined($KJtable{"$wds[$i]"."$wds[$i+1]"}) ) { #paired $wd = $wds[$i] . $wds[$i+1]; $i++; $group=2; } #printf "$i KJ %s ", $wd; REDO_READING: if (!defined($KJtable{$wd}) ) { print "unknown Kanji: $wd\n"; } foreach $tmp ( @{$KJtable{$wd}{"reads"}} ) { #print "check $tmp "; # ruby should match beginning of remaining $romaji_line #if ($romaji_line=~/^$tmp(.*)$/ ) { print "match $tmp "; $romaji_line = $1; $j=1; $ruby=$tmp; last; } if ($romaji_line=~/^$tmp(.*)$/ ) { $romaji_line = $1; $j=1; $ruby=$tmp; last; } } # check for rendaku pull leading consonants from romaji_line and tmp; concatenate; lookup # if valid shift, update tmp with romaji; set flag j; ruby=tmp if (!$j && $romaji_line =~ m/^([^AIUEO]+)/ ) { # not matched yet, and line starts with consonant my($a, $b) = ( "", $1 ); foreach $tmp ( @{$KJtable{$wd}{"reads"}} ) { $tmp =~ m/^([^AIUEO]+)/; $a=$1; if ( $RKtable{"$a$b"} ) { my $rktmp = $tmp; $rktmp =~ s/^$a/$b/e; if ($romaji_line=~/^$rktmp(.*)$/ ) { $romaji_line = $1; $j=1; $ruby=$rktmp; print "RK:$rktmp\n"; last; } } } } if (!$j && $group==2) { $i--; $wd=$wds[$i]; $group=0; goto REDO_READING; } # we found a list entry for the pair, but it did not match. try again with first char if ($repeating) { $wd = $repeating; } # put special character back in place, keeping the ruby that was (not) found if ($j) { print "assign $ruby "; #$row_html .= sprintf "%s%s%s", $wd, $ruby, $KJtable{$wd}{"english"}; $row_html .= sprintf "%s%s", $wd, $ruby; $row_furi .= sprintf "%s%s", $wd, ruby_to_furi($ruby); } else { print "romaji match fail :$wd :$romaji_line\n"; #$row_html .= sprintf "%s%s%s", $wd, "??", $KJtable{$wd}{"english"}; $row_html .= sprintf "%s%s", $wd, "??"; $row_furi .= sprintf "%s%s", $wd, "??"; } } elsif ( $wd =~ /[\x{3041}-\x{3096}]/ ) { # Hiragana #printf "$i HG %s ", $wd; $ruby = $HGtable{$wd}; if (defined($HGtable{"$wds[$i]"."$wds[$i+1]"}) ) { #paired $wd = $wds[$i] . $wds[$i+1]; $i++; $ruby = $HGtable{$wd}; } else { # FIXME: try to handle geminate sokuon modifying kanji; flag for kanji processing to handle if ( $wd =~ /\x{3063}/ ) { # geminate consonant tiny tsu # look ahead if (defined($HGtable{"$wds[$i+1]"."$wds[$i+2]"}) ) { #paired $wd = $wds[$i+1] . $wds[$i+2]; $ruby = $HGtable{$wd}; $wd = $wds[$i] . $wd; $i++; # for the 3rd character; but i points at base after tiny tsu } else { # single $ruby = $HGtable{$wds[$i+1]}; $wd = $wds[$i] . $wds[$i+1]; } $ruby = substr($ruby,0,1) . $ruby; # double the leading consonant # and then cch changes to tch (always?) $ruby =~ s/CCH/TCH/; $i++; # for the 2nd character; but i points at the last char of wd } } # whether it is the original single, or one of the paired cases, look ahead for long vowel if ( $wds[$i+1] =~ /[\x{30fc}\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}]/ ) { # long vowel #$wd = "stretch$wd"; $ruby = $ruby . substr($ruby,-1,1); $wd = $wd . $wds[$i+1]; $i++; } #printf "HG %s %s ", $wd, $ruby; # ruby should match beginning of remaining $romaji_line. But let it match first occurrence to correct #if ($romaji_line=~/$ruby(.*)$/ ) { print "match $ruby "; $romaji_line = $1; } if ($romaji_line=~/^$ruby(.*)$/ ) { $romaji_line = $1; } # next up elsif ( $wd =~ /\x{3092}/ && $romaji_line=~/^O(.*)/ ) { $ruby="O"; $romaji_line=$1; } # wo/o elsif ( $wd =~ /\x{306f}/ && $romaji_line=~/^WA(.*)/ ) { $ruby="WA"; $romaji_line=$1; } # ha/wa elsif ( $wd =~ /\x{3046}/ && $romaji_line=~/^O(.*)/ ) { $ruby="O"; $romaji_line=$1; } # u/o elsif ( $wd =~ /\x{3078}/ && $romaji_line=~/^E(.*)/ ) { $ruby="E"; $romaji_line=$1; } # he/e elsif ($romaji_line=~/$ruby(.*)$/ ) { $romaji_line = $1; } # deeper elsif ( $wd =~ /\x{3092}/ && $romaji_line=~/O(.*)$/ ) { $ruby="O"; $romaji_line=$1; } # wo/o but dangerous else { print "romaji match fail :$ruby :$romaji_line\n"; } $row_html .= sprintf "%s%s", $wd, $ruby; $row_furi .= sprintf "%s", $wd; } elsif ( $wd =~ /[\x{30A0}-\x{30FF}]/ ) { # Katakana #printf "$i KK %s ", $wd; $ruby = $KKtable{$wd}; if (defined($KKtable{"$wds[$i]"."$wds[$i+1]"}) ) { # paired $wd = $wds[$i] . $wds[$i+1]; $i++; $ruby = $KKtable{$wd}; } elsif ( $wd =~ /\x{30c3}/ ) { # geminate consonant tiny tsu # FIXME katakana tsu followed by hiragana character # look ahead if (defined($KKtable{"$wds[$i+1]"."$wds[$i+2]"}) ) { #paired $wd = $wds[$i+1] . $wds[$i+2]; $ruby = $KKtable{$wd}; $wd = $wds[$i] . $wd; $i++; # for the 3rd character; but i points at base after tiny tsu } else { $ruby = $KKtable{$wds[$i+1]}; $wd = $wds[$i] . $wds[$i+1]; } # FIXME if followed by mark or space, it is glottal stop. Grr. or even other characters. missing this case $ruby = substr($ruby,0,1) . $ruby; # double the leading consonant # and then cch changes to tch (always?) $ruby =~ s/CCH/TCH/; $i++; } if ( $wds[$i+1] =~ /[\x{30fc}]/ ) { # long vowel #$wd = "stretch$wd"; $ruby = $ruby . substr($ruby,-1,1); $wd = $wd . $wds[$i+1]; $i++; } #printf "KK %s ", $wd; # ruby should match beginning of remaining $romaji_line. But let it match first occurrence to correct #if ($romaji_line=~/$ruby(.*)$/ ) { print "match $ruby "; $romaji_line = $1; } if ($romaji_line=~/^$ruby(.*)$/ ) { $romaji_line = $1; } # next up elsif ( $wd =~ /\x{30a6}/ && $romaji_line=~/^O(.*)/ ) { $ruby="O"; $romaji_line=$1; } # u/o elsif ($romaji_line=~/$ruby(.*)$/ ) { $romaji_line = $1; } # deeper else { print "romaji match fail :$ruby :$romaji_line\n"; } $row_html .= sprintf "%s%s", $wd, $ruby; $row_furi .= sprintf "%s", $wd; } else { $j=0; if (defined($ROtable{"$wds[$i]"."$wds[$i+1]"}) ) { #paired $wd = $wds[$i] . $wds[$i+1]; $i++; } if (defined($ROtable{$wd}) ) { foreach $tmp ( @{$ROtable{$wd}{"reads"}} ) { #print "check $tmp "; # reading should match beginning of remaining $romaji_line if ($romaji_line=~/^$tmp(.*)$/ ) { $romaji_line = $1; $j=1; last; } } if ($j) { #print "assign $ruby "; } else { print "romaji match fail :$wd :$romaji_line\n"; } $row_html .= sprintf "%s", $wd; $row_furi .= sprintf "%s", $wd; } else { #print "?? $wd "; $row_html .= sprintf "%s", $wd; $row_furi .= sprintf "%s", $wd; # if a letter, it should match romaji_line; but let it match later to self-correct after kanji fail $wd=uc($wd); if ($wd=~/[0-9a-zA-Z]/ ) { #if ($romaji_line=~/$wd(.*)$/ ) { print "match $wd "; $romaji_line = $1; } if ($romaji_line=~/^$wd(.*)$/ ) { $romaji_line = $1; } # first try next character elsif ($romaji_line=~/$wd(.*)$/ ) { $romaji_line = $1; } # then try deeper match for correcting else { print "romaji match fail :$wd :$romaji_line\n"; } } else { print "skip $wd\n"; } } } } #print "\n"; ${KANJI_html}{$SONG}[$row] = $row_html; ${FURI_html}{$SONG}[$row] = $row_furi; my %linewds; while ( $LINE =~ /(.)/g ) { $wd = $1; # if the character is kanji (single kanji characters at a time) if ( $wd =~ /[\x{3400}-\x{4DB5}\x{4E00}-\x{9FCB}\x{F900}-\x{FA6A}]/ ) { $linewds{$wd}=1; } } foreach $wd ( keys %linewds ) { # record unique words per line, not every occurence on a line #push @{$ENGLISH{$wd}{$SONG}}, $LINE push @{$KANJI{$wd}{$SONG}}, $row } print "$LINE\n"; } # row (LINE) } # worksheet (SONG) sub start_file { my($which) = shift; local(*FH); open (FH, ">:encoding(UTF-8)", "$which.html") or die "can't open $which.html"; # this went in table, but width=device-width might be better for actual mobile # my($width_string) = ""; if ($which eq "Mobile_Songs") {$width_string="width: 500px;";} # $width_string print FH " $ARTIST concordance $which "; return *FH; } sub end_file { # filehandle local(*FH) = shift; print FH " "; close (FH); } PRINT_ENGLISH: local(*ENG) = start_file("English"); print ENG "

English words

\n"; foreach $wd ( sort keys %ENGLISH ) { print ENG "

: $wd

\n"; print ENG "\n"; foreach $song ( sort keys %{$ENGLISH{$wd}} ) { print ENG "\n"; foreach $line ( @{$ENGLISH{$wd}{$song}} ) { printf ENG ""; printf ENG "\n"; printf ENG "\n"; printf ENG " \n"; } } print ENG "

$song
$ENGLISH_text{$song}[$line]	$ROMAJI_text{$song}[$line]

\n"; } end_file(*ENG); PRINT_ROMAJI: local(*ROM) = start_file("Romaji"); print ROM "

Romaji words

\n"; #foreach $wd ( sort keys %ROMAJI ) { # put all caps at the end my %ROMAJI_sort; foreach $wd ( keys %ROMAJI ) { my $tmp=$wd; $tmp=~s/([A-Z])/\|$1/; $ROMAJI_sort{$wd} = $tmp; } foreach $wd ( map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [ $_, $ROMAJI_sort{$_} ] } keys %ROMAJI ) { print ROM "

: $wd

\n"; print ROM "\n"; foreach $song ( sort keys %{$ROMAJI{$wd}} ) { print ROM "\n"; foreach $line ( @{$ROMAJI{$wd}{$song}} ) { printf ROM ""; printf ROM "\n"; printf ROM "\n"; printf ROM "\n"; printf ROM " \n"; } } print ROM "

$song
$ROMAJI_text{$song}[$line]	$ENGLISH_text{$song}[$line]	$KANJI_html{$song}[$line]

\n"; } end_file(*ROM); PRINT_KANJI: local(*KJI) = start_file("Kanji"); print KJI "

Kanji characters

\n"; # foreach $wd ( sort keys %KANJI ) { # by unicode? # # sort by reading: foreach $wd ( map { $_->[0] } sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] } map { [$_, $KJtable{$_}{"readings"} ] } keys %KANJI ) { #print KJI "

$wd

\n"; print KJI "

: $wd -- ", $KJtable{$wd}{"grade"}, " -- ", $KJtable{$wd}{"english"}, " -- ", $KJtable{$wd}{"readings"}, "

\n"; print KJI "\n"; foreach $song ( sort keys %{$KANJI{$wd}} ) { print KJI "\n"; foreach $line ( @{$KANJI{$wd}{$song}} ) { printf KJI ""; printf KJI "\n"; printf KJI "\n"; printf KJI "\n"; printf KJI " \n"; } } print KJI "

$song
$KANJI_html{$song}[$line]	$ROMAJI_text{$song}[$line]	$ENGLISH_text{$song}[$line]

\n"; } end_file(*KJI); PRINT_SONG: local(*SONG) = start_file("Songs"); open(TOC, "<:encoding(UTF-8)", "TOC_$ARTIST.txt") or die "can't find TOC_$ARTIST.txt for MSONG"; while() { print SONG; } close(TOC); foreach $song ( sort { $full_title{$a} cmp $full_title{$b} } keys %KANJI_text ) { # foreach $song ( sort keys %KANJI_text ) { print SONG "

$full_title{$song}

\n"; print SONG "\n"; $max = @{$KANJI_text{$song}}; for ($line=4; $line<$max; $line++) { #if( $KANJI_html{$song}[$line] =~ /\?\?/ ) { #if( $KANJI_html{$song}[$line] =~ /[\x{3005}\x{3063}]/ ) { printf SONG " \n"; printf SONG "\n"; printf SONG "\n"; printf SONG "\n"; printf SONG " \n"; #} } print SONG "

$line

$KANJI_html{$song}[$line]

$ROMAJI_text{$song}[$line]

$ENGLISH_text{$song}[$line]

\n"; print SONG "Return to Table of Contents\n"; } end_file(*SONG); PRINT_MSONG: local(*MSONG) = start_file("Mobile_Songs"); open(TOC, "<:encoding(UTF-8)", "TOC_$ARTIST.txt") or die "can't find TOC_$ARTIST.txt for MSONG"; while() { print MSONG; } close(TOC); foreach $song ( sort { $full_title{$a} cmp $full_title{$b} } keys %KANJI_text ) { # foreach $song ( sort keys %KANJI_text ) { print MSONG "

$full_title{$song}

\n"; print MSONG "\n"; $max = @{$KANJI_text{$song}}; for ($line=4; $line<$max; $line++) { #if( $KANJI_html{$song}[$line] =~ /\?\?/ ) { #if( $KANJI_html{$song}[$line] =~ /[\x{3005}\x{3063}]/ ) { printf MSONG " \n"; printf MSONG "\n"; printf MSONG " \n"; #} } print MSONG "

$line

$KANJI_html{$song}[$line]\n"; printf MSONG "
$ROMAJI_text{$song}[$line]\n"; printf MSONG "
$ENGLISH_text{$song}[$line]

\n"; print MSONG "Return to Table of Contents\n"; } end_file(*MSONG); PRINT_KANJISONG: local(*KJSONG) = start_file("Kanji_only_Songs"); foreach $song ( sort keys %KANJI_text ) { print KJSONG "

$full_title{$song}

\n"; print KJSONG "\n"; $max = @{$KANJI_text{$song}}; for ($line=4; $line<$max; $line++) { printf KJSONG " \n"; #printf KJSONG "\n"; printf KJSONG "\n"; printf KJSONG " \n"; } print KJSONG "

$line

$KANJI_html{$song}[$line]

$FURI_html{$song}[$line]

\n"; } end_file(*KJSONG); PRINT_ROMAJI_F: local(*ROM) = start_file("Romaji_F"); print ROM "

Romaji words

\n"; #foreach $wd ( sort keys %ROMAJI ) { # put all caps at the end undef %ROMAJI_sort; my %ROMAJI_sort; foreach $wd ( keys %ROMAJI ) { my $tmp=$wd; $tmp=~s/([A-Z])/\|$1/; $ROMAJI_sort{$wd} = $tmp; } foreach $wd ( map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [ $_, $ROMAJI_sort{$_} ] } keys %ROMAJI ) { print ROM "

: $wd

$song
$ROMAJI_text{$song}[$line]	$ENGLISH_text{$song}[$line]	$FURI_html{$song}[$line]

\n"; } end_file(*ROM); PRINT_KANJI_F: local(*KJI) = start_file("Kanji_F"); print KJI "

Kanji characters

$wd

\n"; print KJI "

: $wd -- ", $KJtable{$wd}{"grade"}, " -- ", $KJtable{$wd}{"english"}, " -- ", $KJtable{$wd}{"readings"}, "

$song
$FURI_html{$song}[$line]	$ROMAJI_text{$song}[$line]	$ENGLISH_text{$song}[$line]

\n"; } end_file(*KJI); PRINT_SONG_F: local(*SONG) = start_file("Songs_F"); open(TOC, "<:encoding(UTF-8)", "TOC_$ARTIST.txt") or die "can't find TOC_$ARTIST.txt for SONG"; while() { print SONG; } close(TOC); foreach $song ( sort { $full_title{$a} cmp $full_title{$b} } keys %KANJI_text ) { # foreach $song ( sort keys %KANJI_text ) { print SONG "

$full_title{$song}

$line

$FURI_html{$song}[$line]

$ROMAJI_text{$song}[$line]

$ENGLISH_text{$song}[$line]

\n"; print SONG "Return to Table of Contents\n"; } end_file(*SONG); PRINT_ALL: local(*ALL) = start_file("All"); foreach $song ( sort { $full_title{$a} cmp $full_title{$b} } keys %KANJI_text ) { # foreach $song ( sort keys %KANJI_text ) { print ALL "

$full_title{$song}

\n"; print ALL "\n"; $max = @{$KANJI_text{$song}}; for ($line=4; $line<$max; $line++) { #if( $KANJI_html{$song}[$line] =~ /\?\?/ ) { #if( $KANJI_html{$song}[$line] =~ /[\x{3005}\x{3063}]/ ) { printf ALL " \n"; printf ALL "\n"; printf ALL "\n"; printf ALL "\n"; printf ALL "\n"; printf ALL " \n"; #} } print ALL "

$line

$FURI_html{$song}[$line]

$KANJI_html{$song}[$line]

$ROMAJI_text{$song}[$line]

$ENGLISH_text{$song}[$line]

\n"; } end_file(*ALL); $i = keys %ENGLISH; print "Unique words in English text ", $i, "\n"; $i=0; $j=0; foreach $wd (keys %ROMAJI) { if ($wd =~ /^[a-z\-]+$/ ) { # [unicode fail] $i++; } else { $j++; } } print "Unique Japanese words in Romaji text ", $i, "\n"; print "Unique English words in Romaji text ", $j, "\n"; my %grades; foreach $wd ( keys %KANJI ) { $grades{$KJtable{$wd}{"grade"}}++; } print "Number of Kanji of each grade level:\n"; foreach $wd ( sort keys %grades ) { print "$wd $grades{$wd}\n"; }