use strict;
use warnings;

use constant DEBUG => 0;

# Accented letter => plain capital letter.  Normalize_Letter consults this
# table for any character that is not already an ASCII letter.  Entries are
# grouped by the capital they fold to.
our $norm = {
  (map { ($_ => q{A}) } q{á}, q{Á}, q{ã}, q{â}),
  (map { ($_ => q{E}) } q{é}),
  (map { ($_ => q{I}) } q{í}),
  (map { ($_ => q{O}) } q{ó}),
  (map { ($_ => q{U}) } q{ú}, q{Ú}),
};

# Reduce one character of a name to a plain capital letter.
#   - A-Z is returned unchanged.
#   - a-z is returned upper-cased.
#   - Anything else is looked up in the $norm table.
# Dies when the character is neither an ASCII letter nor a key of $norm.
sub Normalize_Letter($) {
  my ($letter) = @_;
  return $letter     if $letter =~ m{^[A-Z]$};
  return uc($letter) if $letter =~ m{^[a-z]$};
  my $plain = $norm->{$letter};
  return $plain if defined($plain);
  die(qq{How to normalize? {$letter}});
}

# Return the name with every character replaced by the result of
# Normalize_Letter.  If Normalize_Letter dies, the error is re-thrown with
# the offending name appended; the name is reported as it was passed in.
sub Normalize_Name($) {
  my ($name) = @_;
  eval {
    # The accented letters are listed ahead of the catch-all "." --
    # presumably so that a letter stored as several bytes is handed to
    # Normalize_Letter whole (TODO confirm against the input encoding).
    $name =~ s{(á|Á|ã|â|é|í|ó|ú|Ú|.)}{Normalize_Letter($1)}eg;
  };
  die(qq{$@\nIn name {$name}\n}) if $@;
  return $name;
}

# Print a histogram of name lengths to the currently selected output handle.
# $lines is an array ref of hash refs, each with a "name" entry.  One row is
# printed per length (from 1 upwards) that occurs at least once:
#   length: number of names of that length   running total so far
sub Analyze_Name_Lengths($) {
  my ($lines) = @_;
  my @count_by_length;
  $count_by_length[length($_->{name})]++ foreach @$lines;
  my $running_total = 0;
  foreach my $len (1 .. $#count_by_length) {
    my $count = $count_by_length[$len];
    next unless defined($count);  # no name has this length
    $running_total += $count;
    printf(qq{%2d: %3d   %4d\n}, $len, $count, $running_total);
  }
}

# For each letter of the alphabet, count how many children have that letter
# at the start of their name, elsewhere in their name, or nowhere in it, and
# how many children in each of those groups got the letter right.  The table
# is printed as CSV with CRLF line ends on the currently selected handle.
#
# Arguments:
#   $alphabet    - array ref of letters; entry $i pairs with
#                  $line->{letters}->[$i]
#   $lines       - array ref of hash refs with the keys name, letters and
#                  (read only when $quartile is not "all") propCorrect
#   $name_length - "all" keeps every child; "short" keeps names with
#                  length < 6; any other value keeps names with length > 7
#   $quartile    - "all" keeps every child; "quart1" keeps
#                  propCorrect < .31; any other value keeps
#                  .31 <= propCorrect < .783
# Dies with the child's name when a quartile filter is requested but the
# record has no propCorrect.
sub Analyze_Children($$$$) {
  my($alphabet, $lines, $name_length, $quartile) = @_;
  my $nletters = scalar(@$alphabet);
  if (DEBUG) {print(qq{  $nletters letters in alphabet.\n});}
  my $data = {};
  # letter =>  nowhere =>    {kids_in_group => 0, kids_in_group_OK => 0},
  #            initial =>    {kids_in_group => 0, kids_in_group_OK => 0},
  #            noninitial => {kids_in_group => 0, kids_in_group_OK => 0}}
  my $nSubjects = 0;
  foreach my $line (@$lines) {
    my $name = $line->{name};
    if (DEBUG) {print(qq{  $name\n});}
    if ($name_length ne q{all}) {
      my $length = length($name);
      if ($name_length eq q{short}) {
        next unless $length < 6;
      } else {  # long
        next unless $length > 7;
      }
      if (DEBUG) {print(qq{    Fits length requirement, at $length\n});}
    }
    if ($quartile ne q{all}) {
      my $pc = $line->{propCorrect};
      die($name) unless defined($pc);
      if ($quartile eq q{quart1}) {
        next if $pc >= .31;
      } else {  #quart2
        next if $pc < .31 or $pc >= .783;
      }
    }
    $nSubjects++;
    my $responsa = $line->{letters};
    for (my $i = 0; $i < $nletters; $i++) {
      my $letter = $alphabet->[$i];
      if (DEBUG) {print(qq{    $letter\n});}
      # index() searches for the letter literally, so nothing in the
      # alphabet is ever interpreted as a regex; it returns 0 for a leading
      # match, a positive offset for a later first match, and -1 for none.
      my $where = index($name, $letter);
      my $letter_inclusion = ($where == 0) ? q{initial}
                           : ($where >  0) ? q{noninitial}
                           :                 q{nowhere};
      if (DEBUG) {print(qq{      Found in name $letter_inclusion\n});}
      $data->{$letter}->{$letter_inclusion}->{kids_in_group}++;
      my $response = $responsa->[$i];
      if (DEBUG) {print(qq{      Correct response = $response\n});}
      # A row with fewer responses than letters counts as not correct.
      if (defined($response) and $response > 0) {
        $data->{$letter}->{$letter_inclusion}->{kids_in_group_OK}++;
      }
    }
  }
  # The label is quoted on both sides so that it is a well-formed CSV field.
  print(qq{"$nSubjects subjects:"\r\n});
  print(qq{Letter,Init,"..;",Noninit,"..;",Nowhere,"..;"\r\n});
  print(qq{,All,Correct,All,Correct,All,Correct\r\n});
  foreach my $letter (@$alphabet) {
    print($letter);
    foreach my $pos (qw{initial noninitial nowhere}) {
      # Groups that no child fell into were never created; report them as 0.
      my $pos_counts = $data->{$letter}->{$pos} || {};
      my $all = $pos_counts->{kids_in_group} || 0;
      my $correct = $pos_counts->{kids_in_group_OK} || 0;
      print(qq{,$all,$correct});
    }
    print(qq{\r\n});
  }
}

# Read one comma-separated data file and print the letter tables for every
# combination of performance group (all, quart1, quart2) and name length
# (all, short, long) by calling Analyze_Children.
#
# The first line is the header.  Each column is handled by its header tag:
#   - a tag containing "name" (any case) holds the child's name, which is
#     passed through Normalize_Name;
#   - "Nletters" is ignored;
#   - a single capital letter A-Z is a response column; exactly "1" counts
#     as correct and anything else as incorrect;
#   - "sex" must be 1 (stored as F) or 2 (stored as M);
#   - any other tag is an error.
# Lines without any non-blank character are skipped.
#
# Dies when the file cannot be opened, is empty, has no letter columns in
# its header, has an unknown column tag, or has a bad sex code.
sub Analyze_File($) {
  my $file_name = shift(@_);
  open(my $in, q{<}, $file_name)
    or die(qq{Cannot open $file_name: $!\n});
  my $header = <$in>;
  die(qq{$file_name: file is empty, no header line\n})
    unless defined($header);
  # Strip CR as well as LF: chomp alone leaves the "\r" of a CRLF file on
  # the last field when the file is read without CRLF translation, which
  # hides the final column's tag and corrupts its values.
  $header =~ s{[\r\n]+\z}{};
  my $header_fields = [split(m{,}, $header)];
  my $n_fields = scalar(@$header_fields);
  my $alphabet = [grep { $_ =~ m{^[A-Z]$} } @$header_fields];
  my $nLetters = scalar(@$alphabet);
  # propCorrect divides by the number of letter columns.
  die(qq{$file_name: header has no single-letter (A-Z) columns\n})
    unless $nLetters > 0;
  my $lines = [];
  while (defined(my $line = <$in>)) {
    next unless $line =~ m{\S};
    $line =~ s{[\r\n]+\z}{};
    my $data_fields = [split(m{,}, $line, -1)];
    my $line_data = {letters => []};
    my $nCorrect = 0;
    push(@$lines, $line_data);
    for (my $f = 0; $f < $n_fields; $f++) {
      my $tag = $header_fields->[$f];
      my $value = $data_fields->[$f];
      # A row shorter than the header is treated as having empty fields.
      $value = q{} unless defined($value);
      if ($tag =~ m{name}i) {
        $line_data->{name} = Normalize_Name($value);
      } elsif ($tag eq q{Nletters}) {
      } elsif ($tag =~ m{^[A-Z]$}) {
        my $correct = ($value eq q{1}) ? q{1} : q{0};
        push(@{$line_data->{letters}}, $correct);
        $nCorrect += $correct;
      } elsif ($tag eq q{sex}) {
        my $sexCode;
        if ($value eq q{1}) {$sexCode = q{F};}
        elsif ($value eq q{2}) {$sexCode = q{M};}
        else {die(qq{bad sex: $value});}
        $line_data->{sex} = $sexCode;
      } else {die(qq{$tag: $value});}
    }
    $line_data->{propCorrect} = $nCorrect / $nLetters;
  }
  close($in);
  foreach my $goodness (qw{all quart1 quart2}) {
    print(qq{\r\n"CHILDREN IN $goodness:"\r\n})
      unless $goodness eq q{all};
    foreach my $name_length (qw{all short long}) {
      print(qq{\r\n"For children with $name_length names:"\r\n});
      Analyze_Children($alphabet, $lines, $name_length, $goodness);
    }
  }
}

# Print the report title, then one section per file named in @ARGV.  Each
# section is headed by the file name up to its first dot and filled in by
# Analyze_File.
sub main() {
  print(qq{,"Number of kids with letter in name at initial position, noninitial position only, or nowhere; broken down by how many of them identified the letter correctly."\r\n});
  foreach my $file_name (@ARGV) {
    my($base_name) = $file_name =~ m{^(.+?)\.};
    # A name with no dot after its first character has no match; label the
    # section with the whole file name instead of an undefined value.
    $base_name = $file_name unless defined($base_name);
    print(qq{"$base_name:"\r\n});
    Analyze_File($file_name);
    print(qq{\r\n});
  }
}

# Script entry point: process the files named on the command line.
main();
