use strict;
use warnings;

our $vowelRE = qr{aɪ|aʊ|ɔɪ|ɑ|æ|e|ɛ|ə|ɚ|i|ɪ|o|ɔ|u|ʊ};
our $consonantRE = qr{dʒ|ð|ɡ|ŋ|ʃ|tʃ|ʒ|θ|[bdfhjklmnprstvwz]};
our $polysyllabicRE = qr{$vowelRE .*$vowelRE};
our $shortVowelRE = qr{ɑ|ɪ|ɛ|æ|ə|ʊ};
our $longVowelRE = qr{aɪ|aʊ|ɔɪ|ɚ|e|i|o|ɔ|u};
our $palatalVowelRE = qr{aɪ|ɪ|ɛ|i};
our $palatalVowelLetterRE = qr{[EIY]};
our $nonpalatalVowelRE = qr{aʊ|ɔɪ|ɑ|æ|e|ə|ɚ|o|ɔ|u|ʊ};
our $nonpalatalVowelLetterRE = qr{[AOU]};
our $stressRE = qr{ˈ|ˌ};
our $pronRE = qr{^((?:$stressRE|$vowelRE|$consonantRE) )*$};

our $spell;
our $pron;
our $alignment;
our $case;  # eq. "Final,/k/"
our $nSylls; # number of syllables
our $condVPr; # is vowel pronunciation conditioning: short vowels for finals,
              # palatalizing vowels for initial /k/
our $narrow; # is vowel pronunciation limited to those used in experiment?
our $condVSp;  # is vowel spelling conditioning: 
              # 1 vowel (no final E) for finals,
              # [EIY] for initial /k/
our $consSpelling;  # "result": condC (conditioned spelling) | 
                    #  typicC (typical spelling) | oddC (other)
our $initialStress = qr{^(?:$consonantRE )*ˈ};
our $finalStress = qr{ˈ $vowelRE (?:$consonantRE )*};

our $initialWords = {};
our $finalWords = {};

sub printOut($) {
  my $table = shift(@_);
  return if exists($table->{$spell});
  my $pron2 = $pron;
  $pron2 =~ s{ }{}g;
  my $string = 
    $case
  . q{,} . (($condVPr) ? q{condVPr} : q{typVPr})
  . q{,} . (($narrow) ? q{strict} : q{univ})
  . q{,} . $nSylls
  . q{,} . (($condVSp) ? q{condVSp} : q{typVSp})
  . q{,} . $consSpelling
  . q{,-,-,-,-,-}
  . q{,} . $spell
  . q{,} . $pron2;
  $table->{$spell} = $string;
}

sub tryInitial() {
  if ($alignment =~ m{^([^=]+)=k ([^=]+)=ˈ\+($vowelRE) }) {
    my $consSpell = $1;
    my $vowelSpell = $2;
    my $vowelPron = $3;
    $case = q{initial,/k/};
    if ($vowelPron =~ m{^$palatalVowelRE$}) {
      $condVPr = 1;
      $narrow = ($vowelPron =~ m{^(?:ɪ|ɛ)$});    
    } else {
      $condVPr = 0;
      $narrow = ($vowelPron =~ m{^(?:æ|ə|ɑ)$});    
    }
    $condVSp = $vowelSpell =~ m{^[EIY]};
    if ($consSpell =~ m{^K$}) {
      $consSpelling = q{condC};
    } elsif ($consSpell =~ m{^C$}) {
      $consSpelling = q{typicC};
    } else {
      $consSpelling = q{oddC};
    }
    printOut($initialWords);
  }
}

our $shortVowels = {
  q{tʃ} => qr{^(?:æ|ə|ɛ)$},
  q{l} => qr{^ə$},
  q{f} => qr{^æ$},
  q{k} => qr{^(?:æ|ə|ɛ)$},
};
our $longVowels = {
  q{tʃ} => qr{^[iuo]$},
  q{l} => qr{^[iu]$},
  q{f} => qr{^[euo]$},
  q{k} => qr{^[eiou]$},
};
our $typicalSpelling = {
  q{tʃ} => qr{^CH$},
  q{l} => qr{^L$},
  q{f} => qr{^F$},
  q{k} => qr{^(?:C|K)$},
};
our $conditionedSpelling = {
  q{tʃ} => qr{^TCH$},
  q{l} => qr{^LL$},
  q{f} => qr{^FF$},
  q{k} => qr{^CK$},
};

sub tryFinal() {
  if ($alignment =~ 
      m{(?: |^)([^=]+)=ˈ\+($vowelRE) ([^ E=]+)(E?)=(tʃ|[fkl])$}) {
    my $vowelSpell = $1;
    my $vowelPron = $2;
    my $consSpell = $3;
    my $vowelSpellE = $4;
    my $consPron = $5;
    $case = qq{final,/$consPron/};
    if ($vowelPron =~ m{^$shortVowelRE$}) {
      $condVPr = 1;
      $narrow = $vowelPron =~ $shortVowels->{$consPron};
    } else {
      $condVPr = 0;
      $narrow = $vowelPron =~ $longVowels->{$consPron};
    }
    $condVSp = length($vowelSpell) == 1 && !$vowelSpellE;
    $consSpelling = ($consSpell =~ $typicalSpelling->{$consPron}) 
    ? q{typicC}
    : (($consSpell =~ $conditionedSpelling->{$consPron}) 
      ? q{condC} : q{oddC});
    printOut($finalWords);
  }
}

sub filterCMU() {
  open(my $in, q{<}, q{/home/bkessler/RAT/data/CMU/db/alignments}) or die();
  my $header = <$in>;
  while (defined(my $line = <$in>)) {
    chomp($line);
    ($spell, $pron, $alignment) = split(m{\t}, $line);
    next unless $spell =~ m{^[A-Z]+$};
    die($pron) unless $pron =~ $pronRE;
    $nSylls = 0;
    while ($pron =~ m{$vowelRE }g) {$nSylls++;}
    if ($pron =~ $initialStress) {
      tryInitial();
    }
    if ($pron =~ $finalStress) {
      tryFinal();
    }
  }
  close($in);
}

sub printWordsInTable($) {
  my $table = shift(@_);
  foreach my $entry (sort(values(%$table))) {
    print($entry, qq{\n});
  }
}

sub printWords() {
  print(q{"Children Use Vowels: Data from CMU and Zeno"
"end: which end of the word is the target consonant at?"
"  final = looking at last consonant in word"
"  initial = looking at first consonant in word"
"Cpr: the consonant whose spelling we're looking at"
"condVPr: does the vowel phoneme next to the target consonant condition the "
"     special (conditioned) spelling?"
"  condVPr = yes:"
"     in final cases, short vowels (ɑ|ɪ|ɛ|æ|ə|ʊ)"
"     in initial case, (aɪ|ɪ|ɛ|i)"
"  typVPr = no:"
"     in final cases, long vowels (aɪ|aʊ|ɔɪ|ɚ|e|i|o|ɔ|u);"
"     in initial case, (aʊ|ɔɪ|ɑ|æ|e|ə|ɚ|o|ɔ|u|ʊ)"
"strict: was the vowel phoneme next to the target consonant used in the behavioural experiment?"
"  strict = yes"
"  univ = no"
"sylls = number of syllables in the word"
"condVSp: does the vowel LETTER next to the target consonant condition the "
"    special (conditioned) spelling?"
"  condVSp = yes: "
"    in final cases, there is exactly one vowel letter, no final E;"
"    in initial case, one of (E|I|Y)"
"  typVSp = no:"
"    in final cases, there is more than one vowel letter, and/or final E;"
"    in initial case, one of (A|O|U)"
"condC: is the target consonant spelled with the special (conditioned) spelling?"
"  condC = yes: "
"    in final cases, the extended spelling (TCH, LL, FF, CK)"
"    in initial case, K"
"  typicC = no, a typical spelling was used:"
"    in final cases, unextended CH, L, F, C or K"
"    in initial case, C"
"  oddC = no, some other spelling was used"
"freqN: frequency of the word in Zeno et al Word Frequency Guide; - means not found in Zeno"
"freq2: frequency of the word in Zeno grade 2 corpus; .3 replaces original -"
"freq3: same, for grade 3 corpus"
"freq5: same, for grade 5 corpus"
"freq13: same, for grade 13 (post-HS) corpus"
"spell: spelling"
"pron: pronunciation of entire word"
});
  print(qq{end,Cpr,condVPr,strict,sylls,condVSp,condC,freqN,freq2,freq3,freq5,freq13,spell,pron\n});
  printWordsInTable($finalWords);
  printWordsInTable($initialWords);
}

our @nums;

sub minimize($) {
  my $num = shift(@_);
  return ($num eq q{-}) ? q{.3} : $num;
}

sub filterTable($) {
  my $table = shift(@_);
  my $cmuData = $table->{$spell};
  return unless defined($cmuData);
  my @counts = ($nums[3], q{-}, q{-}, q{-}, q{-});
  if (defined($nums[16])) {
    $counts[1] = minimize($nums[5]);
    $counts[2] = minimize($nums[6]);
    $counts[3] = minimize($nums[8]);
    $counts[4] = minimize($nums[16]);
  }
  my $cString = join(q{,}, @counts);
  $cmuData =~ s{-,-,-,-,-}{$cString};
  $table->{$spell} = $cmuData;
}

sub filterZeno() {
  open(my $in, q{<}, q{/home/bkessler/RAT/data/WFG/wfg}) or die;
  while (defined(my $line = <$in>)) {
    chomp($line);
    ($spell, @nums) = split(m{\t}, $line);
    $spell = uc($spell);
    filterTable($finalWords);
    filterTable($initialWords);    
  }
  close($in);
}

sub main() {
  filterCMU();
  filterZeno();
  printWords();
}

main();