#!/usr/bin/perl $punctSet = '[^A-Za-z=/^]+'; $letterSet = '[A-Za-z=/^]+'; $nTokens = 0; $nPunctTypes = 0; $nWordTypes = 0; $punct = '<>'; $lineNumber = 0; print "Memorizing text.\n"; while ($line = <>) { next if ($line =~ /^\W*$/); chop($line); $line .= '<>'; $lineNumber++; $lineStart[$lineNumber] = $nTokens; if ($line =~ //) { $line =~ s///g; } if ($line =~ //) { $line =~ s///g; } $line =~ s/[\[\]]//g; ($initPunct) = $line =~ /^($punctSet)/; if ($initPunct) { $punct .= $initPunct; $line =~ s/^$punctSet//; #print "PUNCT: <$punct>\n"; } while ($line ne '') { ($word) = $line =~ /^($letterSet)/; if ($word eq '') {die "Null word!";} #print "WORD: <$word>\n"; &Memorize($punct, $word); $line =~ s/^$letterSet//; ($punct) = $line =~ /^($punctSet)/; #print "PUNCT: <$punct>\n"; $line =~ s/^$punctSet//; } } print "$nTokens tokens, $nWordTypes word types, $nPunctTypes punct types, $lineNumber lines.\n"; print "Levelling spellings.\n"; $nStandardSpells = 0; @spellingEquivalents = ("a/e:e", "c:k", "d=:th", "d/:th", "uua:wa", "uue:we", "uui:wi", "uuo:wo", "uuu:wu", "uu:wu"); for ($type = 0; $type < $nWordTypes; $type++) { $baseSpell = $wordTypesToTokens[$type]; $baseSpell =~ tr/[A-Z]/[a-z]/; foreach $equiv (@spellingEquivalents) { ($sp1, $sp2) = split(/:/, $equiv); $baseSpell =~ s/$sp1/$sp2/g; } if (!defined $standardSpellToStandardType{$baseSpell}) { $standardSpellToStandardType{$baseSpell} = $nStandardSpells; $standardSpellType = $nStandardSpells; $standardTypeToSpell[$standardSpellType] = "$baseSpell"; # $standardTypeToWordType[$standardSpellType] = $type; $nStandardSpells++; } else { $standardSpellType = $standardSpellToStandardType{$baseSpell}; } $wordTypeToStandardSpellType[$type] = $standardSpellType; $standardTypeToSpell[$standardSpellType] .= ";$wordTypesToTokens[$type]"; } print "Counting word frequencies.\n"; for ($i = 0; $i < $nStandardSpells; $i++) { $standardTypeFreq[$i] = 0; } for ($i = 0, $line = 1; $i < $nTokens; $i++, ($lineStart[$line + 1] <= $i) ? $line++ : 0) { $wordType = $wordTypeToStandardSpellType[$wordArray[$i]]; $standardTypeFreq[$wordType]++; if (defined $standardTypeIndex[$wordType]) { $standardTypeIndex[$wordType] .= " $line"; } else { $standardTypeIndex[$wordType] = "$line"; } } for ($i = 0; $i < $nStandardSpells; $i++) { $freq = $standardTypeFreq[$i]; push(@freq_type, "$freq $i"); $probability[$i] = $freq / $nTokens; } print "By frequency:\n"; open(OUT, ">freqWords") || die "Can't open freqWords for writing."; @freq_type = sort field1NumericDecreasing @freq_type; for $freq_type (@freq_type) { ($freq, $type) = split(/ /, $freq_type); printf OUT "%4d ", $freq; $spellings = $standardTypeToSpell[$type]; $spellings =~ s/^[^;]+;//; $spellings =~ s/;/, /g; print OUT "$spellings\n"; } close(OUT); print "\nWord-level bigrams:\n"; $secondWordType = $wordTypeToStandardSpellType[$wordArray[0]]; $nEdge = 0; $nNonEdge = 0; $line = 1; for ($first = 0, $second = 1; $second < $nTokens; $first++, $second++) { $firstWordType = $secondWordType; $secondWordType = $wordTypeToStandardSpellType[$wordArray[$second]]; $pair = "$firstWordType $secondWordType"; $edgeP = ($punctTypesToTokens[$punctArray[$second]] =~ /<>/); #print "At slot $second, punct = |$punctArray[$second]|\n"; if (defined $bigram{$pair}) { ($edge, $nonEdge) = split(/ /, $bigram{$pair}); } else { ($edge, $nonEdge) = (0, 0); } if ($edgeP) { $edge++; $nEdge++; } else { $nonEdge++; $nNonEdge++; } $bigram{$pair} = "$edge $nonEdge"; if ($lineStart[$line + 1] == $second) { $loc = $line; $line++; $loc .= "/$line:$first"; } else { $loc = "$line:$first"; } if (defined $bigramIndex{$pair}) { $bigramIndex{$pair} .= " $loc"; } else { $bigramIndex{$pair} = "$loc"; } } print "$nNonEdge pair tokens in-line, $nEdge across line boundaries.\n"; print "Average deviance (Observed - Expected) / Expected:\n"; $edgeDeviationSum = 0; $nonEdgeDeviationSum = 0; $nEdgeTypes = 0; $nNonEdgeTypes = 0; while (($pair, $counts) = each %bigram) { ($edge, $nonEdge) = split(/ /, $counts); ($i, $j) = split(/ /, $pair); $prob = $probability[$i] * $probability[$j]; $edgeDeviation = &edgeDeviation($i, $j, $edge); $nonEdgeDeviation = &nonEdgeDeviation($i, $j, $nonEdge); $edgeDeviationSum += $edgeDeviation; $nonEdgeDeviationSum += $nonEdgeDeviation; if ($edge > 1) { $deviation = sprintf("%.4f", $edgeDeviation); push(@edgeBigramFreq, "$deviation $edge $pair"); $nEdgeTypes++; } if ($nonEdge > 1) { $deviation = sprintf("%.4f", $nonEdgeDeviation); push(@nonEdgeBigramFreq, "$deviation $nonEdge $pair"); $nNonEdgeTypes++; } if ($edge + $nonEdge > 1) { if (defined $continuations{$i}) { $continuations{$i} .= " $j"; } else { $continuations{$i} = "$j"; } } else { undef $bigram{$pair}; undef $bigramIndex{$pair}; } } $nPossiblePairs = ($nStandardSpells ** 2); print " within a line: "; $edgeDeviationSum += -1.0 * ($nPossiblePairs - $nEdgeTypes); $edgeAverageDeviation = $edgeDeviationSum / $nPossiblePairs; printf("%.5f; across lines: ", $edgeAverageDeviation); $nonEdgeDeviationSum += -1.0 * ($nPossiblePairs - $nNonEdgeTypes); $nonEdgeAverageDeviation = $nonEdgeDeviationSum / $nPossiblePairs; printf("%.5f (diff = %.5f)\n", $nonEdgeAverageDeviation, $nonEdgeAverageDeviation - $edgeAverageDeviation); print "\nBigrams within lines:\n"; open(OUT, ">within2") || die "Can't open file within2 for writing."; select(OUT); $^ = "WITHIN_TOP"; &reportBigrams(*nonEdgeBigramFreq, 0); format WITHIN_TOP = Bigrams within lines Deviance # found Bigram Vs. at edge deviance, count . close(OUT); select(STDOUT); print "Bigrams that straddle line breaks:\n"; open(OUT, ">between2") || die "Can't open file between2 for writing."; select(OUT); $^ = "BETWEEN_TOP"; $- = 0; &reportBigrams(*edgeBigramFreq, 1); format BETWEEN_TOP = Bigrams straddling line breaks Deviance # found Bigram Vs. within-line deviance, count . close(OUT); select(STDOUT); print "\n\nSorted alphabetically:\n"; open(OUT, ">alphWords") || die "Can't open alphWords for writing"; @freq_type = sort alphabeticFreq @freq_type; for $freq_type (@freq_type) { ($freq, $type) = split(/ /, $freq_type); $spellings = $standardTypeToSpell[$type]; $spellings =~ s/^[^;]+;//; $spellings =~ s/;/, /g; printf OUT "$spellings\t%4d", $freq; if ($freq < 50) { print OUT ", at $standardTypeIndex[$type]"; } print OUT "\n"; if (defined $continuations{$type}) { @continuations = split(/ /, $continuations{$type}); @continuations = sort byStandardTypeSpell @continuations; for $cont (@continuations) { $spell = &standardTypeToStandardSpell($cont); print OUT " $spell\t"; $pair = "$type $cont"; ($edge, $nonEdge) = split(/ /, $bigram{$pair}); print OUT $edge + $nonEdge, ", at "; @locs = split(/ /, $bigramIndex{$pair}); for $loc (@locs) { $loc =~ s/:.+$//; print OUT "$loc "; } print OUT "\n"; } } } close(OUT); undef %continuations; undef @edgeBigramFreq; undef @nonEdgeBigramFreq; undef %bigram; #Find all n-grams that occur more than once, i.e., 3-grams, etc. #Algorithm: for every (n-1)gram that occurs more than once, see if the # following word occurs more than once, and if so, # enter in index. Then filter that index to discard collocations that # occur only once. Recurse until we reach an n that results in empty set. print "Phrases of three or more words (ignoring line breaks), by deviance:\n"; $nGram = 3; while (&searchNGram($nGram, *bigramIndex)) {$nGram++;} sub searchNGram { local($nGram, *ngramIndex) = @_; #print "\n$nGram-word collocations:\n"; $incr = $nGram - 1; while (($collocation, $locs) = each %ngramIndex) { next unless defined $locs; #print "|$collocation|\n"; #print " |$locs|\n"; @locs = split(/ /, $locs); next if @locs < 2; for $loc (@locs) { ($line, $tokenNumber) = split(/:/, $loc); $line =~ s|/.+$||; $following = $wordTypeToStandardSpellType[$wordArray[$tokenNumber + $incr]]; if ($standardTypeFreq[$following] > 1) { &indexNGram("$collocation $following", $line, $tokenNumber); } } } $nNgrams = 0; while (($collocation, $locs) = each %collocationsIndex) { @locs = split(/ /, $locs); $count = @locs; if ($count < 2) { undef $collocationsIndex{$collocation}; } else { $nNgrams++; @words = split(/ /, $collocation); $prob = 1; for $word (@words) { $prob *= $probability[$word]; } $expectation = $prob * $nTokens; $deviation = sprintf("%.4f", ($count - $expectation) / $expectation); push(@polygrams, "$deviation&$count&$collocation&$locs"); } } %ngramIndex = %collocationsIndex; undef %collocationsIndex; return $nNgrams; } @polygrams = sort byDeviance @polygrams; open(OUT, ">ngrams") || die "Can't open file ngrams for writing."; for $dev_count_polygram_locs (@polygrams) { ($deviance, $count, $wordTypes, $locs) = split(/&/, $dev_count_polygram_locs); @words = split(/ /, $wordTypes); @locs = split(/ /, $locs); print OUT "$deviance\t"; for $word (@words) { ($spell) = split(/;/, $standardTypeToSpell[$word]); print OUT "$spell "; } print OUT "\t$count, at "; for $loc (@locs) { $loc =~ s/:.+$//; print OUT "$loc "; } print OUT "\n"; } close(OUT); sub indexNGram { local($collocation, $line, $token) = @_; if (defined $collocationsIndex{$collocation}) { $collocationsIndex{$collocation} .= " $line:$token"; } else { $collocationsIndex{$collocation} = "$line:$token"; } } sub byStandardTypeSpell { &comparableSpelling($a) cmp &comparableSpelling($b); } sub reportBigrams { local(*array, $edgeP) = @_; @array = sort byDeviance @array; $~ = "BIGRAMS"; for $bigramFreq (@array) { ($deviance, $count, $word1, $word2) = split(/ /, $bigramFreq); ($w1Spell) = split(/;/, $standardTypeToSpell[$word1]); ($w2Spell) = split(/;/, $standardTypeToSpell[$word2]); $spelling = "$w1Spell $w2Spell"; $pair = "$word1 $word2"; ($edge, $nonEdge) = split(/ /, $bigram{$pair}); if ($edgeP) { $expected = &nonEdgeExpectation($i, $j); $found = $nonEdge; } else { $expected = &edgeExpectation($i, $j); $found = $edge; } $otherDev = &deviation($found, $expected); write; } undef @array; } format BIGRAMS = @>>>>>>>>> @>> @<<<<<<<<<<<<<<<<<<<< @>>>>>>>>> @>> $deviance, $count, $spelling, $otherDev, $found . #undef %bigram; sub edgeExpectation { local($word1, $word2) = @_; $prob = $probability[$word1] * $probability[$word2]; $prob * $nEdge; } sub edgeDeviation { local($word1, $word2, $found) = @_; $expectation = &edgeExpectation($word1, $word2); &deviation($found, $expectation); } sub nonEdgeExpectation { local($word1, $word2) = @_; $prob = $probability[$word1] * $probability[$word2]; $prob * $nNonEdge; } sub nonEdgeDeviation { local($word1, $word2, $found) = @_; $expectation = &nonEdgeExpectation($word1, $word2); &deviation($found, $expectation); } sub deviation { local($found, $expectation) = @_; ($found - $expectation) / $expectation; } sub byDeviance { ($devA) = split(/ /, $a); ($devB) = split(/ /, $b); $b <=> $a; } sub alphabeticFreq { ($ignore, $aType) = split(/ /, $a); ($ignore, $bType) = split(/ /, $b); &comparableSpelling($aType) cmp &comparableSpelling($bType); } sub field1NumericDecreasing { ($a1, $aType) = split(/ /, $a); ($b1, $bType) = split(/ /, $b); if ($a1 == $b1) { &comparableSpelling($aType) cmp &comparableSpelling($bType); } else { $b1 <=> $a1; } } sub comparableSpelling { local($type) = @_; $spell = &standardTypeToStandardSpell($type); $strip = $spell; $strip =~ s/\W//g; "$strip$spell"; } sub standardTypeToStandardSpell { local($type) = @_; ($spell) = split(/;/, $standardTypeToSpell[$type]); $spell; } sub Memorize { &MemorizeToken(*punctArray, *punctTypesToTokens, *punctTokensToTypes, $punct, *nPunctTypes); &MemorizeToken(*wordArray, *wordTypesToTokens, *wordTokensToTypes, $word, *nWordTypes); $nTokens++; } sub MemorizeToken { local(*array, *typesToTokens, *tokensToTypes, $token, *nTypes) = @_; if (!defined $tokensToTypes{$token}) { $tokensToTypes{$token} = $nTypes; $tokenType = $nTypes; $typesToTokens[$nTypes] = $token; $nTypes++; } else { $tokenType = $tokensToTypes{$token}; } $array[$nTokens] = $tokenType; }