=for comment Format of data files: ;preferably ISO 3-letter lang code. Contains words. ;just one sem entry per word. If more than one applies, ; if internal reconstruction implies one over another, ; use it. Else follow universal preference. ;default = 0; else a number estimating certainty that ; it should be discarded because nonarbitrary (e.g. ; onomatopoeia, sound-symbolic. phonemes ;Ideally, root in earliest form available to internal ; reconstruction. (In practice, probably just earliest attested ; form.) Ideally, xscribe phonetically; in practice, ; capturing phonemic differences is probably good enough. =cut use strict; use warnings; use Getopt::Long; use XML::Parser::Expat; use Signif::RetentionRates; use Signif::Language; use File::Temp; our $script_file_name; sub Read_Options() { my $result = Getopt::Long::GetOptions( q{script=s} => \$script_file_name, #string ); die(qq{Getopt::Long::GetOptions failed.}) unless $result; } our $libraries = [qw{. data}]; our $language1; our $language2; our $general_retention_rates_file = q{swadesh-retent.xml}; our $choose_candidate = q{average}; our $suppress_loans_at = 0.5; our $suppress_redundants = 1; our $suppress_motivateds_at = 0.5; our $weight_derivs = 100; our $sample_size = 100; our $compare_part = q{C1}; our $compare_metric = q{place+voice}; our $iters = 100000; our $max_prons_per_sem = 100; our $retention_rates; our $language_data; sub Process_Script() { if (!defined($script_file_name)) { die(qq{Need to specify --script=FILE}); } do($script_file_name); if ($@) {die(qq{Couldn't compile $script_file_name: $@});} if ($!) {die(qq{Couldn't load $script_file_name: $!});} system(qq{cat $script_file_name}); } #sems chosen for the tests: our $sems = []; sub by_descending_num {$b->[0] <=> $a->[0];} sub Harmonize_Langs($) { my $language_data = shift(@_); my $score_array = []; #print(qq{Composite scores:\n}); $language_data->[0]->for_each_sem(sub { my($sem, $rating1) = @_; return if $rating1 <= 0; my $rating2 = $language_data->[1]->sem_rating($sem); return if $rating2 <= 0; my $composite = ($rating1 * $rating2) / 100; #print(STDERR qq{ $sem: $composite\n}); push(@$score_array, [$composite, $sem]); }); $score_array = [sort(by_descending_num @$score_array)]; my $n_scores = scalar(@$score_array); if (!$n_scores) { die(qq{No words at all match the criteria!}); } if ($n_scores < $sample_size) { print(qq{Sample size $sample_size requested, but only $n_scores semantic concepts meet hard criteria.\n}); } else { print(qq{Selecting $sample_size semantic concepts.\n}); } if ($n_scores > $sample_size) { my $cutoff = $score_array->[99]->[0]; $#$score_array = 99; #print(qq{Score array now } . scalar(@$score_array) . qq{\n}); } $sems = []; foreach my $rec (@$score_array) { my $sem = $rec->[1]; #print(qq{ $sem\n}); push(@$sems, $sem); } } sub Prepare_Data() { $retention_rates = Signif::RetentionRates->Load($libraries, $general_retention_rates_file); foreach my $language_file ($language1, $language2) { push(@$language_data, Signif::Language->Load( $libraries, $language_file, $retention_rates, $suppress_loans_at, $suppress_redundants, $suppress_motivateds_at, $weight_derivs, $max_prons_per_sem)); } Harmonize_Langs($language_data); foreach my $language_d (@$language_data) { $language_d->select($sems, $compare_part, $compare_metric); } } sub Run_Tests() { my($handle, $file_name) = File::Temp::tempfile(q{sigdataXXXXXX}, DIR => q{temp}); die unless defined($handle); my $n_sems = scalar(@$sems); print($handle $n_sems, qq{\n}); for (my $i = 0; $i < $n_sems; $i++) { my $semi = $language_data->[0]->get_by_sem($sems->[$i]); for (my $j = 0; $j < $n_sems; $j++) { my $semj = $language_data->[1]->get_by_sem($sems->[$j]); my $pair_score = $semi->score_against($semj, $choose_candidate); printf($handle qq{%.3f\n}, $pair_score); } } close($handle); my $result = qx{./signif-distance --iters=$iters < $file_name}; if (!defined($result)) { die(qq{External signif-distance command failed.}); } print($result, qq{\n}); } sub main() { Read_Options(); Process_Script(); Prepare_Data(); Run_Tests(); } main();