#!/usr/bin/perl use Getopt::Long; use DBI; use Math::Round; # TO DO: # Add negation # Allow for wildcards and proper matching, and non-specified databases # in MySQL interface. %words = %master = %nickmap = %options = %catdefs = %cat = %people = (); %tf = %once = %mastertask = %names = %catdefs = %categ = @header = (); $thisfile = $spreaddelimjoin = $o = ""; @indexcols = @textcols = @allcols = (); $session = 0; $chunk = 1; $mainindex = ""; %opts = ( "dict=s" => \@dict, "output=s" => \$outstr, "collapse=s" => \$nicks, "corpus" => \$corpus, "help" => \$gethelp, "session=s" => \$dayrx, "spreadsheet" => \$spreadsheet, "spreaddelim=s" => \$spreaddelimit, "spreadskip=n" => \$spreadskip, "spreadhead" => \$spreadhead, "spreadtime=n" => \$spreadtime, "line=s" => \@linerx, "cellsplit=s" => \$splitrx, "cellindex=i" => \$splitnum, "flat" => \$flat, "filename" => \$filename, "exclude=s" => \@exclude, "include=s" => \@include, "mysql-user=s" => \$sqluser, "mysql-pass=s" => \$sqlpass, "mysql-host=s" => \$sqlhost, "mysql-database=s" => \$sqldatabase, "mysql-select=s" => \$sqlselect, "mysql-indexcol=s" => \@sqlindex, "mysql-textcol=s" => \@sqltext, "use-mysql" => \$usesql, "showrow=n" => \$showrow, "showcat=n" => \$showcat, "showcol=n" => \$showcol, "rcfile=s" => \$rcfile, "orig" => \$orig, "percents" => \$percent, "anonymize" => \$anon, "redundant" => \$redun, "temporal" => \$temporal, "time=s" => \@timerx, "timeformat=s" => \$timeform, "no-total" => \$nototal, "no-master" => \$nomaster, "only-master" => \$onlymaster, "version" => \$version ); GetOptions( %opts ); if ($version) { print "TAWC is version 0.82, last edited 5/2/2005. TAWC will now quit. If you want to actually use TAWC, don't call it with -v or --version.\n"; exit 0; } if ($rcfile) { $rcfile = open(RC,"$rcfile"); } else { $rcfile = open(RC,"$ENV{HOME}/.tawcrc"); } if ($rcfile) { while () { print "line $_"; if (/^\#/) { next; } chomp; if ($_ =~ /^\s*(.*?)\s+(.*)/) { ($opt,$val) = ($1,$2); foreach $k (keys %opts) { if ($k eq $opt || substr($k,0,(length($k) - 2)) eq $opt) { if (ref $opts{$k} eq "SCALAR") { ${$opts{$k}} = $val unless defined ${$opts{$k}}; } elsif (ref $opts{$k} eq "ARRAY") { push(@{$opts{$k}},$val) unless defined @{$opts{$k}}; } } } } } } GetOptions(%opts); if ($gethelp) { syntax(); exit 0; } if ($usesql) { if (defined $spreadsheet) { print "You have specified that you're using a MySQL database and that your text files are in spreadsheet format. MySQL database use precludes input file use. Not analyzing input files.\n\n"; undef $spreadsheet; } unless (defined $sqluser) { $sqluser = "root"; } unless (defined $sqlpass) { print "Please enter the password for this mysql account: "; system('stty -echo'); $sqlpass = ; system('stty echo'); chomp $sqlpass; } unless (defined $sqlhost) { $sqlhost = "localhost"; } unless (defined $sqldatabase) { print "You must specify a database.\n\n"; exit 1; } unless (defined @sqltext) { print "You must specify a column to analyze text from.\n\n"; exit 1; } undef @ARGV; } else { unless (defined $ARGV[0]) { print "No input files entered or database specified!\n\n"; exit 1; } if (defined $sqluser || defined $sqlpass || defined $sqlhost || defined $sqldatabase || defined $sqlselect || defined @sqlindex || defined @sqltext) { print "You have defined a mysql option without specifying --use-mysql. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } } if ((defined $spreadtime || defined $spreadskip || defined $spreadhead || defined $spreaddelimit) && !defined $spreadsheet) { print "You have defined spreadsheet options without specifying --spreadsheet. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } if (defined $spreadsheet && !(defined $spreadskip)) { print "You have specified that you are using a spreadsheet, but have not specified how many columns to skip. TAWC will only skip the first column, and analyze the rest.\n\n"; $spreadskip = 1; } if (defined $spreadsheet && !(defined $spreaddelimit)) { print "You have specified that you are using a spreadsheet, but have not specified a delimiter. TAWC will use tabs.\n\n"; $spreaddelimit = "\\t"; } if (defined $spreadsheet && defined $temporal && !(defined $spreadtime)) { print "You have specified that you are using a spreadsheet, and that you would like temporal data counted, but you have not specified a column containing timing data. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } if (defined $spreadsheet) { unless ( $spreaddelimit =~ /^(?:\\t)|[, ]$/ ) { print "You may only use tabs (\\t), commas, or spaces as spreadsheet delimiters. Sorry!\n\n"; exit 1; } $spreaddelimjoin = $spreaddelimit eq "\\t" ? "\t" : $spreaddelimit; $onlymaster = 1; } unless (defined @dict) { print "You have not specified a dictionary file. TAWC will only count total number of words used.\n\n"; } unless (defined $outstr) { print "You have not specified a way to name output files. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } unless (defined $nicks) { print "No nickname equivalency entered, not collapsing any nicknames.\n\n"; } if (!defined $flat && !defined $usesql && (defined $spreadsheet || !defined $dayrx) ) { print "You have not specified a session-denoting regexp, so TAWC assumes you meant to declare --flat.\n\n"; $flat = 1; } if (defined $flat) { print "Treating file as sessionless, providing only one output.\n\n"; undef $dayrx; $session = 1; } unless (!defined $spreadsheet && !defined $usesql && defined @linerx) { print "You have not specified a way to determine users and text from each line in your input. TAWC will analyze the entire file and provide one line of output.\n\n"; } unless (defined $splitrx) { print "You have not specified which part of input lines to read, so TAWC assumes that your --line takes entire lines.\n\n"; } unless (defined $splitrx && defined $splitnum) { print "You have specified that only part of each line is relevant, but you have not specified which part. TAWC assumes the first part is relevant.\n\n"; $splitnum = 0; } if (defined @include) { @inc = (); foreach (@include) { if (/,/) { push(@inc,split(/,/,$_)); } } foreach (@inc) { if (/(\d+)-(\d+)/) { push(@inc2,$1..$2); next; } if (/[^\d]/) { print "You have specified to include categories which were not defined in the canonical way: use a list of numbers with dashes and commas. 2-10 represents categories 2 through 10. 2,4,6,8,10 represents the categories listed. 2-4,9,10 is equivalent to 2,3,4,9,10 which is in turn equivalent to --include=2 --include=3 --include=4 --include=9 --include=10. TAWC is now aborting in case you don't know what you're doing.\n\n"; exit 1; } push(@inc2,($_+0)); } @include = @inc2; print "TAWC will include only the following categories, if they are defined in one of your --dict files:\n".join(", ",@include)."\n"; } if (defined @exclude) { foreach (@exclude) { if (/,/) { push(@exc,split(/,/,$_)); } } foreach (@exc) { if (/(\d+)-(\d+)/) { push(@exc,$1..$2); next; } if (/[^\d]/) { print "You have specified to exclude categories which were not defined in the canonical way: use a list of numbers with dashes and commas. 2-10 represents categories 2 through 10. 2,4,6,8,10 represents the categories listed. 2-4,9,10 is equivalent to 2,3,4,9,10 which is in turn equivalent to --exclude=2 --exclude=3 --exclude=4 --exclude=9 --exclude=10. TAWC is now aborting in case you don't know what you're doing.\n\n"; exit 1; } push(@exc,($_+0)); } @exclude = @exc; print "TAWC will exclude only the following categories, if they are defined in your --dict files:\n".join(", ",@exclude)."\n"; } if (defined @include && defined @exclude) { print "You have defined --include as well as --exclude. This is likely redundant, so you may not know what you're doing, but TAWC will let you do it anyway.\n\n"; } if (defined $temporal) { unless (defined @timerx) { print "You have specified that you want temporal data counted, but you have not specified how to detect time. Use --time=() if you just want to count turns. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } unless (defined $timeform) { print "You have not specified a time format. Assuming that your timerx is in hour/minute/second format."; $timeform = "hms"; } unless (scalar (grep {/^$timeform$/} qw(hms hsm mhs msh shm smh)) == 1) { print "Valid timeforms are: hms hsm mhs msh shm smh. You specified a timeform of $timeform. Aborting TAWC in case you don't know what you're doing.\n\n"; exit 1; } $tf{"h"} = $timeform =~ /^h/ ? 1 : ($timeform =~ /^.h/ ? 2 : 3); $tf{"m"} = $timeform =~ /^m/ ? 1 : ($timeform =~ /^.m/ ? 2 : 3); $tf{"s"} = $timeform =~ /^s/ ? 1 : ($timeform =~ /^.s/ ? 2 : 3); } foreach $dic (@dict) { open(IN,$dic) or die "Could not open dictionary $dic"; TOP: while () { chomp; if (/\%/) { while () { chomp; if (/\%/) { next TOP; } ($num,$def) = split(/\s+/,$_); $numcheck = int $num; $num = "0".$num if $num < 10; $numcheck = "0".$numcheck if $numcheck < 10; if (defined @exclude && memberOf($num,@exclude)) { next; } elsif (defined @include && !(memberOf($num,@include))) { next; } elsif ($num eq $numcheck) { $catdefs{$num} = $def; } else { print "Dictionary error! Definitions not contiguous!\n"; exit 1; } } next; } $line = $_; $word = ""; @thecats = (); if ($line =~ /^\/\//) { next; } elsif ($line =~ /^\/(.*?[^\\])\/\s+(.*)/) { $word = $1; @thecats = split(/\s+/,$2); } else { ($word,@thecats) = split(/\s+/,$line); $word =~ s/([^\w\*])/\\$1/g; $word =~ s/\*/\[a\-zA\-Z0\-9\]\*/g; } if (defined $word) { foreach $c (@thecats) { if ($c eq int($c) || ($c < 10 && $c eq ("0".int($c)) )) { if (memberOf($c,keys %catdefs)) { push(@{$cat{$c}},$word); # $cat{$c} .= "\\b$word\\b"; } elsif (!defined (@include)) { print "Warning: $word defined in dictionary $dic as belonging to category $c, which is not defined.\n"; } } else { print "Word \"$word\" did not make it into cat \"$c\" because \"$c\" is not a proper number.\n"; } } } } close IN; } print "After reading dictionaries, ".scalar (keys %catdefs)." categories were defined.\n"; %revcat = reverse %catdefs; # map { $cat{$_} =~ s/\\b\\b/\\b\|\\b/g } (keys %cat); foreach $re (sort {$a <=> $b} keys %cat) { @{$cat{$re}} = flatten($re,@{$cat{$re}}); } if (defined @dict && defined $redun) { foreach $k (keys %cat) { print "$k\n"; } } if (defined $nicks) { open (IN,$nicks); while () { chomp; ($alias, $canon) = split (/\s+/,$_); $alias =~ tr/[A-Z]/[a-z]/; $canon =~ tr/[A-Z]/[a-z]/; $nickmap{$alias} = $canon; } } if ($usesql) { $mainindex = join("\t",@sqlindex); $db = DBI->connect("DBI:mysql:$sqldatabase:$sqlhost","$sqluser",$sqlpass) or die "Can't connect to $sqldatabase: ".$db->errstr."\n"; $db->{RaiseError} = 1; $str = $sqlselect; print "String is $str.\n"; $a = (split(/\s+from\s+/i,$str))[0]; $a =~ s/^\s*select\s+//i; foreach (split(/\s*,\s*/,$a)) { # s/.*?\.//; push(@allcols,$_); } foreach $c (0..scalar @allcols) { if (grep {/^$allcols[$c]$/} @sqlindex) { push(@indexcols,$c); } if (grep {/^$allcols[$c]$/} @sqltext) { push(@textcols,$c); } } print "Index columns ".join(",",map {$allcols[$_]} @indexcols)."\n"; print "appear at indices ".join(",",@indexcols)."\n"; print "in the select string \"$str\".\n"; print "Text columns ".join(",",map {$allcols[$_]} @textcols)."\n"; print "appear at indices ".join(",",@textcols)."\n"; print "in the select string \"$str\".\n"; $sth = $db -> prepare($str); $sth->execute or die "Unable to execute query: ".$db->errstr."\n"; $index = join("\t",@sqlindex); print "Rows complete: "; $cnt = 0; while (@next = $sth->fetchrow_array) { print "$cnt\n" if $cnt % 100 == 0; $cnt++; $index = join("\t",@next[@indexcols]); if (defined $nicks && defined $nickmap{$index} ) { $index = $nickmap{$index}; } $names{$index} .= join("\t",@next[@textcols]); } $sth -> finish; $db -> disconnect; print "\n"; open(OUT,">$outstr-mysql.out") or die "Could not open file."; $session = 1; } if (defined @ARGV) { $wkday = $month = $day = $time = $year = $who = $rest = $pose = ""; if (defined $flat) { $mainindex = "ID"; } else { $mainindex = "s1\ts2\ts3\tID"; } } foreach $infile (@ARGV) { print "Infile $infile\n"; open(IN,$infile) or die "Could not open $infile"; @wholefile = (); while () { s/\xd/\n/g; push(@wholefile,split(/\n/s,$_)); } close IN; $thisfile = $infile; if ($spreadsheet && $spreadhead) { $eh = shift @wholefile; chomp $eh; @header = split(/$spreaddelimit/,$eh); } elsif ($spreadsheet) { @header = ("Subject","Text"); } $latest = 0; foreach (@wholefile) { chomp; $line = $_; undef $now; if (!defined $spreadsheet && defined $dayrx && $line =~ /$dayrx/) { $ny = $1; $nm = $2; $nd = $3; print "Found session match \n $&\n..."; if ($year eq $ny && $month eq $nm && $day eq $nd) { print "which is the current session.\n"; next; } elsif ($session == 1) { print "counting words...\n"; unless (defined $onlymaster) { display(); close OUT; } $chunk++ unless defined $chunks{$chunk}; unless (defined $nomaster) { $o = $outstr."-master-out.txt"; open(OUT,">>$o") or die "sigh"; display(); close OUT; $once{$o} = 1; } $mastertask{"$year\t$month\t$day"} = 1; } else { print " starting wordcount process.\n"; $session = 1; } $year = $ny; $month = $nm; $day = $nd; %names = (); %people = (); unless (defined $onlymaster) { $o = "$outstr-$year-$month-$day-out.txt"; print "Opening file $o\n\n"; open(OUT,">>$o") or die "sigh2"; $once{$o} = 1; } next; } elsif (!defined $spreadsheet && defined $dayrx && $session != 1) { next; } if ($spreadsheet) { @f = split(/$spreaddelimit/,$line); $who = join($spreaddelimjoin,@f[0..($spreadskip - 1)]); $rest = join($spreaddelimjoin,@f[$spreadskip..((scalar @header) - 1)]); # print "who: $who\nrest:$rest\n"; if (defined $temporal) { T: foreach $t (@timerx) { if ($f[$spreadtime] =~ /$t/) { $now = (60 * ${$tf{"h"}}) + ${$tf{"m"}} + (${$tf{"s"}} / 60); last T; } } } } else { $content = defined $splitrx ? (split(/$splitrx/,$line))[$splitnum] : $line; foreach $l (@linerx) { if ($content =~ /$l/) { $who = $1; $rest = $2; last; } else { undef $who; } } unless (defined @linerx) { $who = $infile; $rest = $content; } if (defined $temporal) { T: foreach $t (@timerx) { if ($content =~ /$t/) { $now = (60 * ${$tf{"h"}}) + ${$tf{"m"}} + (${$tf{"s"}} / 60); last T; } } } } if ($who) { $who =~ tr/[A-Z]/[a-z]/ unless $spreadsheet; if ($nicks) { $who = $nickmap{$who}; } $d = $spreadsheet ? $spreaddelimitjoin : " "; $names{$who} .= "$d$rest"; if ($temporal && defined $now) { while ($now < $latest) { $now += 1440; } push(@{$people{$chunk}{$who}},$now); $chunks{$chunk} = 1; $latest = $now; undef $now; } } } if (defined $filename) { unless ($mastertask{"$year\t$month\t$day"} == 1 || defined $flat) { unless (defined $onlymaster) { display(); close OUT; } unless (defined $nomaster) { $o = $outstr."-master-out.txt"; open(OUT,">>$o"); display(); close OUT; $once{$o} = 1; } %mastertask = (); %chunks = (); $session = 2; undef $year; undef $month; undef $day; } } close IN; } if (!defined $filename) { unless ($mastertask{"$year\t$month\t$day"} == 1 || defined $flat) { unless (defined $onlymaster) { display(); close OUT; } unless (defined $nomaster) { $o = $outstr."-master-out.txt"; open(OUT,">>$o"); display(); close OUT; $once{$o} = 1; } %names = () unless $spreadsheet || $usesql; %mastertask = (); %people = (); %chunks = (); $session = 2; } } if (!defined $onlymaster && $session == 1 && !defined $flat) { print "There are ".(scalar keys %names)." names.\n"; display(); close OUT; } elsif ($session == 2) { exit 0 unless $corpus; } elsif (!defined $flat) { print "No session regexp was matched, so there are no words to count. If your file is single-session, use --flat to specify.\n"; exit 0; } else { close OUT and die "OUT should not have been open...\n"; $o = $outstr."-master-out.txt"; open(OUT,">>$o"); display(); close OUT; $once{$o} = 1; } close OUT; unless (defined $usesql || defined $filename || defined $nomaster || defined $onlymaster) { print "Compiling and printing master file...\n"; $domaster=1; $chunk = 0; $o = $outstr."-master-out.txt"; open(OUT,">>$o"); display(); close OUT; $once{$o} = 1; } if (defined $corpus) { print "Removing nonwords from master for corpus...\n"; %cor = (); foreach $k (keys %names) { $l = $names{$k}; $l =~ tr/A-Z\xd4\xd5/a-z\'\'/; $l =~ s/\#//g; $l =~ s/(\:\-\))|(:\))/\#smile/g; $l =~ s/(\:\-\()|(:\()/\#frown/g; $l =~ s/,//g; $l =~ s/[^a-zA-Z0-9\#\'\t]/ /gs; $l =~ s/ +/ /g; foreach $w (split (/\s+/,$l)) { $cor{lc($w)}++; } } $o = $outstr."-corpus.txt"; open(OUT,">$o") or die "no corpus output?"; print "Printing corpus to disk...\n"; foreach $k (sort { $cor{$b} <=> $cor{$a} } keys %cor) { print OUT "$k\t$cor{$k}\n"; } close OUT; } exit 0; sub display { print "Removing random characters...\n"; foreach (keys %names) { $names{$_} =~ tr/A-Z\xd4\xd5/a-z\'\'/; $names{$_} =~ s/\#/ /g; $names{$_} =~ s/(\:\-\))|(:\))/\#smile/g; $names{$_} =~ s/(\:\-\()|(:\()/\#frown/g; if ($spreadsheet) { @new = (); foreach (split(/$spreaddelimit/,$names{$_})) { s/,//g; s/[^a-zA-Z0-9\#\'\xd5\t]/ /gs; s/ +/ /g; push(@new,$_); } $names{$_} = join($spreaddelimjoin,@new); } else { $names{$_} =~ s/,//g; $names{$_} =~ s/[^a-zA-Z0-9\#\'\xd5\t]/ /gs; $names{$_} =~ s/ +/ /g; } } if (defined $anon) { print "Anonymizing data...\n"; $next = 1; %newname = %newpeople = (); foreach $name (sort { int(rand(3)) - 1 } keys %names) { foreach (1..(length (keys %names) - length($next))) { $next = "0$next"; } $n = "participant$next"; $newname{$n} = $names{$name}; foreach $c (keys %people) { if (defined $people{$c}{$name}) { $newpeople{$c}{$n} = $people{$c}{$name}; } } $next++; } %names = %newnames; %people = %newpeople; } if (defined $spreadsheet || defined $usesql) { displaySpread(); return; } print "Counting words for ".scalar (keys %names)." rows in ". scalar (keys %catdefs)." categories...\n"; unless ($once{$o} == 1) { print OUT "filename\t" if (defined $filename); print OUT "$mainindex"; foreach (sort keys %catdefs) { print OUT "\t".$catdefs{$_}; } if (defined $percent) { foreach (sort keys %catdefs) { print OUT "\t".$catdefs{$_}."-pct"; } } print OUT "\ttotal" unless defined $nototal; if (defined $temporal) { print OUT "\tmin\tmax\tt_med\tl_med\tavg\tmode\tvar\tturns"; } print OUT "\n"; } $numnames = scalar (keys %names); $numcats = scalar (keys %catdefs); # foreach $n (sort keys %names) { print "n $n $names{$n}\n"; } if ($domaster == 1) { %categ = %master; } else { $ev = " \$rowcnt = 0; foreach \$n (sort keys \%names) { \$rowcnt++; print \"row \$rowcnt out of $numnames, \$n\\n\" if \$showrow && \$rowcnt \% \$showrow == 0; \$catcnt = 0; "; foreach $c (keys %catdefs) { $ev .= " \$catcnt++; print \"row \$rowcnt, cat \$catcnt out of $numcats\\n\" if (\$showcat && (\$catcnt \% \$showcat == 0)); "; if (!$cat{$c}) { $ev .= " \$categ{\"$c\"}{\$n} = 0;\n"; next; } $ev .=" \$thiscnt = 0; \$destroy = \$names{\$n};\n"; foreach $re (@{$cat{$c}}) { $ev .= " \$thiscnt += \$destroy =~ s/(\\b)$re\\b/\$1/g;\n"; } # $ev .= "print \"\$destroy\\n\";\n"; # \$thiscnt = scalar (split(/$cat{$c}/i,\$names{\$n})) - 1; $ev .= " \$categ{\"$c\"}{\$n} += \$thiscnt <= 0 ? 0 : \$thiscnt; # print \"\".\$categ{\"$c\"}{\$n}.\"\\n\"; unless (defined (\$names{\$n}) && length(\$names{\$n}) > 0) { undef \$categ{\"$c\"}{\$n}; }\n"; } $ev .= "}\n"; # open(LOG,">eval1-log"); # print LOG "$ev"; # close LOG; eval $ev; die "$@" if $@; } print "Writing output to file...\n"; # foreach $c (keys %categ) { # print "For $c we have:\n"; # foreach $n (keys %{$categ{$c}}) { # print " $n\n"; # } # } foreach $n ( sort smartsort keys %names ) { print OUT "$thisfile\t" if (defined $filename); print OUT "$month\t$day\t$year\t" if (defined @ARGV) && !defined $flat; print OUT "$n\t"; foreach $c (sort keys %catdefs) { print OUT $categ{$c}{$n}."\t"; $master{$c}{$n} += $categ{$c}{$n}; } if (defined $names{$n} && length($names{$n}) > 0) { $total = $names{$n}; $total =~ s/\s+/ /g; $total =~ s/^ +//g; $total =~ s/ +$//g; $total = scalar (split (/ /,$total)); if (defined $percent) { foreach $c (sort keys %catdefs) { $pct = $total == 0 ? 0 : nearest(.00001,$categ{$c}{$n} / $total); print OUT "$pct\t"; } } print OUT "$total" unless defined $nototal; if (defined $temporal) { print OUT "\t" unless defined $nototal; print OUT "".temporal($chunk,$n); } print OUT "\n"; } else { if (defined $percent) { print OUT "".join("",map { "\t" } (keys %catdefs)); } print OUT "\n"; } } %categ = (); return; } sub displaySpread { $num = defined $spreadsheet ? ((scalar @header) - $spreadskip) : (scalar @textcols); print "Counting words for ".scalar (keys %names)." rows in "; print scalar (keys %catdefs)." categories in $num columns...\n"; print OUT "filename\t" if (defined $filename); print OUT join("\t", (defined $spreadsheet ? @header[0..($spreadskip - 1)] : @allcols[@indexcols]))."\t"; @l = defined $spreadsheet ? @header[$spreadskip..((scalar @header) - 1)] : @allcols[@textcols]; foreach $col (@l) { foreach $c (sort keys %catdefs) { print OUT "$col"."_".$catdefs{$c}."\t"; } if (defined $percent) { foreach $c (sort keys %catdefs) { print OUT "$col"."_".$catdefs{$c}."_pct\t"; } } print OUT "$col"."_total\t" unless defined $nototal; print OUT "$col"."_orig\t" if $orig; } if (defined $temporal) { print OUT "\tmin\tmax\tt_med\tl_med\tavg\tmode\tvar\tturns"; } print OUT "\n"; $numnames = scalar(keys %names); $numcats = scalar (keys %catdefs); $ev = "\$rowcnt = \$catcnt = 0; foreach \$n (keys \%names) { \$rowcnt++; print \"row \$rowcnt out of $numnames for \$n\\n\" if \$showrow && (\$rowcnt \% \$showrow == 0); \$colcnt = 0;\n"; if (defined \$usesql) { $ev .= " \@columns = split(/\\t/,\$names{\$n});\n"; } elsif (defined \$spreadhead) { $ev .= " \@columns = split(/\$spreaddelimit/,\$names{\$n});\n"; } else { $ev .= " \@columns = (\$names{\$n});\n"; } $ev .= " CELL: foreach \$coln (0..((scalar \@columns)-1)) { \$colcnt++; if (\$showcol && \$colcnt \% \$showcol == 0) { print \"column \$colcnt out of $num for \$n\\n\"; } \$catcnt = 0;\n"; CAT: foreach $c (keys %catdefs) { if ($cat{$c} eq "" | !defined $cat{$c}) { $ev .= " \$categ{\$n}{\"$c\"}{\$coln} = 0;\n"; next CAT; } $ev .= " \$catcnt++; print \"row \$rowcnt, cat \$catcnt out of $numcats\\n\" if (\$showcat && (\$catcnt \% \$showcat == 0)); unless (defined \$columns[\$coln] && length(\$columns[\$coln]) > 0) { undef \$categ{\$n}{\"$c\"}{\$coln}; }\n \$thiscnt = 0; \$destroy = \$columns[\$coln];\n"; foreach $re (@{$cat{$c}}) { $ev .= " \$thiscnt += \$destroy =~ s/(\\b)$re\\b/\$1/g;\n"; } $ev .= " \$categ{\$n}{\"$c\"}{\$coln} += \$thiscnt <= 0 ? 0 : \$thiscnt;\n"; } $ev .= " } }\n"; eval $ev; die "$@" if $@; print "Writing output to file...\n"; foreach $n ( sort smartsort keys %names ) { print OUT "$thisfile\t" if (defined $filename); print OUT "$n"; if (defined $usesql) { @columns = split(/\t/,$names{$n}); } elsif (defined $spreadhead) { @columns = split(/$spreaddelimit/,$names{$n}); } else { @columns = ($names{$n}); } foreach $coln (0..((scalar @columns) - 1)) { foreach $c (sort keys %catdefs) { print OUT "\t".$categ{$n}{$c}{$coln}; } $total = $columns[$coln]; $total =~ s/\s+/ /g; $total =~ s/^ +//; $total =~ s/ +$//; $total = scalar (split (/ /,$total)); if (defined $percent) { foreach $c (sort keys %catdefs) { $pct = $total == 0 ? 0 : nearest(.00001,$categ{$n}{$c}{$coln} / $total); $pct += .00000001; $pct = substr($pct,0,5); print OUT "\t$pct"; } } print OUT "\t$total" unless defined $nototal; print OUT "\t$columns[$coln]" if $orig; } if (defined $temporal) { print OUT "\t".temporal($chunk,$n); } print OUT "\n"; } %categ = (); return; } sub flatten { my ($re,@r) = @_; my @new = (); unless ( scalar (grep {/\\b\$/} @r) > 0 ) { # print "No meta-vars in category $re.\n"; return @r; } foreach $item (@r) { if ($item =~ /\\b\$.*/) { $item = substr($r,2,length($item)-4); push(@new,"\\b".eval($item)."\\b"); } else { push(@new,$item); } } return (flatten(@new)); } sub thing { my ($item, $cond, $else) = @_; if (eval "\"$item\" $cond") { return $item; } else { return $else; } } sub memberOf { my ($item,@list) = @_; $item += 0; foreach (@list) { if ($item eq $_ || $item == $_) { return 1; } } return 0; } sub temporal { my ($chunk,$name) = @_; # print "Chunk is $chunk, name is $name, scalar is ".scalar(@{$people{$chunk}{$name}})."\n"; my $turns = 0; my @diffs = (); unless ($chunk == 0) { @times = @{$people{$chunk}{$name}}; $turns = scalar @times; @diffs = map { $times[$_] - $times[$_ - 1] } (1..(scalar(@times)-1)); } else { foreach $ck (keys %people) { if (defined $people{$ck}{$name}) { @t = @{$people{$chunk}{$name}}; push(@diffs, map { $t[$_] - $t[$_ - 1] } (1..(scalar(@t)-1))); $turns += scalar @t; } } } undef $tmed; $tmed = ((scalar @diffs) % 2) == 1 ? ($diffs[int((scalar @diffs)/2)]) : (($diffs[$foo = (scalar @diffs)/2] + $diffs[$foo - 1]) / 2) if scalar @diffs > 0; @diffs = sort {$a <=> $b} @diffs; undef $lmed; $lmed = ((scalar @diffs) % 2) == 1 ? ($diffs[int((scalar @diffs)/2)]) : (($diffs[$foo = (scalar @diffs)/2] + $diffs[$foo - 1]) / 2) if scalar @diffs > 0; $min = $diffs[0]; $max = $diffs[(scalar @diffs) - 1]; undef $avg; $avg = (eval join(" + ",@diffs)) / (scalar @diffs) unless (scalar @diffs) == 0; my %mode = (); map { $mode{$_}++ } @diffs; $md = (sort { $mode{$a} <=> $mode{$b} } (keys %mode))[scalar (keys %mode) - 1]; undef $var; $var = ( (eval join(" + ",map { ($_ - $avg) ** 2 } @diffs)) / ((scalar @diffs) - 1) ) unless (scalar @diffs) < 2; return join("\t",($min,$max,$tmed,$lmed,$avg,$md,$var,$turns)); } sub smartsort { if ($a =~ /(.*?)(\d+)$/) { $afirst = $1; $anum = $2; } else { return $a cmp $b; } if ($b =~ /(.*?)(\d+)$/) { $bfirst = $1; $bnum = $2; } else { return $a cmp $b; } if ($afirst eq $bfirst) { return $anum <=> $bnum; } else { return $a cmp $b; } }