agonistics / source code

source code

agonistics: a language game

w a r r e n s a c k <wsack@media.mit.edu>

In the spirit of two exhibitions curated by Christiane Paul, CODeDOC
(September 2002) and CODeDOC II (September 2003), the source code
for the project is presented here to facilitate "a reverse look at 'software art'
projects by focusing on and comparing the 'back end' of the code that drives
the artwork's 'front end'-- the result of the code"(Paul, 2002). Unfortunately, unlike
the code in the CODeDOC shows, my code is not pretty. This is unfinished
work, likely to be in medias res for a couple of years. Consequently, it is provided
with a Cultural Commons Attribution-NonCommercial-NoDerivs 2.0 License,
rather than an Free or Open Source license (like the GPL) because I hope
to improve upon, debug and enhance Agonistics before others elaborate on
or modify it. Email me suggestions please!

1: #!/usr/bin/perl
2:
3: #########################################################################
4: ##
5: ## RUN_AGONISTICS
6: ##
7: ## Warren Sack (wsack@media.mit.edu)
8: ##
9: ## February 2005
10: ##
11: ## Usage: perl run_agonistics.pl <configuration file>
12: ##
13: ## Preconditions:
13: ## (1) The file configuration file needs to exist and contain
14: ## a correct set of values for all of the necessary
15: ## parameters.
16: ##
17: ## (2) The sub-directory "Resources" needs to be exist within
18: ## the directory that contains this script. The Resources
19: ## sub-directory contains a number of images and CGI scripts.
20: ##
21: ## (3) If this script is going be run by polling new messages
22: ## periodically from a server, a means to connect and
23: ## download those messages needs to be arranged before the
24: ## script is run. See the example configuration files
25: ## *.conf for details concerning, for example, the use of
26: ## a Yahoo email account as an means to poll and archive
27: ## new messages from a mailing list.
28: ##
29: #########################################################################
30: ##
31: ## Copyright (c) 2005 by Warren Sack
32: ##
33: ## This work is copyrighted with a Creative Commons
34: ## (http://creativecommons.org) Attribution-NonCommercial-NoDerivs 2.0
35: ## License.
36: ##
37: ## Here is a short summary of the license:
38: ##
39: ## You are free to copy, distribute, display, and perform the work
40: ## under the following conditions:
41: ##
42: ## Attribution: You must give the original author (Warren Sack) credit.
43: ##
44: ## Noncommercial: You may not use this work for commercial purposes.
45: ##
46: ## No Derivative Works: You may not alter, transform, or build upon this work.
47: ##
48: ## * For any reuse or distribution, you must make clear to others the
49: ## license terms of this work.
50: ## * Any of these conditions can be waived if you get permission from
51: ## the copyright holder (Warren Sack).
52: ## * Your fair use and other rights are in no way affected by the above.
53: ## * The details and full text of the license can be found at this URL:
54: ## http://creativecommons.org/licenses/by-nc-nd/2.0/legalcode
55: ##
56: #########################################################################
57:
58: if ( $#ARGV < 0 ) {
59:     die "Usage: run_agonistics.pl <configuration file>\n".
60:     "This script requires one argument.\n";
61: }
62: elsif ( not(-e $ARGV[0]) ) {
63:     die "Usage: run_agonistics.pl <configuration file>\n".
64:     "This script requires a configuration file. The given configuration file cannot be found.\n";
65: }
66:
67:
68: use utf8;
69: use Unicode::Normalize;
70: use HTML::Entities();
71: use File::Temp qw(tempfile);
72: use File::Path;
73: use Date::Manip qw(ParseDate ParseDateString UnixDate);
74: use Time::Local;
75: use File::Copy;
76: use MIME::WordDecoder;
77: use Lingua::Stem;
78: use Fcntl qw(:flock);
79: use Crypt::SSLeay;
80: use Mail::Client::Yahoo;
81: use Digest::MD5;
82: use Net::NNTP;
83: use Net::SSLeay;
84: use IO::Socket::SSL;
85: use Net::IMAP::Simple::SSL;
86:
87: $batch_of_messages = 0;
88:
89: ## READ_CONFIG_FILE
90: ##
91: sub read_config_file
92: {
93:     my($key,$value);
94:
95:     open(CONFIG,$ARGV[0]) || die "Can't find configuration file: $ARGV[0]\n";
96:     while(<CONFIG>) {
97:     chomp;
98:     if ( /^\#/ ) { next; }
99:     if ( /^\s*$/ ) { next; }
100:     ($key,$value) = $_ =~ /^(\S+)\s+(\S+)/;
101:     $agonistics_config{$key} = $value;
102:     }
103:     close(CONFIG);
104: }
105:
106:
107: ## INITIALIZE_GLOBAL_VARIABLES
108: ##
109: ## Input: None
110: ##
111: ## Effects: Global variables associated with this package are initialized.
112: ##
113: ## Output: None
114: ##
115: sub initialize_global_variables
116: {
117:     ## initializations done only before the first batch of messages is processed
118:     if ( $batch_of_messages == 0 ) {
119: ## Open and read the config file into a hash.
120:     %agonistics_config = ();
121:     &read_config_file();
122: ## Before setting slash, the machine's OS is checked.
123:     if ( $ENV{'OSTYPE'} =~ /^win/i ) { $slash = '\\'; }
124:     else { $slash = '/'; }
125:     $news_group = $agonistics_config{'Newsgroup'};
126:     $archive_file_name = $agonistics_config{'FileNameOfArchive'};
127:     $language_locale = $agonistics_config{'LanguageTag'};
128:     if ( defined($agonistics_config{'MaxFrames'}) ) {
129:         $max_frames = $agonistics_config{'MaxFrames'};
130:     }
131:     else { $max_frames = 1000; }
132:     if ( defined($agonistics_config{'PauseBetweenFrames'}) ) {
133:         $pause_between_frames = $agonistics_config{'PauseBetweenFrames'};
134:     }
135:     else { $pause_between_frames = 7; }
136: ## Load the correct end-of-sentence tagger. English and French
137: ## texts both use the English tagger. German texts use a
138: ## different tagger.
139:     if ($language_locale eq 'DE') {
140:         require Lingua::DE::Sentence;
141:         Lingua::DE::Sentence->import( qw(get_sentences) );
142:     }
143:     else {
144:         require Lingua::EN::Sentence;
145:         Lingua::EN::Sentence->import( qw(get_sentences) );
146:     }
147:     $archive_name = $agonistics_config{'DirectoryForOutput'};
148:     $recency = $agonistics_config{'Recency'};
149:     $documents_url = $agonistics_config{'DocumentsURL'};
150:     $cgi_url = $agonistics_config{'CGIURL'};
151:     $web_server_directory = $agonistics_config{'WebServerDirectory'};
152:     $web_server_cgi_directory = $agonistics_config{'WebServerCGIDirectory'};
153:     $is_interactive_p = $agonistics_config{'InteractiveMode'};
154:     if ( $is_interactive_p =~ /n/i ) { $is_interactive_p = 'NO'; }
155:     else { $is_interactive_p = 'YES'; }
156: ## Record the address to be used for posting messages to the list analyzed.
157:     $mailing_list_address = $agonistics_config{'MailingListAddress'};
158: ## Note information about the Yahoo mail account, if it is to be used.
159:     $yahoo_uid = $agonistics_config{'YahooUID'};
160:     $yahoo_password = $agonistics_config{'YahooPassword'};
161:     $yahoo_outbox = $agonistics_config{'YahooOutbox'};
162: ## Note information about the IMAP mail server and account, if it is to be used.
163:     $imap_server = $agonistics_config{'IMAPServer'};
164:     $imap_uid = $agonistics_config{'IMAPUID'};
165:     $imap_password = $agonistics_config{'IMAPPassword'};
166:     $imap_outbox = $agonistics_config{'IMAPOutbox'};
167: ## Note information about the NNTP account and server, if it is to be used.
168:     $nntp_server = $agonistics_config{'NNTPServer'};
169:     $nntp_uid = $agonistics_config{'NNTPUID'};
170:     $nntp_password = $agonistics_config{'NNTPPassword'};
171:     if ( ( $nntp_server and ( $yahoo_uid or $imap_server ) )
172:          or ( $yahoo_uid and ( $nntp_server or $imap_server ) )
173:          or ( $imap_server and ( $nntp_server or $yahoo_uid ) ) ) {
174:         die "Only one of the following may be defined: (a) NNTP server; (b) Yahoo UID; (c) IMAP server".
175:         "\nTwo of the three need to be commented out in the configuration file $ARGV[0]\n";
176:     }
177: ## How many seconds should the script wait between tries to download
178: ## messages from the server?
179:     $pause = $agonistics_config{'PauseBetweenFetches'};
180:     $output_directory = $web_server_directory.$slash.'Agonistics'.$slash.$archive_name;
181:     $cgi_directory = $web_server_cgi_directory.$slash.'Agonistics';
182: ## create the directory to house the CGI scripts
183:     mkdir($cgi_directory);
184: ## copy the CGI scripts into the CGI directory
185:     my $cgi_script_file;
186:     my $send_script_file_found_p = 0;
187:     opendir CGISCRIPTS, 'Resources'.$slash.'CGIScripts' or die "Cannot open CGIScripts directory: $!";
188:     foreach $cgi_script_file (readdir CGISCRIPTS) {
189:         if ( $cgi_script_file eq 'send_message.pl' ) {
190:         &rewrite_send_script_file('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file);
191:         $send_script_file_found_p = 1;
192:         }
193:         else { copy('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file); }
194:         chmod(0777,$cgi_directory.$slash.$cgi_script_file);
195:     }
196:     closedir CGISCRIPTS;
197:     if ( $send_script_file_found_p == 0 ) { die "Can't find send_message.pl CGI script\n"; }
198: ## Alternatively load the German or English+French end-of-sentence
199: ## tagger depending upon the language specified on the command line.
200:     if ( $language_locale =~ /^DE/ ) {
201:         require Lingua::DE::Sentence;
202:         Lingua::DE::Sentence->import( qw(get_sentences) );
203:     }
204:     else {
205:         require Lingua::EN::Sentence;
206:         Lingua::EN::Sentence->import( qw(get_sentences) );
207:     }
208:     $stemmer = Lingua::Stem->new({-locale => $language_locale});
209:     $stemmer->stem_caching({ -level => 2 });
210:     $log_file = $agonistics_config{'FileNameOfLog'};
211:     open(LOG,'>'.$log_file);
212:     close(LOG);
213:     $radius_of_circle = 100000;
214:     my $random_number = int(rand(10000));
215:     $raw_messages_file = 'raw_messages_file_'.$random_number.'.txt';
216:     $end_of_message_marker = '__end_of_message_marker__';
217: ## English stop words
218:     @english_stop_words = ("a", "about", "above", "according", "across", "actually", "adj", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren", "around", "as", "at", "b", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "behind", "being", "below", "beside", "besides", "between", "beyond", "billion", "both", "but", "by", "c", "can", "can", "cannot", "caption", "co", "could", "couldn", "d", "did", "didn", "do", "does", "doesn", "don", "down", "during", "e", "each", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "f", "few", "fifty", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "further", "g", "h", "had", "has", "hasn", "have", "haven", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "instead", "into", "is", "isn", "it", "its", "itself", "j", "k", "l", "last", "later", "latter", "latterly", "least", "less", "let", "like", "likely", "ll", "ltd", "m", "made", "make", "makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million", "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my", "myself", "n", "namely", "neither", "never", "nevertheless", "next", "nine", "ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "overall", "own", "p", "per", "perhaps", "q", "r", "rather", "re", "recent", "recently", "s", "same", "seem", "seemed", "seeming", "seems", "seven", "seventy", "several", "she", "should", "shouldn", "since", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "stop", "such", "t", "taking", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thirty", "this", "those", "though", "thousand", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "trillion", "twenty", "two", "u", "under", "unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using", "v", "ve", "very", "via", "w", "was", "wasn", "we", "well", "were", "weren", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "with", "within", "without", "would", "wouldn", "wrote", "x", "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "http", "www", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "com", "edu", "org", "just", "net", "ftp", "nntp", "http", "html");
219:     @english_present_tense_of_to_be = ( "am", "are", "is" );
220: ## French stop words
221: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks).
222: ## This appears to be necessary since the diacritical marks are not always employed by the participants
223: ## and they are oftentimes lost between the client and server.
224:     @french_stop_words = ( "a", "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "elles", "en", "et", "eux", "il", "ils", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "m\x{EA}me", "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "plus", "pour", "qu", "que", "qui", "bien", "bon", "bonne", "bonnes", "faire", "fais", "fait", "faisons", "font", "sa", "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "c", "d", "j", "l", "\x{E0}", "m", "n", "s", "t", "y", "\x{E9}t\x{E9}", "ete", "\x{E9}t\x{E9}e", "etee", "\x{E9}t\x{E9}es", "etees", "\x{E9}t\x{E9}s", "etes", "\x{E9}tant", "etant", "\x{E9}tante", "etante", "\x{E9}tants", "etants", "\x{E9}tantes", "etantes", "suis", "es", "est", "sommes", "\x{EA}tes", "etes", "sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient", "\x{E9}tais", "etais", "\x{E9}tait", "etait", "\x{E9}tions", "etions", "\x{E9}tiez", "etiez", "\x{E9}taient", "etaient", "fus", "fut", "f\x{FB}mes", "fumes", "f\x{FB}tes", "futes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "f\x{FB}t", "fut", "fussions", "fussiez", "fussent", "ayant", "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient", "eut", "e\x{FB}mes", "eumes", "e\x{FB}tes", "eutes", "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "e\x{FB}t", "eut", "eussions", "eussiez", "eussent", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "http", "message", "jamais", "fr", "sans", "non", "oui", "dire", "dis", "dit", "disez", "disons", "disent", "faut", "toujours", "que", "quel", "quelle", "quelles", "quelqu", "quelque", "quelques", "quels", "qui", "quiconque", "quoi", "quoiqu", "quoique", "\x{E7}a", "ca", "car", "ce", "ceci", "cela", "celle", "l\x{E0}", "la", "celles", "ci", "celles", "celui", "ces", "cet", "cette", "ceux", "comme", "vrai", "\x{EA}tre", "etre", "\x{E9}crit", "ecrit", "rien", "tout", "tous", "toute", "toutes", "si", "fr", "com", "edu", "org", "autres", "autre", "sauf", "vais", "vas", "va", "venons", "venez", "vont", "aller", "aimer", "aiment", "aimons", "aime", "aimes", "mal", "news", "nntp", "html", "ftp", "net", "part", "puis", "voir", "quand", "tant", "autant", "leur", "leurs", "tien", "tienne", "tiens", "tiennes", "mien", "mienne", "miens", "miennes", "soi", "aussi", "ailleurs", "moins", "alors", "passe", "pass\x{E9}", "avoir" );
225:     @french_present_tense_of_to_be = ( "suis", "es", "est", "\x{EA}tes", "etes", "sommes", "sont" );
226: ## German stop words
227: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks).
228: ## This appears to be necessary since the diacritical marks are not always employed by the participants
229: ## and they are oftentimes lost between the client and server.
230:     @german_stop_words = ( "aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann", "der", "den", "des", "dem", "die", "das", "da\x{DF}", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dir", "du", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "er", "ihn", "ihm", "es", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures", "f\x{FC}r", "fur", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier", "hin", "hinter", "ich", "mich", "mir", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "im", "in", "indem", "ins", "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "k\x{F6}nnen", "konnen", "k\x{F6}nnte", "konntemachen", "man", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "sie", "ihnen", "sind", "so", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "\x{FC}ber", "uber", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "vom", "von", "vor", "w\x{E4}hrend", "wahrend", "war", "waren", "warst", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wollen", "wollte", "w\x{FC}rde", "wurde", "w\x{FC}rden", "wurden", "zu", "zum", "zur", "zwar", "zwischen", "dk", "com", "edu", "org", "news", "http", "ftp", "html", "com", "org", "net", "nntp");
231:     @german_present_tense_of_to_be = ( "bin", "bist", "ist", "sind", "seid" );
232:     @stop_words = ();
233:     if ( $language_locale =~ /^EN/ ) { @stop_words = @english_stop_words; }
234: ## Many French and German discussions include English posts, so English stop words are used for them too.
235:     elsif ( $language_locale =~ /^FR/ ) { push(@stop_words,@french_stop_words,@english_stop_words); }
236:     elsif ( $language_locale =~ /^DE/ ) { push(@stop_words,@german_stop_words,@english_stop_words); }
237:     %isa_stop_word = ();
238:     my $stop_word;
239:     foreach $stop_word (@stop_words) { $isa_stop_word{NFD($stop_word)} = 1; }
240: ## Connect to NNTP server if specified. Note that this connection is made once and only once.
241: ## Attempts to connect and the reconnect to the NNTP server can result in multiple, hanging
242: ## connections and, thus, ultimately a refusal by the server to download new messages.
243:     if ( $nntp_server and ( $is_interactive_p eq 'YES' ) ) {
244:         &print_to_log("Connecting to NNTP server $nntp_server...");
245:         eval { $nntp = Net::NNTP->new($nntp_server) };
246:         if ( $@ ) {
247:         &print_to_log("ERROR: Couldn't connect to NNTP server...");
248:         return;
249:         }
250:         &print_to_log("Logging in to NNTP server $nntp_server...");
251:         if ( $nntp_uid and $nntp_password ) {
252:         eval { $nntp->authinfo($nntp_uid,$nntp_password) };
253:         if ( $@ ) {
254:             &print_to_log("ERROR: Couldn't log in to NNTP server...");
255:             return;
256:         }
257:         }
258:     }
259: ## create the output directory
260:     if (-d $output_directory) { rmtree($output_directory,0,1); }
261:     mkdir($output_directory);
262: ## create a place to put the interface
263:     $interface_directory = $output_directory.$slash.'Interface';
264:     mkdir($interface_directory);
265: ## copy the face images into the interface subdirectory
266:     $face_images_directory = $interface_directory.$slash.'FaceImages';
267:     mkdir($face_images_directory);
268:     my $fimage_file;
269:     opendir FID, 'Resources'.$slash.'FaceImages' or die "Cannot open FaceImages: $!";
270:     my $number_of_face_images = 0;
271:     foreach $fimage_file (readdir FID) {
272:         $number_of_face_images++;
273:         copy('Resources'.$slash.'FaceImages'.$slash.$fimage_file,$face_images_directory.$slash.$fimage_file);
274:     }
275:     closedir FID;
276: ## Three images needed for every face (left, right and front);
277:     $number_of_faces = $number_of_face_images / 3;
278: ## Copy the forward and rewind button images into the Interface directory.
279:     if (-e 'Resources'.$slash.'forward.gif') {
280:         copy('Resources'.$slash.'forward.gif',$interface_directory.$slash.'forward.gif');
281:     }
282:     else { die "Can't find GIF file for forward button\n"; }
283:     if (-e 'Resources'.$slash.'rewind.gif') {
284:         copy('Resources'.$slash.'rewind.gif',$interface_directory.$slash.'rewind.gif');
285:     }
286:     else { die "Can't find GIF file for rewind button\n"; }
287: ## Copy the help files into the Interface directory.
288:     if ( (-e 'Resources'.$slash.'help-en.html')
289:          and (-e 'Resources'.$slash.'help-fr.html')
290:          and (-e 'Resources'.$slash.'help-de.html') ) {
291:         &rewrite_help_file('Resources'.$slash.'help-en.html',$interface_directory.$slash.'help-en.html');
292:         &rewrite_help_file('Resources'.$slash.'help-fr.html',$interface_directory.$slash.'help-fr.html');
293:         &rewrite_help_file('Resources'.$slash.'help-de.html',$interface_directory.$slash.'help-de.html');
294:     }
295:     else { die "Can't find help files\n"; }
296: ## create a place to put the messages and message indices
297:     $message_directory = $interface_directory.$slash.'Messages';
298:     mkdir($message_directory);
299: ## create a file name for a message index to be consulted by CGI scripts
300:     $message_index_file_name = $message_directory.$slash.'message_index';
301: ## Copy the style sheet into the output directory and the CGI directory
302:     if (-e 'Resources'.$slash.'message_style.css') {
303:         copy('Resources'.$slash.'message_style.css',$message_directory.$slash.'message_style.css');
304:         copy('Resources'.$slash.'message_style.css',$cgi_directory.$slash.'message_style.css');
305:     }
306:     else { die "Can't find message_style.css file\n"; }
307: ## Copy the style sheet into the interface directory.
308:     if (-e 'Resources'.$slash.'interface_style.css') {
309:         copy('Resources'.$slash.'interface_style.css',$interface_directory.$slash.'interface_style.css');
310:     }
311:     else { die "Can't find interface_style.css file\n"; }
312:     }
313:     ## initializations done only before the second (or later) batch of messages is processed
314:     else {
315: ## Replace the interface webpages of the last batch with blank pages.
316:     &create_blank_pages();
317: ## Remove the messages from the last batch.
318:     if (-d $message_directory) { rmtree($message_directory,0,1); }
319: ## Recreate a place to put the messages and message indices.
320:     mkdir($message_directory);
321:     copy('Resources'.$slash.'message_style.css',$message_directory.$slash.'message_style.css');
322:     &reorganize_archive();
323:     }
324:     undef %analyzed_p;
325:     $time_of_last_nntp_server_check = 0;
326:     $current_simplified_message_id = 0;
327:     $current_simplified_author_id = 0;
328:     $frame_number = 0;
329:     $current_simplified_sentence_id = 0;
330:     $max_score = 1;
331:     $min_score = 1;
332:     $current_winner = 0;
333:     $previous_winner = 0;
334:     $previous_winning_theme = 'previous';
335:     $current_winning_theme = 'current';
336:     $current_winning_sentence = '';
337:     $social_network_index = 0;
338:     $new_messages_downloaded = 0;
339:     %new_social_network = ();
340:     %old_social_network = ();
341:     %link = ();
342:     %sn2saids = ();
343:     %said2sn = ();
344:     %previous_sn2saids = ();
345:     %said2previous_sn = ();
346:     %sn2theta = ();
347:     %ssid2start = ();
348:     %ssid2end = ();
349:     %ssid2sentence = ();
350:     %ssid2smid = ();
351:     %smid2ssids = ();
352:     %token2stem = ();
353:     %stem2tokens = ();
354:     %stem2ssids = ();
355:     %ssid2stems = ();
356:     %citers = ();
357:     %repliers = ();
358:     %mid2smid = ();
359:     %smid2mid = ();
360:     %smid2sdt = ();
361:     %ana2said = ();
362:     %said2ana = ();
363:     %smid2said = ();
364:     %said2smids = ();
365:     %smid2nsl = ();
366:     %nsl2smids = ();
367:     %nsl2stems = ();
368:     %smid2newsgroups = ();
369:     %smid2date = ();
370:     %smid2subject = ();
371:     %mid2pid = ();
372:     %smid2spid = ();
373:     %spid2smids = ();
374:     %ancestors = ();
375:     %smid2fptrs = ();
376:     %said2lines = ();
377:     %tline2smidslns = ();
378:     %smidln2tline = ();
379:     %signature = ();
380:     %line_numbers_of_signature = ();
381:     %source = ();
382:     %smidln2source = ();
383:     %smid2sources = ();
384:     %prefix = ();
385:     %said2addr = ();
386:     @stop_words = ();
387:     %said2theta = ();
388:     %said2previous_theta = ();
389:     %said2theta_movement = ();
390:     %said2rho = ();
391:     %said2previous_rho = ();
392:     %said2rho_movement = ();
393:     %said2image_id = ();
394:     %said_is_speaker = ();
395:     %isa_quotation = ();
396:     %said2percentage_score = ();
397:     %said2previous_percentage_score = ();
398:     %said2score = ();
399:     %said2previous_score = ();
400:     %winning_sentences = ();
401:     %yahooid = ();
402:
403:     return;
404: }
405:
406:
407: ## REWRITE_HELP_FILE
408: ##
409: sub rewrite_help_file
410: {
411:     my($template_file,$specific_file) = @_;
412:
413:     my(@line,@new_line,$new_line,$word);
414:
415:     open(TEMPLATE,$template_file);
416:     open(HELP,'>'.$specific_file);
417:     while($line = <TEMPLATE>) {
418:     chomp($line);
419:     @line = split /\s+/,$line;
420:     @new_line = ();
421:     foreach $word (@line) {
422:         if ( $word =~ /(.*)Newsgroup(.*)/ ) { push(@new_line,$1.$news_group.$2); }
423:         elsif ( $word =~ /(.*)PauseBetweenFetches(.*)/ ) { push(@new_line,$1.$pause.$2); }
424:         else { push(@new_line,$word); }
425:     }
426:     $new_line = join ' ',@new_line;
427:     print HELP $new_line."\n";
428:     }
429:     close(TEMPLATE);
430:     close(HELP);
431: }
432:
433:
434: ## REWRITE_SEND_SCRIPT_FILE
435: ##
436: sub rewrite_send_script_file
437: {
438:     my($template_file,$specific_file) = @_;
439:
440:     my(@line,@new_line,$new_line,$word);
441:
442:     open(TEMPLATE,$template_file);
443:     open(SEND,'>'.$specific_file);
444:     while($line = <TEMPLATE>) {
445:     chomp($line);
446:     if ( $line =~ /NNTPServer/ ) { $new_line = '$nntp_server = '."'".$nntp_server."';"; }
447:     elsif ( $line =~ /NNTPUID/ ) { $new_line = '$nntp_uid = '."'".$nntp_uid."';"; }
448:     elsif ( $line =~ /NNTPPassword/ ) { $new_line = '$nntp_password = '."'".$nntp_password."';"; }
449:     else { $new_line = $line; }
450:     print SEND $new_line."\n";
451:     }
452:     close(TEMPLATE);
453:     close(SEND);
454: }
455:
456:
457: ## PROCESS_NEXT_MESSAGE
458: ##
459: ## Input: An integer: a pointer indicating the start of the current message in the archive.
460: ##
461: ## Effects: Several hashes are created.
462: ##
463: ## Output: An integer: a pointer indicating the current end of the file
464: ## containing the messages.
465: ##
466: ## Notes: The file handle is for a file of messages separated by __end_of_message__marker__
467: ## Important: no whitespace can follow the __end_of_message_marker_!
468: ## The file pointer indicates where, in the file of message, reading
469: ## of the messages should begin.
470: ##
471: sub process_next_message
472: {
473:     my($start) = @_;
474:
475:     my($message,$message_body,$end,%field_values,$field);
476:     my($marker_length,$message_id,$source_smid,@source_smids,$simplified_message_id);
477:     my($simplified_author_id);
478:
479:     open(ARCHIVEFILE,$archive_file_name) || die "Can't open archive file\, $archive_file_name\, $! \n";
480:     $marker_length = length($end_of_message_marker);
481:     seek(ARCHIVEFILE,$start,0);
482:     $/ = $end_of_message_marker;
483:     $message = <ARCHIVEFILE>;
484:     $end = tell(ARCHIVEFILE);
485:     close(ARCHIVEFILE);
486:     if ( $message ) {
487:     undef %field_values;
488:     %field_values = &extract_fields_and_values_from_message($message);
489:     $message_body = '';
490:     if ( $message =~ m/\n\n/g ) {
491:         $message_body = substr($message,pos($message),length($message)-(pos($message)+$marker_length));
492:     }
493: ## If the message is a cancel message or if it does not have an id, then skip it.
494:     if ( ( defined($field_values{'control:'})
495:            and ( $field_values{'control:'} =~ /cancel/i ) )
496:          or (not defined($field_values{'message-id:'}) ) ) {
497:         $frame_number--;
498:         return($end);
499:     }
500: ## Downcase the message-id and replace any spaces with underscores
501:     $field_values{'message-id:'} = lc($field_values{'message-id:'});
502:     $field_values{'message-id:'} =~ tr/ /_/;
503: ## Make sure the message is not a duplicate of one already in the database.
504:     if ( defined($analyzed_p{$field_values{'message-id:'}}) ) {
505:         $frame_number--;
506:         return($end);
507:     }
508:     $analyzed_p{$field_values{'message-id:'}} = 1;
509:     $simplified_message_id = &assert_simplified_message_id($field_values{'message-id:'});
510:     $message_id = $field_values{'message-id:'};
511:     &assert_simplified_date_and_time($simplified_message_id,$field_values{'date:'});
512:     $simplified_author_id = &assert_simplified_author_id($field_values{'from:'});
513:     &assert_message_authorship($simplified_message_id,$simplified_author_id);
514:     &assert_normalized_subject_line($simplified_message_id,$field_values{'subject:'});
515:     &assert_message_header($simplified_message_id,
516:                    $field_values{'newsgroups:'},
517:                    $field_values{'date:'},
518:                    $field_values{'subject:'});
519:     if (not(defined $field_values{'references:'})) { $field_values{'references:'} = ' '; }
520:     if (not(defined $field_values{'in-reply-to:'})) { $field_values{'in-reply-to:'} = ' '; }
521:     &assert_reference_relation($field_values{'message-id:'},
522:                    $field_values{'references:'},
523:                    $field_values{'in-reply-to:'});
524:     &assert_file_pointers($simplified_message_id,$start,$end);
525:     &assert_message_body_authorship($simplified_author_id,$simplified_message_id,$message_body);
526:     &assert_threading_relations($message_id);
527:     &assert_line_positions($simplified_message_id);
528:     &assert_signature($simplified_author_id);
529:     &assert_sources($simplified_message_id);
530:     &assert_sentences($simplified_message_id);
531:     &write_html_for_message($simplified_message_id);
532:     if (defined($smid2sources{$simplified_message_id}) ) {
533:         @source_smids = @{$smid2sources{$simplified_message_id}};
534:     }
535:     else { @source_smids = (); }
536:     push(@source_smids,$smid2spid{$simplified_message_id});
537:     foreach $source_smid (@source_smids) { &write_html_for_message($source_smid); }
538:     &update_social_networks($simplified_message_id,@source_smids);
539:     &update_scores();
540:     &update_themes($simplified_message_id);
541:     &write_html_for_interface($simplified_message_id,@source_smids);
542:     &write_entry_in_message_index($simplified_message_id);
543:     return($end);
544:     }
545:     else { return($start); }
546: }
547:
548:
549: ## UPDATE_SOCIAL_NETWORKS
550: ##
551: ## Input: A simplified message id (integer) and a list of other
552: ## simplified message ids (integers) that represent either
553: ## the messages quoted or replied to by the first message.
554: ##
555: ## Effects: Three hashes are updates: %link, %social_network2saids,
556: ## and %said2sn. The elements $link(A,B) and
557: ## $link(B,A) are added if B previously cited or replied to
558: ## A, and now, in the current message A has either cited or
559: ## replied to B. (This information is stored in the
560: ## hashes %repliers and %citers updated in other functions.)
561: ## If any new links are added, the social networks for A and
562: ## B are checked. If they are the same, then nothing is done.
563: ## If they are different (and, thus, the new link constitutes
564: ## a bridge between the two social networks), the smaller of
565: ## the two social networks is dissolved and its elements are
566: ## added into the larger social network.
567: ##
568: ## Output: none
569: ##
570: sub update_social_networks
571: {
572:     my($smid,@source_smids) = @_;
573:
574:     my($said,$s_said,$weight_from_said,$weight_from_s_said,@new_links);
575:     my($acquiring_social_network,$disappearing_social_network,$ss);
576:     my($isa_member_of_new_social_network,$sni);
577:
578:     ## For each player, save their current social network. The update
579:     ## may change some of all of the players' social networks. The
580:     ## previous positions of the players is necessary for updating
581:     ## the interface.
582:     undef %said2previous_sn;
583:     undef %previous_sn2saids;
584:     foreach $ss (keys %said2sn) {
585:     $said2previous_sn{$ss} = $said2sn{$ss};
586:     ${%{$previous_sn2saids{$said2sn{$ss}}}}{$ss} = 1;
587:     }
588:     $said = $smid2said{$smid};
589:     ## If the author of the current message is appearing for the first
590:     ## time, s/he will constitute a separate new social network.
591:     ## Since it is not possible for one to join another
592:     ## network with just one post, the new network will not be
593:     ## joined to any existing networks and, consequently, the subsequent
594:     ## code can be skipped.
595:     $isa_member_of_new_social_network = 0;
596:     foreach $sni (keys %new_social_network) {
597:     if ( ${%{$sn2saids{$sni}}}{$said} ) {
598:         $isa_member_of_new_social_network = 1;
599:         last;
600:     }
601:     }
602:     if ( $isa_member_of_new_social_network ) { return; }
603:     ## Otherwise, the new message comes from someone who is already in an
604:     ## existing social network. Test to see if the message constitutes
605:     ## a new link between two, as yet separate, social networks.
606:     else {
607:     @new_links = ();
608:     if ( defined($smid2said{$smid})
609:          and defined($said2sn{$said}) ) {
610:         foreach $s_smid (@source_smids) {
611:         $s_said = $smid2said{$s_smid};
612:   ## If the author of the cited or replied to message
613:   ## has ever cited or replied to the author of the
614:   ## current message, then the two authors should be
615:   ## linked together in a social network.
616:         if ( defined($smid2said{$s_smid})
617:              and ( defined(${%{$citers{$said}}}{$s_said})
618:                or defined(${%{$repliers{$said}}}{$s_said}) ) ) {
619:             if ( not(defined($link{$said,$s_said})) ) { push(@new_links,$s_said); }
620:             $weight_from_s_said = 0;
621:             if ( defined(${%{$citers{$said}}}{$s_said}) ) {
622:             $weight_from_s_said += ${%{$citers{$said}}}{$s_said};
623:             }
624:             if ( defined(${%{$repliers{$said}}}{$s_said}) ) {
625:             $weight_from_s_said += ${%{$repliers{$said}}}{$s_said};
626:             }
627:             $weight_from_said = 0;
628:             if ( defined(${%{$citers{$s_said}}}{$said}) ) {
629:             $weight_from_said += ${%{$citers{$s_said}}}{$said};
630:             }
631:             if ( defined(${%{$repliers{$s_said}}}{$said}) ) {
632:             $weight_from_said += ${%{$repliers{$s_said}}}{$said};
633:             }
634:             if ( $weight_from_s_said < $weight_from_said ) {
635:             $link{$said,$s_said} = $weight_from_s_said;
636:             $link{$s_said,$said} = $weight_from_s_said;
637:             }
638:             else {
639:             $link{$said,$s_said} = $weight_from_said;
640:             $link{$s_said,$said} = $weight_from_said;
641:             }
642:         }
643:         }
644:         foreach $s_said (@new_links) {
645:         if ( defined($said2sn{$s_said})
646:              and ( $said2sn{$s_said} != $said2sn{$said} ) ) {
647:       ## The smaller social network joins the larger one and the small
648:       ## network is dissolved.
649:             if ( scalar(keys %{$sn2saids{$said2sn{$said}}})
650:              > scalar(keys %{$sn2saids{$said2sn{$s_said}}}) ) {
651:             $acquiring_social_network = $said2sn{$said};
652:             $disappearing_social_network = $said2sn{$s_said};
653:             }
654:             else {
655:             $acquiring_social_network = $said2sn{$s_said};
656:             $disappearing_social_network = $said2sn{$said};
657:             }
658:             foreach $ss (keys %{$sn2saids{$disappearing_social_network}}) {
659:             $said2sn{$ss} = $acquiring_social_network;
660:             ${%{$sn2saids{$acquiring_social_network}}}{$ss} = 1;
661:             }
662:             delete($sn2saids{$disappearing_social_network});
663:       ## Note: these might chain: e.g., network 1 might disappear into
664:       ## network 2; and, then network 2 disappear into network 3; etc.
665:       ## all in one move (i.e., all after the posting of just one
666:       ## message if it cites and/or replies to multiple messages
667:       ## from multiple social networks.
668:             $old_social_network{$disappearing_social_network} = $acquiring_social_network;
669:         }
670:         }
671:         &update_old_social_networks();
672:     }
673:     return;
674:     }
675: }
676:
677:
678: ## UPDATE_OLD_SOCIAL_NETWORKS
679: ##
680: sub update_old_social_networks
681: {
682:     my($acquiring_social_network,$osn);
683:
684:     ## If any social networks are being merged into other social networks,
685:     ## make sure that none of the acquiring networks (i.e., those networks
686:     ## that will absorb the old networks) are, themselves old networks. This
687:     ## might happen, because the merging process can "chain." see the note in the
688:     ## update_social_networks subroutine. Thus, the ultimate acquiring network
689:     ## will be that one that does not appear as an old network.
690:     if (keys %old_social_network) {
691:     $acquiring_social_network = -1;
692:     foreach $osn (keys %old_social_network) {
693:         if ( not(defined($old_social_network{$old_social_network{$osn}})) ) {
694:         $acquiring_social_network = $old_social_network{$osn};
695:         last;
696:         }
697:     }
698:     if ( $acquiring_social_network != -1 ) {
699:      ## Rewrite the %old_social_network hash so that
700:      ## every old network points to the final acquiring network.
701:      ## Note that, since this is updated after every message
702:      ## posted to the group, we need only be concerned with what
703:      ## can happen to the social networks with the addition of
704:      ## one message and one author (i.e., the player posting the
705:      ## message). Since a player can only be a member of one
706:      ## social network there can only be, in principle, one
707:      ## acquiring network, even if the one post dissolves
708:      ## the distinctions between several existing networks.
709:         foreach $osn (keys %old_social_network) {
710:         $old_social_network{$osn} = $acquiring_social_network;
711:         }
712:     }
713:     }
714:     return;
715: }
716:
717:
718: ## UPDATE_THEMES
719: ##
720: sub update_themes
721: {
722:     my($current_smid) = @_;
723:
724:     my($current_said,$sni,%subject_stem2saids,$smid,$said,$stem,%saids,$ssid);
725:     my(@shared_subject_stems,%sentence_stem2saids,@shared_sentence_stems);
726:
727:     $current_said = $smid2said{$current_smid};
728:     $sni = $said2sn{$current_said};
729:     ## If the poster of the current message is not a member of
730:     ## a social network, then return.
731:     if ( scalar(keys %{$sn2saids{$sni}}) <= 1 ) {
732:     $current_winning_theme = '';
733:     $current_winning_sentence = '';
734:     return;
735:     }
736:     undef %subject_stem2saids;
737:     ## Otherwise, examine all of the subject lines used in the thread from the
738:     ## current message back to the first post in the thread.
739:     foreach $smid (&smids_of_ancestors($current_smid)) {
740:     if ( defined($smid2said{$smid}) ) {
741:         $said = $smid2said{$smid};
742:         if ( defined($smid2nsl{$smid})
743:          and defined ($nsl2stems{$smid2nsl{$smid}}) ) {
744:         foreach $stem (@{$nsl2stems{$smid2nsl{$smid}}}) {
745:             push(@{$subject_stem2saids{$stem}},$said);
746:         }
747:         }
748:     }
749:     }
750:     ## Remove the duplicates in the %subject_stem2saids entries.
751:     foreach $stem (keys %subject_stem2saids) {
752:     undef %saids;
753:     foreach $said (@{$subject_stem2saids{$stem}}) { $saids{$said} = 1; }
754:     @{$subject_stem2saids{$stem}} = keys %saids;
755:     }
756:     @shared_subject_stems = ();
757:     foreach $stem (keys %subject_stem2saids) { push(@shared_subject_stems,$stem); }
758:     ## After the sort below, the first shared stem should be the stem employed by the
759:     ## greatest number of people in the thread in the subject line of their messages.
760:     @shared_subject_stems = sort { $#{$subject_stem2saids{$b}} <=> $#{$subject_stem2saids{$a}} } @shared_subject_stems;
761:     ## Hunt for a sentence in the current post that contains one of the frequent
762:     ## terms from the subject lines of the thread.
763:     foreach $stem (@shared_subject_stems) {
764:     foreach $ssid (@{$smid2ssids{$current_smid}}) {
765:         if (not $isa_quotation{$ssid}) {
766:         if ( grep { $_ eq $stem } @{$ssid2stems{$ssid}} ) {
767:             if ( defined($stem2tokens{$stem}) ) {
768:             $current_winning_theme = ${@{$stem2tokens{$stem}}}[0];
769:             }
770:             else { $current_winning_theme = $stem; }
771:             $current_winning_sentence = $ssid2sentence{$ssid};
772:             push(@{$winning_sentences{$stem}},$ssid);
773:             return;
774:         }
775:         }
776:     }
777:     }
778:     ## If no winning sentence was found that might represent one of the themes
779:     ## mentioned in the subject lines, then look into the body of the messages
780:     ## for themes not mentioned in the subject lines and try again to find a
781:     ## winning sentence.
782:     undef %sentence_stem2saids;
783:     ## Examine all of the words used in the thread from the
784:     ## current message back to the first post in the thread.
785:     foreach $smid (&smids_of_ancestors($current_smid)) {
786:     if ( defined($smid2said{$smid}) ) {
787:         $said = $smid2said{$smid};
788:         foreach $ssid (@{$smid2ssids{$smid}}) {
789:         if (not $isa_quotation{$ssid}) {
790:             foreach $stem (@{$ssid2stems{$ssid}}) {
791:             push(@{$sentence_stem2saids{$stem}},$said);
792:             }
793:         }
794:         }
795:     }
796:     }
797:     ## Remove the duplicates in the %sentence_stem2saids entries.
798:     foreach $stem (keys %sentence_stem2saids) {
799:     undef %saids;
800:     foreach $said (@{$sentence_stem2saids{$stem}}) { $saids{$said} = 1; }
801:     @{$sentence_stem2saids{$stem}} = keys %saids;
802:     }
803:     @shared_sentence_stems = ();
804:     foreach $stem (keys %sentence_stem2saids) { push(@shared_sentence_stems,$stem); }
805:     ## After the sort below, the first shared stem should be the stem employed by the
806:     ## greatest number of people in the thread in their own sentences.
807:     @shared_sentence_stems = sort { $#{$sentence_stem2saids{$b}} <=> $#{$sentence_stem2saids{$a}} } @shared_sentence_stems;
808:     ## Hunt for a sentence in the current post that contains one of the frequent
809:     ## terms from the subject lines of the thread.
810:     foreach $stem (@shared_sentence_stems) {
811:     foreach $ssid (@{$smid2ssids{$current_smid}}) {
812:         if (not $isa_quotation{$ssid}) {
813:         if ( grep { $_ eq $stem } @{$ssid2stems{$ssid}} ) {
814:             if ( defined($stem2tokens{$stem}) ) {
815:             $current_winning_theme = ${@{$stem2tokens{$stem}}}[0];
816:             }
817:             else { $current_winning_theme = $stem; }
818:             $current_winning_sentence = $ssid2sentence{$ssid};
819:             push(@{$winning_sentences{$stem}},$ssid);
820:             return;
821:         }
822:         }
823:     }
824:     }
825:     ## Otherwise, nothing was found and the winning theme and sentence are
826:     ## set to nil.
827:     $current_winning_theme = '';
828:     $current_winning_sentence = '';
829:     return;
830: }
831:
832:
833: ## ASSERT_SENTENCES
834: ##
835: sub assert_sentences
836: {
837:     my($smid) = @_;
838:
839:     my(@signature_lines,$message_text,$ln,$sentences,$sentence);
840:     my($token,@sentence,@modified_sentence,@line_numbers,$said,$stem);
841:     my(%stem_of_modified_sentence,$ssid);
842:
843:     ## Find which lines are signature lines: these will be skipped.
844:     @signature_lines = sort {$a <=> $b} &find_line_numbers_of_signature($smid);
845:     $message_text = '';
846:     $ln = 0;
847:     while (exists $smidln2line{"$smid $ln"}) {
848:     if ( not(grep {$_ == $ln} @signature_lines) ) {
849:         $message_text .= ' __line_number_'.$ln.'__ ';
850:         if (defined($smidln2tline{"$smid $ln"})) {
851:         $message_text .= $smidln2tline{"$smid $ln"};
852:         }
853:     }
854:     $ln++;
855:     }
856:     ## Get the sentences from the text of the message body.
857:     $sentences=get_sentences($message_text);
858:     $ln = 0;
859:     foreach $sentence (@{$sentences}) {
860:     $current_simplified_sentence_id++;
861:     $ssid = $current_simplified_sentence_id;
862:     $ssid2smid{$ssid} = $smid;
863:     push(@{$smid2ssids{$smid}},$ssid);
864:     @sentence = split /\s+/,$sentence;
865:     @modified_sentence = ();
866:     @line_numbers = $ln;
867:     foreach $token (@sentence) {
868:         if ( $token =~ /__line_number_(\d+)__/ ) {
869:         push(@line_numbers,$1);
870:         }
871:         else { push(@modified_sentence,$token); }
872:     }
873:     $ssid2sentence{$ssid} = join ' ',@modified_sentence;
874:     @line_numbers = sort {$a <=> $b} @line_numbers;
875: ## assert on which lines the sentence starts and ends
876:     $ssid2start{$ssid} = $line_numbers[0];
877:     $ssid2end{$ssid} = $line_numbers[$#line_numbers];
878: ## determine whether or not the sentence is a quotation
879:     $isa_quotation{$ssid} = 0;
880:     foreach $ln (@line_numbers) {
881:         if ( $smidln2source{"$smid $ln"} ne "$smid $ln" ) { $isa_quotation{$ssid} = 1; }
882:     }
883:     $ln = $ssid2end{$ssid};
884:     @modified_sentence = &normalize_sentence(join ' ',@modified_sentence);
885:     $said = $smid2said{$smid};
886:     undef %stem_of_modified_sentence;
887:     foreach $stem (@modified_sentence) {
888:      ## Only count a stem once per sentence.
889:         if ( not(defined($stem_of_modified_sentence{$stem})) ) {
890:         $stem_of_modified_sentence{$stem} = 1;
891:         push(@{$stem2ssids{$stem}},$ssid);
892:         push(@{$ssid2stems{$ssid}},$stem);
893:         }
894:     }
895:     }
896:     return;
897: }
898:
899:
900: ## EXTRACT_FIELDS_AND_VALUES_FROM_MESSAGE
901: ##
902: ## Input: A string containing a message formatted according to
903: ## RFC1036.
904: ##
905: ## Effects: Fill the hash %field_values; the keys of the hash are
906: ## RFC1036-defined fields.
907: ##
908: ## Output: A hash: %field_values
909: ##
910: sub extract_fields_and_values_from_message
911: {
912:     my($message) = @_;
913:
914:     my($field,$message_header,%field_values,$input_record_separator);
915:     my($message_header_line,@message_header_lines);
916:     my(@rewritten_message_header_lines,$last_line);
917:     my @rfc1036_fields = (
918:     'from:',
919:     'subject:',
920:     'date:',
921:     'message-id:',
922:     'control:',
923:     'references:',
924:     'in-reply-to:',
925:     'organization:',
926:     'newsgroups:',
927:     'content-type:'
928:     );
929:     $message_header = '';
930:     if ( $message =~ m/\n\n/g ) { $message_header = substr($message,0,pos($message)); }
931:     ## Make sure that the input record separator is set to "\n"
932:     $input_record_separator = $/;
933:     $/ = "\n";
934:     ## Remove any newlines in the values
935:     @message_header_lines = split /\n/,$message_header;
936:     $last_line = '';
937:     foreach $message_header_line (@message_header_lines) {
938:     if ( $message_header_line =~ /^(\S+\:).*$/ ) {
939:         if ( $last_line ) { push(@rewritten_message_header_lines,$last_line); }
940:         $last_line = $message_header_line;
941:     }
942:     else { $last_line .= $message_header_line; }
943:     }
944:     if ( $last_line ) { push(@rewritten_message_header_lines,$last_line); }
945:     foreach $field (@rfc1036_fields) {
946:     foreach $message_header_line (@rewritten_message_header_lines) {
947:         if ($message_header_line =~ /^$field\s*(.*)\s*$/i) {
948:         $field_values{$field} = $1;
949:         last;
950:         }
951:     }
952:     }
953:     $/ = $input_record_separator;
954:     return (%field_values);
955: }
956:
957:
958: ## ASSERT_SIMPLIFIED_MESSAGE_ID
959: ##
960: ## Input: A string: message_id
961: ##
962: ## Effects: Update the two hashes %mid2smid and %smid2mid
963: ## message_id <--> simplified_message_id
964: ## message_ids are the ones that appear in the messages;
965: ## simplified_message_ids are integers
966: ## If message_id is a new id, then the variable,
967: ## $current_simplified_message_id, is incremented.
968: ##
969: ## Output: An integer: the current value of the variable
970: ## $current_simplified_message_id
971: ##
972: sub assert_simplified_message_id
973: {
974:     my($message_id) = @_;
975:
976:     if (not(defined $mid2smid{$message_id})) {
977:     $current_simplified_message_id++;
978:     $mid2smid{$message_id} = $current_simplified_message_id;
979:     $smid2mid{$current_simplified_message_id} = $message_id;
980:     }
981:     return($current_simplified_message_id);
982: }
983:
984:
985: ## ASSERT_SIMPLIFIED_DATE_AND_TIME
986: ##
987: ## Input: Two strings: simplified_message_id
988: ## and unsimplified_date_and_time
989: ##
990: ## Effects: Update the hash %smid2sdt
991: ## simplified_message_id --> simplified_date_and_time
992: ## simplified_date_and_time is an integer indicating
993: ## the date and time when the message was posted
994: ## (measured in seconds since the epoch, where the
995: ## epoch is 1/1/1970 GMT).
996: ##
997: ## Output: None
998: ##
999: ## Notes: This function uses the Date::Manip package;
1000: ## specifically the methods ParseDate() and UnixDate().
1001: ##
1002: sub assert_simplified_date_and_time
1003: {
1004:     my($simplified_message_id,$unsimplified_date_and_time) = @_;
1005:
1006:     my($date,$seconds);
1007:
1008:     $seconds = -1;
1009:     if (not(exists $smid2sdt{$simplified_message_id})) {
1010:     $date = ParseDate($unsimplified_date_and_time);
1011:     if (!$date) {
1012:         print STDERR "Unparsable date: $unsimplified_date_and_time\n";
1013:     }
1014:     else {
1015:         $seconds = &UnixDate($date,"%s");
1016:         $smid2sdt{$simplified_message_id} = $seconds;
1017:     }
1018:     }
1019:     return;
1020: }
1021:
1022:
1023: ## ASSERT_SIMPLIFIED_AUTHOR_ID
1024: ##
1025: ## Input: A string: an author name and/or email address;
1026: ##
1027: ## Effects: Update the %ana2said and %said2ana hashes:
1028: ## author_name_and_address <--> simplified_author_id
1029: ## simplified_author_ids are integers
1030: ## If author is not in the database, then the variable,
1031: ## $current_simplified_author_id, is incremented.
1032: ##
1033: ## Output: An integer: the current value of the variable
1034: ## $current_simplified_author_id, or the
1035: ## simplified author id previously assigned to the
1036: ## given author name.
1037: ##
1038: ## Notes: The author name and/or email address is unmimed using
1039: ## the method unmime from the package MIME::WordDecoder.
1040: ## Thus, any subsequent fetches from the database assume
1041: ## that all RFC-1522 encoded words in the author name and address
1042: ## have been decoded to the local representation (which
1043: ## should be UTF-8 characters). Note also that all quotation
1044: ## marks are removed from the author's name and address.
1045: ## Also, some ISPs or email clients add a unique id to the
1046: ## author's email address with each post
1047: ## (e.g., wsack+123456@media.mit.edu). The unique id is
1048: ## stripped out of the email address.
1049: ##
1050: sub assert_simplified_author_id
1051: {
1052:     my($author_name_and_address) = @_;
1053:
1054:     my($unmimed_author_name_and_address);
1055:
1056:     $unmimed_author_name_and_address = unmime($author_name_and_address);
1057:     ## Delete quotation marks.
1058:     $unmimed_author_name_and_address =~ s/"//g;
1059:     ## Delete any trailing whitespace.
1060:     $unmimed_author_name_and_address =~ s/\p{IsSpace}+$//;
1061:     ## Delete any unique id added by ISP
1062:     if ( $unmimed_author_name_and_address =~ /^([^+]+)\+[0-9]+@(.+)$/ ) {
1063:     $unmimed_author_name_and_address = $1.'@'.$2;
1064:     }
1065:     if (not(exists $ana2said{$unmimed_author_name_and_address})) {
1066:     $current_simplified_author_id++;
1067:     $ana2said{$unmimed_author_name_and_address} = $current_simplified_author_id;
1068:     $said2ana{$current_simplified_author_id} = $unmimed_author_name_and_address;
1069:     &assert_said2addr($current_simplified_author_id,$unmimed_author_name_and_address);
1070:     return($current_simplified_author_id);
1071:     }
1072:     else { return($ana2said{$unmimed_author_name_and_address}); }
1073: }
1074:
1075:
1076: ## ASSERT_SAID2ADDR
1077: ##
1078: ## Input: An email name and address
1079: ##
1080: ## Effects: Store an abbreviated form of the address in the hash %said2addr
1081: ##
1082: sub assert_said2addr
1083: {
1084:     my($said,$ana) = @_;
1085:
1086:     my($user_name);
1087:
1088:     ($user_name) = split /@/,$ana;
1089:     $user_name =~ /^(.*)(<)(.*)$/; ## Find the last (rightmost) '<' on the line.
1090:     if ($2) { $user_name = $1; } ## Delete everything after the last '<'; and,
1091:     $said2addr{$said} = $user_name;
1092:     return;
1093: }
1094:
1095:
1096: ## ASSERT_MESSAGE_AUTHORSHIP
1097: ##
1098: ## Input: Two strings: (1) a simplified message id; and,
1099: ## (2) a simplified author id
1100: ##
1101: ## Effects: Update the hashes %smid2said and %said2smids
1102: ## [simplified_message_id]* <--> simplified_author_id
1103: ## These hashes keep track of (a) the author of any given
1104: ## message; and, (b) the ids of all of the messages
1105: ## authored by any given poster. If the author
1106: ## is new, assign the author to a social_network.
1107: ##
1108: ## Output: None
1109: ##
1110: sub assert_message_authorship
1111: {
1112:     my($simplified_message_id,$simplified_author_id) = @_;
1113:
1114:     if (not(exists $smid2said{$simplified_message_id})) {
1115:     $smid2said{$simplified_message_id} = $simplified_author_id;
1116:     push(@{$said2smids{$simplified_author_id}},$simplified_message_id);
1117:     }
1118:     if (not(defined($said2sn{$simplified_author_id}))) {
1119:     $social_network_index++;
1120:     $said2sn{$simplified_author_id} = $social_network_index;
1121:     ${%{$sn2saids{$social_network_index}}}{$simplified_author_id} = 1;
1122:     $new_social_network{$social_network_index} = 1;
1123:     }
1124:     return;
1125: }
1126:
1127:
1128: ## ASSERT_MESSAGE_HEADER
1129: ##
1130: ## Input: One integer and three strings: a simplified message id,
1131: ## and the newsgroups, date, and subject fields
1132: ## from a message header. These are the fields unrecorded
1133: ## in other hashes.
1134: ##
1135: ## Effects: Parts of the text of the message header is saved so
1136: ## that the message can be more easily printed.
1137: ##
1138: ## Output: None
1139: ##
1140: sub assert_message_header
1141: {
1142:     my($smid,$newsgroups,$date,$subject) = @_;
1143:
1144:     $smid2newsgroups{$smid} = $newsgroups;
1145:     $smid2date{$smid} = $date;
1146:     $smid2subject{$smid} = unmime($subject);
1147:     return;
1148: }
1149:
1150:
1151: ## DELETE_PREFIXES_FROM_SUBJECT_LINE
1152: ##
1153: ## Input: String in which all characters have been normalized and downcased.
1154: ##
1155: ## Output: String in which all prefixing "re: " and "fwd: " substrings have
1156: ## been deleted.
1157: ##
1158: sub delete_prefixes_from_subject_line
1159: {
1160:     my($subject_line) = @_;
1161:
1162:     $subject_line =~ s/re\: //gi;
1163:     $subject_line =~ s/re\://gi;
1164:     $subject_line =~ s/fw\: //gi;
1165:     $subject_line =~ s/fw\://gi;
1166:     $subject_line =~ s/fwd\: //gi;
1167:     $subject_line =~ s/fwd\://gi;
1168:     return $subject_line;
1169: }
1170:
1171:
1172: ## TOKENIZE
1173: ##
1174: ## Input: A string
1175: ##
1176: ## Output: An array of words. Words, contractions and hyphenations
1177: ## in the input string are separated and punctuation is removed.
1178: ##
1179: sub tokenize
1180: {
1181:     my($s) = @_;
1182:
1183:     $s =~ s/\P{Alnum}/ /g;
1184:     return(split /\P{Alnum}+/,$s);
1185: }
1186:
1187:
1188: ## NORMALIZE_SENTENCE
1189: ##
1190: ## Input: A string
1191: ##
1192: ## Output: A string of stemmed, tokenized words with all punctuation
1193: ## and stop words removed.
1194: ##
1195: ## Notes: Words in the input string are stemmed using the Lingua::Stem
1196: ## package. The language assumed by the stemmer must be
1197: ## specified on the command line: EN-UK, EN-US, FR, and DE are
1198: ## all possibilities. The language specified on the command
1199: ## line also determines which stop-word list is used.
1200: ##
1201: sub normalize_sentence
1202: {
1203:     my($sentence) = @_;
1204:
1205:     my($stemmed_words,$stem,@normalized_sentence,$normalized_sentence);
1206:     my(@words,$i);
1207:
1208:     foreach $token (&tokenize($sentence)) {
1209:     $token = lc($token);
1210:     if (not(exists $isa_stop_word{NFD($token)})) { push(@words,$token); }
1211:     }
1212:     $stemmed_words = $stemmer->stem(@words);
1213:     $i = 0;
1214:     foreach $stem (@{$stemmed_words}) {
1215:     if ($stem !~ /^\s*$/) {
1216:         push(@normalized_sentence,$stem);
1217:         $token2stem{$words[$i]} = $stem;
1218:         if ( not(grep {$_ eq $words[$i]} @{$stem2tokens{$stem}}) ) {
1219:         push(@{$stem2tokens{$stem}},$words[$i]);
1220:         }
1221:     }
1222:     $i++;
1223:     }
1224:     return(@normalized_sentence);
1225: }
1226:
1227:
1228: ## ASSERT_NORMALIZED_SUBJECT_LINE
1229: ##
1230: ## Input: An integer and a string: (1) a simplified message id; and,
1231: ## (2) the subject line of the message.
1232: ##
1233: ## Effects: Update the hashes %smid2nsl and %nsl2smids
1234: ## [simplified_message_id]* <--> normalized_subject_line
1235: ## simplified_message_ids map to one normalized_subject_line;
1236: ## normalized_subject_lines map to a list of simplified_message_ids
1237: ## in the normalized_subject_lines all characters are normalized
1238: ## to UTF8 format; all the words have been stemmed;
1239: ## all punctuation marks and stopwords have been removed;
1240: ## and, all "RE: ", "FWD: ", etc. prefixes have been removed.
1241: ## Also update the hash %nsl2stems so that the contents of
1242: ## the subject lines can be compared to the tokenized and stemmed
1243: ## contents of the message bodies.
1244: ##
1245: ## Output: None
1246: ##
1247: ## Notes: "RE: " and "FWD: " prefixes are removed and the unmime method
1248: ## from the MIME::WordDecoder package is applied to the words in
1249: ## the subject line before the words are tokenized and stemmed by
1250: ## the &normalize_sentence function.
1251: ##
1252: sub assert_normalized_subject_line
1253: {
1254:     my($simplified_message_id,$subject_line) = @_;
1255:
1256:     if (not(exists $smid2nsl{$simplified_message_id})) {
1257:     $subject_line = &delete_prefixes_from_subject_line(unmime($subject_line));
1258:     $smid2nsl{$simplified_message_id} = $subject_line;
1259:     push(@{$nsl2smids{$subject_line}},$simplified_message_id);
1260:     if ( not(defined($nsl2stems{$subject_line})) ) {
1261:         @{$nsl2stems{$subject_line}} = &normalize_sentence($subject_line);
1262:     }
1263:     }
1264:     return;
1265: }
1266:
1267:
1268: ## ASSERT_REFERENCE_RELATION
1269: ##
1270: ## Input: Three strings: one (unsimplified) message id and a space- or comma-delimited
1271: ## list of unsimplified message ids (the contents of the 'references:' field);
1272: ## and/or a single message id (the contents of the 'in-reply-to:' field)
1273: ##
1274: ## Effects: One hash is updated, %mid2pid, message_id --> parent message id. If the
1275: ## first message id is not the id of a message that is a reply to another
1276: ## message, then its parent message id will be 'no_parent'.
1277: ##
1278: ## Output: None
1279: ##
1280: ## Notes: Since some messages may be replies to others not in the corpus, the
1281: ## current message might be a reply to a message not yet processed. So,
1282: ## we assert the threading information here with the full message ids and
1283: ## then later, in &assert_threading_relations, the same information is
1284: ## reasserted using simplified message ids. Note that some messages use the
1285: ## 'in-reply-to:' field instead of the 'references:' field to record threading
1286: ## information. All message ids are coerced to lowercase before they are
1287: ## recorded.
1288: ##
1289: sub assert_reference_relation
1290: {
1291:     my($message_id,$references_list,$in_reply_to) = @_;
1292:
1293:     my(@references,$parent_id);
1294:
1295:     $parent_id = 'no_parent';
1296:     if ( ( $references_list =~ /^\s*$/ ) and ( $in_reply_to =~ /^\s*$/ ) ) {
1297:     $mid2pid{$message_id} = $parent_id;
1298:     }
1299:     elsif ( $references_list !~ /^\s*$/ ) {
1300:     @references = split /\s+/,$references_list;
1301: ## Some reference lists are comma- rather than space-delimited
1302:     if ( $#references == 0 ) {
1303:         $comma = ',';
1304:         @references = split /$comma/,$references_list;
1305:     }
1306:     $parent_id = $references[$#references];
1307:     if ( $parent_id =~ /^\s*$/ ) { $parent_id = 'no_parent'; }
1308:     $mid2pid{$message_id} = lc($parent_id);
1309:     }
1310:     ## else if there is a 'in-reply-to:' field, but not a 'references:' field,
1311:     ## fill in the references with the contents of the 'in-reply-to:' field.
1312:     else {
1313: ## Assume that the parent_id is that token in the field that is
1314: ## bracketed by '<' and '>'. If this is not the case, this
1315: ## will not yield the right result.
1316:     $in_reply_to =~ /(\<[^\>]+\>)/;
1317:     $parent_id = $1;
1318:     if ( $parent_id =~ /^\s*$/ ) {
1319:         print STDERR "Could not find message id in in-reply-to field: $in_reply_to.\n";
1320:         $parent_id = 'no_parent';
1321:     }
1322:     $mid2pid{$message_id} = lc($parent_id);
1323:     }
1324:     return;
1325: }
1326:
1327:
1328: ## ASSERT_THREADING_RELATIONS
1329: ##
1330: ## Input: A string: a message id
1331: ##
1332: ## Effects: The hashes %smid2spid and %spid2smid are updated
1333: ## [simplified_message_id]* <--> simplified_parent_id
1334: ## simplified_parent_ids are simplified_message_ids
1335: ## each simplified_message_id has zero or one parent
1336: ## according to whether the message was (or was not)
1337: ## written in-reply-to another message.
1338: ## Each simplified_parent_id maps to a list of
1339: ## simplified_message_ids according to the number of
1340: ## replies written in response to the posted parent
1341: ## message. A separate hash that aggregates the smid2spid
1342: ## relations by author id is also updated here: %repliers,
1343: ## records which authors have replied to a given author.
1344: ##
1345: ## Output: None
1346: ##
1347: ## Notes: This function re-writes the information stored in
1348: ## the hash %mid2pid and computed in the function
1349: ## assert_reference_relation.
1350: ##
1351: sub assert_threading_relations
1352: {
1353:     my($mid) = @_;
1354:
1355:     my($smid,$pid,$spid,$said,$psaid);
1356:
1357:     $smid = $mid2smid{$mid};
1358:     $pid = $mid2pid{$mid};
1359:     if ($pid ne 'no_parent') {
1360: ## Note: some parent messages may not be included in the processed corpus.
1361: ## The following line creates a simplified message id for such unincluded
1362: ## parent messages.
1363:     if (not (exists $mid2smid{$pid})) {
1364:         $spid = &assert_simplified_message_id($pid);
1365:         $smid2sdt{$spid} = -1;
1366:         $smid2said{$spid} = -1;
1367:         $said2ana{-1} = '';
1368:         $smid2nsl{$spid} = '';
1369:     }
1370:     $spid = $mid2smid{$pid};
1371:     if (not (exists $smid2spid{$smid})) {
1372:         $smid2spid{$smid} = $spid;
1373:         push(@{$spid2smids{$spid}},$smid);
1374:     }
1375:     if ( defined($smid2said{$spid})
1376:          and defined ($smid2said{$smid}) ) {
1377:         $said = $smid2said{$smid};
1378:         $psaid = $smid2said{$spid};
1379:         if ( $said != $psaid ) { ${%{$repliers{$smid2said{$spid}}}}{$smid2said{$smid}}++; }
1380:     }
1381:     }
1382:     ## If a message has no parent, it's entry in %smid2spid is ''.
1383:     else { $smid2spid{$smid} = ''; }
1384:     return;
1385: }
1386:
1387:
1388: ## ASSERT_FILE_POINTERS
1389: ##
1390: ## Input: Three integers: a simplified message id and
1391: ## two file pointers (a start and and end).
1392: ##
1393: ## Effects: The hash %smid2fptrs is updated:
1394: ## simpified_message_id --> start_and_end_file_pointers
1395: ## Since the original text of each message is stored in a file,
1396: ## each simplified_message_id has associated start and
1397: ## end file pointers according to where, in the file,
1398: ## the text of the message is located.
1399: ##
1400: ## Output: None
1401: ##
1402: sub assert_file_pointers
1403: {
1404:     my($simplified_message_id,$start_pointer,$end_pointer) = @_;
1405:
1406:     if (not (exists $smid2fptrs{$simplified_message_id})) {
1407:     push(@{$smid2fptrs{$simplified_message_id}},$start_pointer,$end_pointer);
1408:     }
1409:     return;
1410: }
1411:
1412:
1413: ## ASSERT_MESSAGE_BODY_AUTHORSHIP
1414: ##
1415: ## Input: Two integers (a simplified author id and a simplified message id)
1416: ## and a string (the lines of the body of a message)
1417: ##
1418: ## Effects: One hash of hashes (%said2lines) is updated; and, another hash
1419: ## is updated: %smidln2line (this allows future, quick access to each
1420: ## of the lines of a message);
1421: ## For each author a separate hash is created to record
1422: ## every line of every message posted by the author.
1423: ## Each such hash has the following form: %line2smidslns
1424: ## line_of_text --> [simplified_message_id and line_number]*
1425: ## The text of each line of each message is stored as a key
1426: ## in this hash. The values of the hash are arrays of strings.
1427: ## Each string consists of a simplified_message_id and a line
1428: ## number concatenated together, indicting where the line of text
1429: ## appeared in the corpus of messages. These hashes are used to
1430: ## identify the signature (if any) of each author. Entries for
1431: ## each of the lines of the message_body are entered in the
1432: ## authors' hash.
1433: ##
1434: ## Output: None
1435: ##
1436: sub assert_message_body_authorship
1437: {
1438:     my($said,$smid,$body) = @_;
1439:
1440:     my(%line2smidslns,$line,$ln);
1441:
1442:     if (defined $said2lines{$said}) { %line2smidslns = %{$said2lines{$said}}; }
1443:     else { %line2smidslns = (); }
1444:     $ln = 0;
1445:     foreach $line (split /\n/,$body) {
1446:     push(@{$line2smidslns{$line}},"$smid $ln");
1447:     $smidln2line{"$smid $ln"} = $line;
1448:     $ln++;
1449:     }
1450:     $said2lines{$said} = \%line2smidslns;
1451:     return;
1452: }
1453:
1454:
1455: ## ASSERT_LINE_POSITIONS
1456: ##
1457: ## Input: An integer: a simplified author id
1458: ##
1459: ## Effects: The hash %tline2smidslns is created from the hash %smidln2line
1460: ## truncated_line_of_text --> [simplified_message_id and line_number]*
1461: ## Each line of text is truncated at the beginning to remove any
1462: ## leading whitespace and/or punctuation, and then its position is
1463: ## recorded in a list of simplified_message_id and line_number pairs
1464: ## (each pair is recorded as a string with a space between the two
1465: ## integers). This hash created is later used to identify quotations.
1466: ## hashes and is used to identify quotations. Its inverse is also
1467: ## created: %smidln2tline.
1468: ## The hash %source is also created. The keys of %source are truncated
1469: ## lines of text (where all of the quoting prefixes have been removed).
1470: ## The values are pairs of simplified message ids and line numbers.
1471: ## This hash is also used to identify quotations.
1472: ##
1473: ## Output: None
1474: ##
1475: sub assert_line_positions
1476: {
1477:     my($smid) = @_;
1478:
1479:     my($said,%line2smidslns,$line,$tline,@smidslns,$smidln);
1480:     my($prefix,$ln);
1481:
1482:     $ln = 0;
1483:     while (defined $smidln2line{"$smid $ln"}) {
1484:     $line = $smidln2line{"$smid $ln"};
1485:     $tline = $line;
1486: ## Delete any leading punctuation and any trailing whitespace.
1487:     $tline =~ /^(.*)(>)(.*)$/; ## Find the last (rightmost) '>' on the line.
1488:     if ($2) {
1489:         $tline = $3; ## Delete everything up to the last '>'; and,
1490:         $prefix = $1.'>'; ## store the prefix under the smid and line number.
1491:         $prefix{$smid,$ln} = $prefix;
1492:     }
1493:     $tline =~ s/^\p{IsSpace}+//; ## Delete any whitespace at the beginning of the line.
1494:     $tline =~ s/\p{IsSpace}+$//; ## Delete any trailing whitespace.
1495: ## Repeatedly delete any whitespace or punctuation at the (possibly new) beginning of the line.
1496:     for ($tline) { 1 while s/^\P{IsWord}+//; }
1497: ## If there are any non-whitespace characters in the truncated line,
1498: ## assert its line positions.
1499:     if ( $tline =~ /\S/ ) {
1500:      ## Use the edited line as a key to the new hash. Note that the
1501:      ## hash might already contain a list for the edited line, since
1502:      ## many different lines might reduce to the same truncated form.
1503:         push(@{$tline2smidslns{$tline}},"$smid $ln");
1504:         $smidln2tline{"$smid $ln"} = $tline;
1505:      ## Assume that the oldest messages are processed first.
1506:         if (not defined($source{$tline})) { $source{$tline} = "$smid $ln"; }
1507:     }
1508:     $ln++;
1509:     }
1510:     return;
1511: }
1512:
1513:
1514: ## BY_REPETITIONS
1515: ##
1516: sub by_repetitions
1517: {
1518:     $#{@{$tline2smidslns{$a}}} <=> $#{@{$tline2smidslns{$b}}};
1519: }
1520:
1521:
1522: ## SMIDS_OF_ANCESTORS
1523: ##
1524: ## Input: An integer: a simplified message id.
1525: ##
1526: ## Effects: Cache the results in the hash %ancestors.
1527: ##
1528: ## Output: A list: the simplified message id of the
1529: ## parent, the parent's parent, etc. The hash
1530: ## %smid2spid is traversed from the input id to
1531: ## the root of the thread containing the message.
1532: ##
1533: sub smids_of_ancestors
1534: {
1535:     my($smid) = @_;
1536:
1537:     if ( not($smid) ) { return(); }
1538:     if (not exists($ancestors{$smid})) {
1539:     push(@{$ancestors{$smid}},&smids_of_ancestors($smid2spid{$smid}),$smid);
1540:     }
1541:     return(@{$ancestors{$smid}});
1542: }
1543:
1544:
1545: ## ASSERT_SOURCES
1546: ##
1547: ## Input: An integer: a simplified message id
1548: ##
1549: ## Effects: Create the hash %smidln2source.
1550: ## The keys of %smidln2source are simplified message ids and line
1551: ## numbers, as are the values. %smidln2source contains the same
1552: ## information as %source (see above, in the documentation for
1553: ## the function &assert_line_positions) except it also corrects
1554: ## some of the information in %source. Aggregated counts are
1555: ## also updated in this function: %citers records which authors
1556: ## have cited others; and, %smid2sources records all of the
1557: ## messages cited in the current message.
1558: ##
1559: ## Output: None
1560: ##
1561: sub assert_sources
1562: {
1563:     my($smid) = @_;
1564:
1565:     my($tline,$smidln,$ln,$earliest,$i,$source_smid,$source_ln);
1566:     my($i_minus_one,$i_plus_one,$prefix_above,$source_smid_above,$source_ln_above);
1567:     my($prefix_below,$source_smid_below,$source_ln_below,@tline,%prefix2sourcesmid,@ancestors);
1568:     my($start,%block_starts,$last_source_smid,$last_source_ln,$source,%current_sources);
1569:     my(@signature_lines,$source_author,$said,$source_said);
1570:
1571:     ## Correct the %source hash by examining the sources of each line and
1572:     ## its surrounding context. Process the corpus message-by-message.
1573:     ## Save the result in the hash %smidln2source.
1574:     $ln = 0;
1575:     ## find the source of each line;
1576:     while (defined $smidln2line{"$smid $ln"}) {
1577:     if (defined $smidln2tline{"$smid $ln"}) { $tline = $smidln2tline{"$smid $ln"}; }
1578: ## if no tline is associated with the smid-ln pair, then assume the line is blank.
1579:     else { $tline = ''; }
1580:     if (defined $source{$tline}) {
1581:         ($source_smid,$source_ln) = $source{$tline} =~ /^(\d+) (\d+)$/;
1582:      ## If a line has been assigned as a source another line in the
1583:      ## the same message, then reset its source to itself.
1584:         if ( $source_smid eq $smid ) { $smidln2source{"$smid $ln"} = "$smid $ln"; }
1585:         else { $smidln2source{"$smid $ln"} = $source{$tline}; }
1586:     }
1587: ## if no source exists, assign the source of a line to itself
1588:     else { $smidln2source{"$smid $ln"} = "$smid $ln"; }
1589:     $ln++;
1590:     }
1591:     ## Re-examine the sources of each line using the prefixes.
1592:     ## If a line has a prefix and has no source,
1593:     ## and the line has the same prefix as the line above, or the line below,
1594:     ## and the line above or below has a source that is another message,
1595:     ## then reassign the source of the line.
1596:     undef %prefix2sourcesmid;
1597:     for ($i = 0; $i < $ln; $i++) {
1598:     $i_minus_one = $i-1;
1599:     $i_plus_one = $i+1;
1600:     if (defined $prefix{$smid,$i}) {
1601:         ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1602:         if ( $source_smid == $smid ) {
1603:   ## Get the prefix (if any) of the line above the current line.
1604:         if ( ( $i > 0 ) and (defined $prefix{$smid,$i_minus_one}) ) {
1605:             $prefix_above = $prefix{$smid,$i_minus_one};
1606:             ($source_smid_above,$source_ln_above) = $smidln2source{"$smid $i_minus_one"} =~ /^(\d+) (\d+)$/;
1607:         }
1608:         else { $prefix_above = ''; }
1609:   ## Get the prefix (if any) of the line below the current line.
1610:         if ( ( $i < ($ln - 1) ) and (defined $prefix{$smid,$i_plus_one}) ) {
1611:             $prefix_below = $prefix{$smid,$i_plus_one};
1612:             ($source_smid_below,$source_ln_below) = $smidln2source{"$smid $i_plus_one"} =~ /^(\d+) (\d+)$/;
1613:         }
1614:         else { $prefix_below = ''; }
1615:         if ( ( $prefix_above eq $prefix{$smid,$i} ) and ( $source_smid_above != $smid ) ) {
1616:             $smidln2source{"$smid $i"} = $smidln2source{"$smid $i_minus_one"};
1617:         }
1618:         elsif ( ( $prefix_below eq $prefix{$smid,$i} ) and ( $source_smid_below != $smid ) ) {
1619:             $smidln2source{"$smid $i"} = $smidln2source{"$smid $i_plus_one"};
1620:         }
1621:         }
1622:      ## Keep track of which prefixes point to which cited messages.
1623:      ## Only assign a prefix to source_smid if the source_smid is not
1624:      ## equal to the current smid.
1625:         ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1626:         if ( $source_smid != $smid ) {
1627:         $prefix2sourcesmid{$prefix{$smid,$i}} = $source_smid;
1628:         }
1629:     }
1630:     }
1631:     ## Go through the lines again. If a line has a known prefix with a known source,
1632:     ## but the line has not been assigned a source (other than itself), then assign it the
1633:     ## source message associated with its prefix. Match the line against every line in the
1634:     ## source message to determine the exact line in the source message. Sometimes this
1635:     ## won't be possible to do, in which case the line source is simply set to 0.
1636:     ## This procedure is necessary to correctly re-assign the source of lines that
1637:     ## have wrapped or been truncated.
1638:     for ($i = 0; $i < $ln; $i++) {
1639:     if (defined $prefix{$smid,$i}) {
1640:         ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1641:         if ( ( $source_smid == $smid )
1642:          and ( defined($prefix2sourcesmid{$prefix{$smid,$i}}) )
1643:          and ( $prefix2sourcesmid{$prefix{$smid,$i}} != $smid ) ) {
1644:         $source_smid = $prefix2sourcesmid{$prefix{$smid,$i}};
1645:   ## Get lines of source message and match them against the
1646:   ## current line (if the current line is long enough).
1647:         $source_ln = 0;
1648:   ## First, make the current line into a pattern that can be matched against.
1649:         if (defined $smidln2tline{"$smid $i"}) {
1650:             $tline = $smidln2tline{"$smid $i"};
1651:             @tline = split /\s+/,$tline;
1652:       ## Only attempt to match the line if it is at least five tokens in length.
1653:             if ( $#tline >= 5 ) {
1654:    ## Delete the first two tokens from the front of the line.
1655:             for ($tline) { 1 while s/^\P{IsSpace}+//; }
1656:             for ($tline) { 1 while s/^\p{IsSpace}+//; }
1657:             for ($tline) { 1 while s/^\P{IsSpace}+//; }
1658:             for ($tline) { 1 while s/^\p{IsSpace}+//; }
1659:    ## quote all the meta characters
1660:             $tline = quotemeta($tline);
1661:    ## Now try matching the line
1662:             while (defined $smidln2line{"$source_smid $source_ln"}) {
1663:                 if ( $smidln2line{"$source_smid $source_ln"} =~ /$tline/ ) { last; }
1664:                 else { $source_ln++; }
1665:             }
1666:             }
1667:         }
1668:   ## Reassign the source
1669:         $smidln2source{"$smid $i"} = "$source_smid $source_ln";
1670:         }
1671:     }
1672:     }
1673:     ## Go through the lines one more time. If the source of a given line is
1674:     ## a signature line in the sourced message, then reassign the source of
1675:     ## the line to itself.
1676:     for ($i = 0; $i < $ln; $i++) {
1677:     ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1678:     @signature_lines = sort {$b <=> $a} &find_line_numbers_of_signature($source_smid);
1679:     if ( grep {$_ == $source_ln} @signature_lines ) { $smidln2source{"$smid $i"} = "$smid $i"; }
1680:     }
1681:     ## If the current message contains at least one line, then
1682:     ## gather the lines into blocks with the same source.
1683:     if ( not(exists($smidln2source{"$smid 0"})) ) { return; }
1684:     ($last_source_smid,$last_source_ln) = $smidln2source{"$smid 0"} =~ /^(\d+) (\d+)$/;
1685:     if ( not($last_source_smid or $last_source_ln) ) { return; }
1686:     undef %block_starts;
1687:     $start = 0;
1688:     push(@{$block_starts{$start}},0);
1689:     for ($i = 1; $i < $ln; $i++) {
1690:     if (defined $smidln2source{"$smid $i"}) {
1691:         ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1692:         if ( $source_smid or $source_ln ) {
1693:         if ( $source_smid == $last_source_smid ) { push(@{$block_starts{$start}},$i); }
1694:         else { $start = $i; }
1695:         $last_source_smid = $source_smid;
1696:         $last_source_ln = $source_ln;
1697:         }
1698:     }
1699:     }
1700:     ## find the ids of the ancestors of the message;
1701:     @ancestors = &smids_of_ancestors($smid);
1702:     ## Look through the lines again. Only recognize non-ancestor sources if
1703:     ## they are assigned to a block of lines because single, isolated lines
1704:     ## with non-ancestor sources are likely to be incorrectly assigned.
1705:     foreach $start (keys %block_starts) {
1706:     if ( defined($smidln2source{"$smid $start"}) ) {
1707:         ($source_smid,$source_ln) = $smidln2source{"$smid $start"} =~ /^(\d+) (\d+)$/;
1708:         if ( $source_smid or $source_ln ) {
1709:   ## if the source is itself or an ancestor, leave it;
1710:         if ( grep {$_ == $source_smid} @ancestors ) { next; }
1711:   ## else, if a line is by itself and has a non-ancestor source, change the source to itself.
1712:         elsif ( $#{@{$block_starts{$start}}} == 0 ) { $smidln2source{"$smid $start"} = "$smid $start"; }
1713:         }
1714:     }
1715:     }
1716:     @signature_lines = sort {$b <=> $a} &find_line_numbers_of_signature($smid);
1717:     ## Finally, record all of the messages cited by the current message.
1718:     for ($i = 0; $i < $ln; $i++) {
1719:     if ( defined($smidln2source{"$smid $i"}) ) {
1720:         ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/;
1721:      ## A source is not counted if the source is the same as the message,
1722:      ## if the line is a signature line, or if the line follows a signature line.
1723:         if ( ( $source_smid != $smid )
1724:          and ( not(@signature_lines) or ( $i < $signature_lines[0] ) )
1725:          and ( not(grep {$_ == $i} @signature_lines) ) ) {
1726:         push(@{$smid2sources{$smid}},$source_smid);
1727:         }
1728:     }
1729:     }
1730:     ## Remove any duplicates from the entry in %smid2sources
1731:     ## Also, remove any self-citations.
1732:     undef %current_sources;
1733:     foreach $source (@{$smid2sources{$smid}}) {
1734:     if ($smid2said{$source} != $smid2said{$smid}) {
1735:         $current_sources{$source} = 1;
1736:     }
1737:     }
1738:     @{$smid2sources{$smid}} = keys %current_sources;
1739:     ## Invert %smid2sources to create %source2smids
1740:     foreach $source (@{$smid2sources{$smid}}) {
1741:     if ( not(grep {$_ == $smid} @{$source2smids{$source}}) ) {
1742:         push(@{$source2smids{$source}},$smid);
1743:     }
1744:     }
1745:     ## Keep track of all of the people who cite a given author
1746:     ## (but ignore places where a player cites themself).
1747:     if ( defined($smid2said{$smid}) ) {
1748:     $said = $smid2said{$smid};
1749:     foreach $source (@{$smid2sources{$smid}}) {
1750:         if ( defined($smid2said{$source}) ) {
1751:         $source_said = $smid2said{$source};
1752:         if ( $said != $source_said ) { ${%{$citers{$source_said}}}{$said}++; }
1753:         }
1754:     }
1755:     }
1756:     return;
1757: }
1758:
1759:
1760: ## FIND_POTENTIAL_SIGNATURE_LINES
1761: ##
1762: ## Input: Two integers: a simplified author id and the number of
1763: ## messages posted by that author.
1764: ##
1765: ## Effects: None
1766: ##
1767: ## Output: A list of strings each of which are potentially
1768: ## lines of the author's signature.
1769: ##
1770: ## Notes: A line is a potential signature line if it occurs
1771: ## in $messages_posted messages. Normally, $messages_posted
1772: ## is equal to the total number of messages posted by the
1773: ## author, but may be a lower number if lines which only
1774: ## appear in a significant number (rather than all) of the
1775: ## author's messages are to be considered as potential
1776: ## signature lines.
1777: ##
1778: sub find_potential_signature_lines
1779: {
1780:     my($said,$messages_posted) = @_;
1781:
1782:     my(%line2smidslns,$line,@possible_signature_lines,@potential_signature_lines);
1783:     my(%smid2lns,$smidln,$smid,$ln,@smids);
1784:
1785:     %line2smidslns = %{$said2lines{$said}};
1786:     ## First find all lines with at least $messages_posted appearances.
1787:     foreach $line (keys %line2smidslns) {
1788:     if ( (1 + $#{@{$line2smidslns{$line}}}) >= $messages_posted) {
1789:         push(@possible_signature_lines,$line);
1790:     }
1791:     }
1792:     ## Next, for each of the possible lines, examine the message ids
1793:     ## associated with it to make sure that every message posted by the
1794:     ## author is represented. This subset of possible signature lines
1795:     ## is stored in the variable @potential_signature_lines.
1796:     foreach $line (@possible_signature_lines) {
1797:     undef %smid2lns;
1798:     foreach $smidln (@{$line2smidslns{$line}}) {
1799:         ($smid,$ln) = $smidln =~ /^(\d+) (\d+)$/;
1800:         push(@{$smid2lns{$smid}},$ln);
1801:     }
1802:     @smids = keys %smid2lns;
1803:     if ( $messages_posted <= (1 + $#smids) ) {
1804:         push(@potential_signature_lines,$line);
1805:     }
1806:     }
1807:     return(@potential_signature_lines);
1808: }
1809:
1810:
1811: ## FIND_SIGNATURE_LINES_IN_FIRST_MESSAGE_OF_AUTHOR
1812: ##
1813: ## Input: An integer and a list of strings: an author's simplified
1814: ## id and a list of potential signature lines.
1815: ##
1816: ## Output: A pointer to a hash containing the positions (line numbers)
1817: ## of each of the potential signature lines in the author's
1818: ## first posted message.
1819: ##
1820: sub find_signature_lines_in_first_message_of_author
1821: {
1822:     my($said,@potential_signature_lines) = @_;
1823:
1824:     my(%line2smidslns,$first_smid,%first_smid_lines,$line,$smidln,$smid,$ln);
1825:
1826:     if ( defined($said2lines{$said})
1827:      and defined($said2smids{$said}[0]) ) {
1828:     %line2smidslns = %{$said2lines{$said}};
1829:     $first_smid = $said2smids{$said}[0];
1830:     foreach $line (@potential_signature_lines) {
1831:         if (defined $line2smidslns{$line}) {
1832:         foreach $smidln (@{$line2smidslns{$line}}) {
1833:             ($smid,$ln) = $smidln =~ /^(\d+) (\d+)$/;
1834:             if ( $smid == $first_smid ) {
1835:             $first_smid_lines{$ln} = $line;
1836:             }
1837:         }
1838:         }
1839:     }
1840:     }
1841:     return(\%first_smid_lines);
1842: }
1843:
1844:
1845: ## FIND_LINE_NUMBERS_OF_SIGNATURE
1846: ##
1847: ## Input: An integer: a simplified message id
1848: ##
1849: ## Output: An array of integers: a list of line numbers
1850: ##
1851: ## Notes: The signatures line stored under the id of the
1852: ## author of the message are matched against all
1853: ## of the lines of the message.
1854: ##
1855: sub find_line_numbers_of_signature
1856: {
1857:     my($smid) = @_;
1858:
1859:     my(@signature,@signature_lines,$ln,$sig_index);
1860:
1861:     if (defined $line_numbers_of_signature{$smid}) {
1862:     return(@{$line_numbers_of_signature{$smid}});
1863:     }
1864:     $ln = 0;
1865:     $sig_index = 0;
1866:     if ( (defined $smid2said{$smid})
1867:      and (defined $signature{$smid2said{$smid}}) ) {
1868:     @signature = @{$signature{$smid2said{$smid}}};
1869:     while (defined $smidln2line{"$smid $ln"}) {
1870:         if ( $sig_index > $#signature ) { last; }
1871:         elsif ( defined($smidln2line{"$smid $ln"})
1872:             and $signature[$sig_index]
1873:             and ( $smidln2line{"$smid $ln"} eq $signature[$sig_index] ) ) {
1874:         push(@signature_lines,$ln);
1875:         $sig_index++;
1876:         }
1877:         else {
1878:         undef @signature_lines;
1879:         $sig_index = 0;
1880:         }
1881:         $ln++;
1882:     }
1883:     }
1884:     ## Pad out the signature with a line above and a line below.
1885:     ## This is necessary to cover cases when a line is used
1886:     ## repeatedly but not always by an author.
1887:     if (@signature_lines) {
1888:     unshift(@signature_lines,$signature_lines[0]-1);
1889:     push(@signature_lines,$signature_lines[$#signature_lines]+1);
1890:     }
1891:     @{$line_numbers_of_signature{$smid}} = @signature_lines;
1892:     return(@signature_lines);
1893: }
1894:
1895:
1896: ## ASSERT_SIGNATURE
1897: ##
1898: ## Input: An integer: a simplified author id
1899: ##
1900: ## Effects: The hash %signatures is built: simplified_message_id --> [line of text]*
1901: ## Signature lines are identified through an examination of the
1902: ## hashes of the hash %said2lines. The lines posted by each author are
1903: ## examined to see if any sequence of lines is repeated in every message
1904: ## posted by the author. These repeated lines, included with every message,
1905: ## are identified to be the author's signature. If there are several such
1906: ## sequences, then the longest contiguous sequences of repeated lines is
1907: ## chosen as the author's likely signature. The signature of the author
1908: ## is stored as an array of the repeated lines.
1909: ##
1910: ## Output: None
1911: ##
1912: sub assert_signature
1913: {
1914:     my($said) = @_;
1915:
1916:     my(%line2smidslns,$messages_posted,$line);
1917:     my($smidln,$smid,$ln,@smids,@potential_signature_lines,$first_smid,@lns);
1918:     my(%first_smid_lines,$first_smid_lines_ptr,%sig_starts,$start,$i,$max_score,$score,$winning_start);
1919:     my($lines_in_first_message,$lines_in_signature,@signature,@truncated_signature,$longest);
1920:
1921:     undef $signature{$said};
1922:     %line2smidslns = %{$said2lines{$said}};
1923:     $messages_posted = 1 + $#{@{$said2smids{$said}}};
1924:     if ( $messages_posted == 1 ) { return; }
1925:     undef @potential_signature_lines;
1926:     @potential_signature_lines = &find_potential_signature_lines($said,$messages_posted);
1927:     ## If no potential signature lines are found, then try to find at least
1928:     ## one line that appears in 40% of the author's messages, if the author
1929:     ## has posted at least 10 messages.
1930:     if ( ( not(@potential_signature_lines)
1931:        or not(grep { $_ =~ /\S/ } @potential_signature_lines) )
1932:      and ( $messages_posted >= 10 ) ) {
1933:     $messages_posted = 0.4 * $messages_posted;
1934:     $longest = 0;
1935:     foreach $line (&find_potential_signature_lines($said,$messages_posted)) {
1936:         $line =~ s/^\P{IsWord}+//; ## Delete any whitespace beginning of the line.
1937:         $line =~ s/\P{IsWord}+$//; ## Delete any trailing whitespace.
1938:         if ( length($line) > $longest ) { @{$signature{$said}} = ($line); }
1939:     }
1940:     return;
1941:     }
1942:     $first_smid_lines_ptr = &find_signature_lines_in_first_message_of_author($said,@potential_signature_lines);
1943:     if (not $first_smid_lines_ptr) { return; }
1944:     %first_smid_lines = %{$first_smid_lines_ptr};
1945:     ## Cluster the line numbers of the potential signature lines into continguous groups of lines.
1946:     ## Clusters are indexed in a hash keyed to the line number of the start of the sequence.
1947:     @lns = sort {$a <=> $b} keys %first_smid_lines;
1948:     undef %sig_starts;
1949:     $start = $lns[0];
1950:     for ($i = 1; $i <= $#lns; $i++) {
1951:     if ( $lns[$i] != (1 + $lns[$i-1]) ) { $start = $lns[$i]; }
1952:     push(@{$sig_starts{$start}},$lns[$i]);
1953:     }
1954:     ## For the author's signature, select the longest contiguous sequence of lines that
1955:     ## contains the greatest number of lines with visible (i.e., non-whitespace) characters.
1956:     $max_score = 0;
1957:     $winning_start = -1;
1958:     foreach $start (keys %sig_starts) {
1959:     $score = 0;
1960:     foreach $ln (@{$sig_starts{$start}}) {
1961:         if (defined($first_smid_lines{$ln}) and ($first_smid_lines{$ln} =~ /\S/)) { $score++; }
1962:     }
1963:     if ( $score > $max_score ) {
1964:         $max_score = $score;
1965:         $winning_start = $start;
1966:     }
1967:     }
1968:     $lines_in_signature = 1 + $#{$sig_starts{$winning_start}};
1969:     $lines_in_first_message = 0;
1970:     $first_smid = $said2smids{$said}[0];
1971:     while (defined $smidln2line{"$first_smid $lines_in_first_message"}) {
1972:     $lines_in_first_message++;
1973:     }
1974:     ## If a signature is the same length as a message, then the author is
1975:     ## simply re-posting exactly the same message and does not have a
1976:     ## valid signature.
1977:     if ( $lines_in_signature < $lines_in_first_message ) {
1978:     foreach $ln (@{$sig_starts{$winning_start}}) {
1979:         push(@{$signature{$said}},$first_smid_lines{$ln});
1980:     }
1981: ## Remove any leading blank lines from the signature
1982:     if (defined $signature{$said}) {
1983:         @signature = @{$signature{$said}};
1984:         @truncated_signature = @{$signature{$said}};
1985:         for ($i = 0; $i <= $#signature; $i++) {
1986:         if ( $signature[$i] =~ /\S/ ) {
1987:             @{$signature{$said}} = @truncated_signature;
1988:             last;
1989:         }
1990:         else { shift(@truncated_signature); }
1991:         }
1992:     }
1993: ## Remove any trailing blank lines from the signature
1994:     if (defined $signature{$said}) {
1995:         @signature = @{$signature{$said}};
1996:         @truncated_signature = reverse(@{$signature{$said}});
1997:         for ($i = $#signature; $i >= 0; $i--) {
1998:         if ( $signature[$i] =~ /\S/ ) {
1999:             @{$signature{$said}} = reverse(@truncated_signature);
2000:             last;
2001:         }
2002:         else { shift(@truncated_signature); }
2003:         }
2004:     }
2005:     }
2006:     return;
2007: }
2008:
2009:
2010: ## UPDATE_SCORES_CENTRALITY_BY_PROXIMITY_VERSION
2011: ##
2012: ## Input: none
2013: ##
2014: ## Effects: Update every player's %said2score, and the $min_score,
2015: ## the $max_score, and $current_winner variables.
2016: ##
2017: ## Output: none
2018: ##
2019: ## Notes: Players are scored according to a social network
2020: ## measure based on proximity. The best score is
2021: ## acheived by the player who is cited by or replied
2022: ## to by every other player _and_ who, reciprocates, by
2023: ## replying to or citing every other player.
2024: ## In the social network, two players are linked if
2025: ## they have reciprocally cited and/or replied to one
2026: ## another. Counts of reciprocated quotes and replies
2027: ## are recorded and weight the links between actors in
2028: ## in the social networks. For the purposes of the scoring
2029: ## algorithm, the inverse of the link weight is used.
2030: ## Thus, a player can increase their centrality rating by
2031: ## corresponding frequently with only a few other players,
2032: ## but -- by so doing -- they increase everyone else's score
2033: ## who is connect to them or their correspondent. Also, since
2034: ## one's global score depends upon the size of one's social
2035: ## network correspondence with only one other person does not
2036: ## produce a high score.
2037: ##
2038: ## Ref: "Centralie et pourvoir," Alain Degenne et Michel Forse,
2039: ## _Les Reseaux Sociaux_ (Paris: Armand Colin, 1994).
2040: ##
2041: ## NB: This subroutine has not yet been debugged and may contain
2042: ## errors.
2043: ##
2044: sub update_scores_centrality_by_proximity_version
2045: {
2046:     my($sni,$graph,@graphs,@graph_sizes,@social_network_indices,$i);
2047:     my($s_said,$e_said,$sssp,$said,$total_distance,@es);
2048:
2049:     @social_network_indices = sort { $a <=> $b } keys %sn2saids;
2050:     ## Build a graph for every social network.
2051:     foreach $sni (@social_network_indices) {
2052:     $graph = new Graph::Undirected;
2053:     push(@graph_sizes,scalar(keys %{$sn2saids{$sni}}));
2054:     $graph->add_vertices(keys %{$sn2saids{$sni}});
2055:     foreach $s_said (keys %{$sn2saids{$sni}}) {
2056:         foreach $e_said (keys %{$sn2saids{$sni}}) {
2057:         if ( defined($link{$s_said,$e_said}) ) {
2058:       ## A link's distance is inversely proportional to the
2059:       ## number of exchanges (citations or replies) between
2060:       ## the two players.
2061:             $graph->add_weighted_edge($s_said,(1.0/$link{$s_said,$e_said}),$e_said);
2062:         }
2063:         }
2064:     }
2065:     push(@graphs,$graph);
2066:     }
2067:     ## Score each player in each social network.
2068:     $i = 0;
2069:     foreach $graph (@graphs) {
2070:     foreach $said ($graph->vertices()) {
2071:      ## Use Dijkstra's algorithm to compute the single-source shortest paths
2072:      ## from each player in the social network to every other player in
2073:      ## the network.
2074:         $sssp = $graph->SSSP_Dijkstra($said);
2075:         $total_distance = 0;
2076:         @es = $sssp->edges();
2077:         while (@es) {
2078:         ($s_said,$e_said,@es) = @es;
2079:         $total_distance += $sssp->get_attribute('weight',$s_said,$e_said);
2080:         }
2081:      ## Multiply the total distance my the size of the social network so that
2082:      ## centrality measures from the different social networks can be compared
2083:      ## and so that centrality in a large network counts for more than centrality
2084:      ## in a small network.
2085:         if ( $total_distance > 0 ) {
2086:         $said2score{$said} = ( ( ($graph_sizes[$i] * $graph_sizes[$i]) - $graph_sizes[$i] ) / $total_distance );
2087:         }
2088:         else { $said2score{$said} = 0; }
2089:         if ( $said2score{$said} > $max_score ) {
2090:         $max_score = $said2score{$said};
2091:         $current_winner = $said;
2092:         }
2093:         if ( $said2score{$said} < $min_score ) { $min_score = $said2score{$said}; }
2094:     }
2095:     $i++;
2096:     }
2097: }
2098:
2099:
2100: ## UPDATE_SCORES_CENTRALITY_BY_DEGREE_VERSION
2101: ##
2102: ## Input: none
2103: ##
2104: ## Effects: Update every player's %said2score, and the $min_score,
2105: ## the $max_score, and $current_winner variables.
2106: ##
2107: ## Output: none
2108: ##
2109: ## Notes: This version computes the simplest version of
2110: ## centrality for social networks: centrality is
2111: ## computed as the mnumber of other players each
2112: ## player is connected to. Consequently, with this
2113: ## version, players win points by corresponding
2114: ## with new people in the discussion, but gain no
2115: ## points for continuing to correspond with people
2116: ## they have already been talking with.
2117: ## Actually, this implementation is a bit more
2118: ## nuanced. The first time a pair of players
2119: ## cites and/or replies to one another, they
2120: ## received full credit; the second time they do so,
2121: ## they receive an additional 90% of the initial
2122: ## points; the third time, 80% of the initial points;
2123: ## etc., until the tenth and subsequent exchanges
2124: ## yield them no points. However, since posts to the
2125: ## group will "fade out" after N (e.g., 500) new
2126: ## subsequent posts have been made, a pair of players
2127: ## will need to sustain their exchange in order to
2128: ## keep all of their possible points.
2129: ##
2130: sub update_scores_centrality_by_degree_version
2131: {
2132:     my($sni,@social_network_indices,$s_said,$e_said,$i,$said,@saids_with_max_score);
2133:
2134:     undef %said2score;
2135:     @social_network_indices = sort { $a <=> $b } keys %sn2saids;
2136:     ## Find the degree of every player. Record the degree as the player's
2137:     ## score. Note that the use of the link weight in calculation (as
2138:     ## explained in the comments above.)
2139:     foreach $sni (@social_network_indices) {
2140:     foreach $s_said (keys %{$sn2saids{$sni}}) {
2141:         $said2score{$s_said} = 0;
2142:         foreach $e_said (keys %{$sn2saids{$sni}}) {
2143:         if ( defined($link{$s_said,$e_said}) ) {
2144:             $link_weight = $link{$s_said,$e_said};
2145:             for ( $i = 10; $i > 0; $i--) {
2146:             if ( $link_weight > 0 ) {
2147:                 $said2score{$s_said} += $i;
2148:                 $link_weight--;
2149:             }
2150:             else { last; }
2151:             }
2152:         }
2153:         }
2154:         if ( $said2score{$s_said} > $max_score ) {
2155:         $max_score = $said2score{$s_said};
2156:         $current_winner = $s_said;
2157:         }
2158:         if ( $said2score{$s_said} < $min_score ) { $min_score = $said2score{$s_said}; }
2159:     }
2160:     }
2161:     ## If several players are tied for the winning score, pick the player
2162:     ## who was first on the field (i.e., the one with the lowest integer as
2163:     ## a simplified author id).
2164:     @saids_with_max_score = ();
2165:     foreach $said (keys %said2score) {
2166:     if ( $said2score{$said} == $max_score ) { push(@saids_with_max_score,$said); }
2167:     }
2168:     @saids_with_max_score = sort { $a <=> $b } @saids_with_max_score;
2169:     $current_winner = $saids_with_max_score[0];
2170: }
2171:
2172:
2173: ## UPDATE_SCORES
2174: ##
2175: ## Input: none
2176: ##
2177: ## Effects: Update every player's %said2score, and the $min_score,
2178: ## the $max_score, and $current_winner variables.
2179: ## Also, record every player's previous_score before
2180: ## updating the %said2score hash. Convert all scores to
2181: ## percentages and save in the hash %said2percentage_score.
2182: ##
2183: ## Output: none
2184: ##
2185: ## Notes: This subroutine is a wrapper for the actual
2186: ## score calculating subroutines. The design is meant
2187: ## to facilitate the experimentation with different
2188: ## scoring algorithms. Currently, two possible
2189: ## scoring algorithms based on the measurement of
2190: ## "centrality" in the social networks literature
2191: ## have been implemented.
2192: ##
2193: sub update_scores
2194: {
2195:     my($said,$score_range,$log_score_range);
2196:
2197:     ## Save the previous scores for each author.
2198:     undef %said2previous_score;
2199:     foreach $said (keys %said2smids) {
2200:     if ( defined($said2score{$said}) ) { $said2previous_score{$said} = $said2score{$said}; }
2201:     else { $said2previous_score{$said} = 0; }
2202:     }
2203:     ## Calculate the new scores.
2204:     undef %said2score;
2205:     $max_score = 0;
2206:     $min_score = 1;
2207:     $previous_winner = $current_winner;
2208:     $previous_winning_theme = $current_winning_theme;
2209:     ## If desired, replace this with a call to a different scoring subroutine.
2210:     &update_scores_centrality_by_degree_version();
2211:     ## Save the previous percentage scores for each author.
2212:     undef %said2previous_percentage_score;
2213:     foreach $said (keys %said2score) {
2214:     if (defined($said2percentage_score{$said})) {
2215:         $said2previous_percentage_score{$said} = $said2percentage_score{$said};
2216:     }
2217:     }
2218:     ## Convert each score to a percentage. Scores are plotted in a log scale
2219:     ## to allow for the usual Zipf distribution of citations and replies (i.e.,
2220:     ## only a handful of people are frequently cited and/or replied to and the
2221:     ## majority are rarely cited or replied to; cf., Lodka's Law in the information
2222:     ## science literature.
2223:     undef %said2percentage_score;
2224:     $score_range = 1 + $max_score - $min_score;
2225:     $log_score_range = log($score_range);
2226:     foreach $said (sort {$a <=> $b} keys %said2smids) {
2227:     if ( $log_score_range != 0 ) {
2228: # $said2percentage_score{$said} = int( 100 * ( log(1 + $said2score{$said}) / $log_score_range) );
2229:         $said2percentage_score{$said} = int( 100 * ( $said2score{$said} / $score_range ) );
2230:     }
2231:     else { $said2percentage_score{$said} = 0; }
2232:     }
2233:     return;
2234: }
2235:
2236:
2237: ## ANGLE_DIFFERENCE
2238: ##
2239: sub angle_difference
2240: {
2241:     my($angle1,$angle2) = @_;
2242:
2243:     ## It is assumed that a circle has $radius_of_circle degrees. Find the
2244:     ## least number of degrees between the two angles. This
2245:     ## entails deciding whether to go clockwise or counter-
2246:     ## clockwise.
2247:     if ( ( $angle1 - $angle2 ) > ( ( $angle2 + $radius_of_circle ) - $angle1 ) ) {
2248:     return(($angle2 + $radius_of_circle) - $angle1);
2249:     }
2250:     else { return($angle1 - $angle2); }
2251: }
2252:
2253:
2254: ## ANGLE_SUBTRACT
2255: ##
2256: sub angle_subtract
2257: {
2258:     my($angle1,$angle2) = @_;
2259:
2260:     if ( $angle1 >= $angle2 ) { return($angle1-$angle2); }
2261:     else { return(($angle1-$angle2)+$radius_of_circle); }
2262: }
2263:
2264:
2265: ## ANGLE_ADD
2266: ##
2267: sub angle_add
2268: {
2269:     my($angle1,$angle2) = @_;
2270:
2271:     return(($angle1+$angle2) % $radius_of_circle);
2272: }
2273:
2274:
2275: ## WRITE_HTML_HEADER_FOR_MESSAGE
2276: ##
2277: sub write_html_header_for_message
2278: {
2279:     my($smid) = @_;
2280:
2281:     my($html,$from,$newsgroups,$date,$subject,$mid,$parent,$parent_smid,@replies,$replies);
2282:     my($reply_smid,@cites,$cites,$cite_smid,@cited_bys,$cited_bys,$cited_by_smid);
2283:
2284:     ## HTML HEAD
2285:     $html = '<HTML>'."\n\t".'<HEAD>'."\n\t\t".'<TITLE>message '.$smid.'</TITLE>';
2286:     $html .= "\n\t\t".'<LINK type="text/css" rel="stylesheet" href="message_style.css">'."\n\t".'</HEAD>';
2287:     $html .= "\n\t".'<BODY>'."\n";
2288:     $html .= "\n\t".'<SCRIPT>window.resizeTo((screen.availWidth/2),screen.availHeight);</SCRIPT>'."\n";
2289:     ## Table containing fields of message header
2290:     $html .= "\n\t".'<TABLE>'."\n\t\t".'<TBODY>';
2291:     ## Subject
2292:     $html .= "\n\t\t".'<TR align="left">';
2293:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>SUBJECT: </B></TD>';
2294:     if (defined $smid2subject{$smid}) {
2295:     $subject = $smid2subject{$smid};
2296:     $subject = HTML::Entities::encode_entities($subject);
2297:     $subject =~ s/\s/\&nbsp\;/g;
2298:     }
2299:     else { $subject = ''; }
2300:     $html .= "\n\t\t\t".'<TD class=headertext>'.$subject.'</TD>';
2301:     $html .= "\n\t\t".'</TR>';
2302:     ## From
2303:     $html .= "\n\t\t".'<TR align="left">';
2304:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>SENDER: </B></TD>';
2305:     if ( (defined $smid2said{$smid})
2306:      and (defined $said2ana{$smid2said{$smid}}) ) {
2307:     $from = $said2ana{$smid2said{$smid}};
2308:     $from = HTML::Entities::encode_entities($from);
2309:     $from =~ s/\s/\&nbsp\;/g;
2310:     }
2311:     else { $from = ''; }
2312:     $html .= "\n\t\t\t".'<TD class=headertext>'.$from.'</TD>';
2313:     $html .= "\n\t\t".'</TR>';
2314:     ## Newsgroups
2315:     $html .= "\n\t\t".'<TR align="left">';
2316:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>NEWSGROUPS: </B></TD>';
2317:     if (defined $smid2newsgroups{$smid}) {
2318:     $newsgroups = $smid2newsgroups{$smid};
2319:     $newsgroups = HTML::Entities::encode_entities($newsgroups);
2320:     $newsgroups =~ s/\s/\&nbsp\;/g;
2321:     }
2322:     else { $newsgroups = ''; }
2323:     $html .= "\n\t\t\t".'<TD class=headertext>'.$newsgroups.'</TD>';
2324:     $html .= "\n\t\t".'</TR>';
2325:     ## Date
2326:     $html .= "\n\t\t".'<TR align="left">';
2327:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>DATE: </B></TD>';
2328:     if (defined $smid2date{$smid}) {
2329:     $date = $smid2date{$smid};
2330:     $date = HTML::Entities::encode_entities($date);
2331:     $date =~ s/\s/\&nbsp\;/g;
2332:     }
2333:     else { $date = ''; }
2334:     $html .= "\n\t\t\t".'<TD class=headertext>'.$date.'</TD>';
2335:     $html .= "\n\t\t".'</TR>';
2336:     ## Message-ID
2337:     $html .= "\n\t\t".'<TR align="left">';
2338:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>ID: </B></TD>';
2339:     if (defined $smid2mid{$smid}) {
2340:     $mid = $smid2mid{$smid};
2341:     $mid = HTML::Entities::encode_entities($mid);
2342:     $mid =~ s/\s/\&nbsp\;/g;
2343:     $mid = '['.$smid.'] '.$mid;
2344:     }
2345:     else { $mid = ''; }
2346:     $html .= "\n\t\t\t".'<TD class=headertext>'.$mid.'</TD>';
2347:     $html .= "\n\t\t".'</TR>';
2348:     ## References
2349:     $html .= "\n\t\t".'<TR align="left">';
2350:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>PREVIOUS: </B></TD>';
2351:     if ( (defined $smid2spid{$smid}) and $smid2spid{$smid} ) {
2352:     $parent_smid = $smid2spid{$smid};
2353:     $parent = '['.$parent_smid.']';
2354:     $parent = '<A href="'.$parent_smid.'.html">'.$parent.'</A>';
2355:     }
2356:     else { $parent = ''; }
2357:     $html .= "\n\t\t\t".'<TD class=headertext>'.$parent.'</TD>';
2358:     $html .= "\n\t\t".'</TR>';
2359:     ## Replies
2360:     $html .= "\n\t\t".'<TR align="left">';
2361:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>NEXT: </B></TD>';
2362:     undef @replies;
2363:     if (defined $spid2smids{$smid}) {
2364:     foreach $reply_smid (sort { $a <=> $b } @{$spid2smids{$smid}}) {
2365:         push(@replies,'<A href="'.$reply_smid.'.html">['.$reply_smid.']</A>');
2366:     }
2367:     }
2368:     $replies = join ', ',@replies;
2369:     $html .= "\n\t\t\t".'<TD class=headertext>'.$replies.'</TD>';
2370:     $html .= "\n\t\t".'</TR>';
2371:     ## Cites
2372:     $html .= "\n\t\t".'<TR align="left">';
2373:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>CITES: </B></TD>';
2374:     undef @cites;
2375:     if (defined $smid2sources{$smid}) {
2376:     foreach $cite_smid (sort { $a <=> $b } @{$smid2sources{$smid}}) {
2377:         push(@cites,'<A href="'.$cite_smid.'.html">['.$cite_smid.']</A>');
2378:     }
2379:     }
2380:     $cites = join ', ',@cites;
2381:     $html .= "\n\t\t\t".'<TD class=headertext>'.$cites.'</TD>';
2382:     $html .= "\n\t\t".'</TR>';
2383:
2384:     ## Cited-by
2385:     $html .= "\n\t\t".'<TR align="left">';
2386:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>CITED BY: </B></TD>';
2387:     undef @cited_bys;
2388:     if (defined $source2smids{$smid}) {
2389:     foreach $cited_by_smid (sort { $a <=> $b } @{$source2smids{$smid}}) {
2390:         push(@cited_bys,'<A href="'.$cited_by_smid.'.html">['.$cited_by_smid.']</A>');
2391:     }
2392:     }
2393:     $cited_bys = join ', ',@cited_bys;
2394:     $html .= "\n\t\t\t".'<TD class=headertext>'.$cited_bys.'</TD>';
2395:     $html .= "\n\t\t".'</TR>';
2396:
2397:     ## Indices
2398:     $html .= "\n\t\t".'<TR align="left">';
2399:     $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>MESSAGES BY: </B></TD>';
2400:     $html .= "\n\t\t\t".'<TD class=headertext>';
2401:     $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_id.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">id</A>, ';
2402:     $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_date.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">date</A>, ';
2403:     $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_subject.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">subject</A>, ';
2404:     $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_sender.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">sender</A>, ';
2405:     $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_thread.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">thread</A>';
2406:     $html .= '</TD>'."\n\t\t".'</TR>';
2407:
2408:     ## Reply button for newsgroup discussions
2409:     if ( $nntp_server ) {
2410:     $html .= "\n\t".'<FORM METHOD="POST" ACTION="http://'.$cgi_url.'/Agonistics/write_message.pl" ENCTYPE="multipart/form-data" TARGET="reply_to_message_'.$smid.'">';
2411:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="url" VALUE="'.$documents_url.'">';
2412:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="cgi_url" VALUE="'.$cgi_url.'">';
2413:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="archive" VALUE="'.$archive_name.'">';
2414:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="smid" VALUE="'.$smid.'">';
2415:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="id" VALUE="'.$smid2mid{$smid}.'">';
2416:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="subject" VALUE="'.$smid2subject{$smid}.'">';
2417:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="news_group" VALUE="'.$news_group.'">';
2418:     $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="language_locale" VALUE="'.$language_locale.'">';
2419:     }
2420:     ## Reply button for email-based discussions
2421:     ## This should also contain in-reply-to and subject fields, but Mozilla
2422:     ## seems to have a hard time making sense of these other fields.
2423:     else { $html .= "\n\t".'<FORM ACTION="mailto:'.$mailing_list_address.'">'; }
2424:     $html .= "\n\t\t".'<TR align="left">';
2425:     $html .= "\n\t\t\t".'<TD class=headerfieldtext>';
2426:     my $button_value = '';
2427:     if ( $language_locale =~ /^EN/ ) { $button_value = "REPLY"; }
2428:     elsif ( $language_locale =~ /^FR/ ) { $button_value = "R\x{c9}PONDRE"; }
2429:     elsif ( $language_locale =~ /^DE/ ) { $button_value = "ANTWORTEN"; }
2430:     $html .= "\n\t\t\t\t".'<INPUT name="reply_to_message" class=buttonstyle type="SUBMIT" value="'.$button_value.'">';
2431:     $html .= "\n\t\t\t".'</TD>';
2432:     $html .= "\n\t\t\t".'<TD> </TD>';
2433:     $html .= "\n\t\t".'</TR>';
2434:     $html .= "\n\t".'</FORM>';
2435:
2436:     ## End of table
2437:     $html .= "\n\t\t".'</TBODY>'."\n\t".'</TABLE>'."\n\t".'<BR><BR>';
2438:
2439:     return($html);
2440: }
2441:
2442:
2443: ## WRITE_HTML_FOR_MESSAGE
2444: ##
2445: sub write_html_for_message
2446: {
2447:     my($smid) = @_;
2448:
2449:     my($ln,$header,$body,$next_line,$line_prefix,$end);
2450:     my(@signature_lines,$first);
2451:
2452:     ## First, find which lines are signature lines.
2453:     @signature_lines = sort {$a <=> $b } &find_line_numbers_of_signature($smid);
2454:     ## Correct for the padding done in &find_line_numbers_of_signature;
2455:     pop(@signature_lines);
2456:     shift(@signature_lines);
2457:     ## Next, write the head and body of the message as HTML.
2458:     $ln = 0;
2459:     $header = &write_html_header_for_message($smid);
2460:     $body = '';
2461:     while (defined $smidln2line{"$smid $ln"}) {
2462:     $next_line = $smidln2line{"$smid $ln"};
2463:     $next_line = HTML::Entities::encode_entities($next_line);
2464:     $next_line =~ s/\s/\&nbsp\;/g;
2465: ## Every line of every message gets an anchor name so that
2466: ## it can be linked to, if necessary, from another message.
2467:     $line_prefix = '<A name="'.$ln.'"';
2468: ## If a line is a signature line, then it should be highlighted,
2469: ## but not hyperlinked.
2470:     if ( ($#signature_lines >= 0) and ( $ln == $signature_lines[0] ) ) {
2471:         $line_prefix .= ' class=signaturetext';
2472:         ($first,@signature_lines) = @signature_lines;
2473:     }
2474: ## Else if a line is not its own source, then it is a quote from elsewhere
2475: ## and will be hyperlinked.
2476:     elsif ( defined($smidln2source{"$smid $ln"})
2477:         and ( $smidln2source{"$smid $ln"} ne "$smid $ln" ) ) {
2478:         ($source_smid,$source_ln) = $smidln2source{"$smid $ln"} =~ /^(\d+) (\d+)$/;
2479:         if ( $source_smid or $source_ln ) {
2480:         $line_prefix .= ' class=linktext';
2481:         $line_prefix .= ' href="'.$source_smid.'.html#'.$source_ln.'"';
2482:         }
2483:     }
2484: ## Otherwise, the line is a line of text original to the message.
2485:     else { $line_prefix .= ' class=messagetext'; }
2486:     $line_prefix .= '>';
2487:     $next_line = "\n\t".$line_prefix.$next_line.'</A><BR>';
2488:     $body .= $next_line;
2489:     $ln++;
2490:     }
2491:     $end = "\n\t".'</BODY>'."\n".'</HTML>';
2492:     open(OUT,'>'.$message_directory.$slash.$smid.'.html');
2493:     print OUT $header.$body.$end;
2494:     close(OUT);
2495:     return;
2496: }
2497:
2498:
2499: ## FIND_ADJACENT_SOCIAL_NETWORK_FOR_INTERFACE
2500: ##
2501: ## Input: A list of simplified message ids (integers) that represent
2502: ## either the messages quoted or replied to by the current
2503: ## message.
2504: ##
2505: ## Output: An integer: a social network index. If the message posted
2506: ## by the member of the new social network, is a message that
2507: ## replies to or cites the message of an author already in
2508: ## another social network, return the index of the other
2509: ## social network; otherwise, return 0.
2510: ##
2511: ## Notes: The purpose of this function is to try to find a likely
2512: ## place to position the player in a new social network in
2513: ## the interface. The player in the new social network is
2514: ## assumed to be the network's sole member and is the
2515: ## author of the message just posted to the newsgroup.
2516: ##
2517: sub find_adjacent_social_network_for_interface
2518: {
2519:     my(@source_smids) = @_;
2520:
2521:     my($source_smid,$sn);
2522:
2523:     ## The following is necessary because some authors don't
2524:     ## belong to social networks because they are authors
2525:     ## of messages that are referenced, but not included
2526:     ## in the archive being analyzed.
2527:     $sn = -1;
2528:     foreach $source_smid (@source_smids) {
2529:     if ( defined($smid2said{$source_smid})
2530:          and defined($said2sn{$smid2said{$source_smid}}) ) {
2531:         $sn = $said2sn{$smid2said{$source_smid}};
2532:         last;
2533:     }
2534:     }
2535:     return($sn);
2536: }
2537:
2538:
2539: ## EVENLY_MIX
2540: ##
2541: sub evenly_mix
2542: {
2543:     my($ptr_to_part1,$ptr_to_part2,$ptr_to_whole) = @_;
2544:
2545:     my(@part1,@part2,@whole,@new_whole,$w,$p1,$p2,$i,$j,$part1_after_part2);
2546:
2547:     @part1 = @{$ptr_to_part1};
2548:     @part2 = @{$ptr_to_part2};
2549:     if ( scalar(@part2) > scalar(@part1) ) {
2550:     @part1 = @{$ptr_to_part2}; ## assumed to be more numerous
2551:     @part2 = @{$ptr_to_part1};
2552:     }
2553:     @whole = @{$ptr_to_whole};
2554:     if ( scalar(@part2) <= 1 ) { return(@whole); }
2555:     ## Rotate the members of part1 until the one that used to
2556:     ## follow the first elements of part2 is at the front
2557:     ## of @part1.
2558:     ## First, rotate @whole until the first element of @part2
2559:     ## is at the front.
2560:     while ( $whole[0] != $part2[0] ) {
2561:     $w = shift @whole;
2562:     push(@whole,$w);
2563:     }
2564:     $part1_after_part2 = 'undefined';
2565:     ## Second, find the element of @part1 that comes
2566:     ## after the first element opf @part2
2567:     for ($i = 1; $i <= $#whole; $i++) {
2568:     if ( grep { $_ == $whole[$i] } @part1 ) {
2569:         $part1_after_part2 = $whole[$i];
2570:         last;
2571:     }
2572:     }
2573:     ## Third, rotate @part1 around until the
2574:     ## element following the first of @part2
2575:     ## is at its head.
2576:     if ( $part1_after_part2 ne 'undefined' ) {
2577:     while ( $part1[0] != $part1_after_part2 ) {
2578:         $p1 = shift @part1;
2579:         push(@part1,$p1);
2580:     }
2581:     }
2582:     ## Now, interlace @part1 and @part2
2583:     $increment = int(scalar(@part1) / scalar(@part2));
2584:     @new_whole = ();
2585:     foreach $p2 (@part2) {
2586:     push(@new_whole,$p2);
2587:     for ( $j = 0; $j < $increment; $j++ ) {
2588:         if ( @part1 ) {
2589:         push(@new_whole,$part1[0]);
2590:         shift @part1;
2591:         }
2592:     }
2593:     }
2594:     if ( @part1 ) { push(@new_whole,@part1); }
2595:     return(@new_whole);
2596: }
2597:
2598:
2599: ## REDISTRIBUTE_LARGE_SOCIAL_NETWORKS
2600: ##
2601: sub redistribute_large_social_networks
2602: {
2603:     my(@social_network_indices) = @_;
2604:
2605:     my($large,@large,@small,@invisible,@small_and_invisible,$is_visible);
2606:     my($sni,$said);
2607:
2608:     $large = 2; ## any network with two or more players is considered "large."
2609:     @large = ();
2610:     @small = ();
2611:     @invisible = ();
2612:     foreach $sni (@social_network_indices) {
2613:     $is_visible = 0;
2614:     foreach $said (keys %{$sn2saids{$sni}}) {
2615:         if ( &player_is_visible($said) ) {
2616:         $is_visible = 1;
2617:         last;
2618:         }
2619:     }
2620:     if ( $is_visible ) {
2621:         if ( scalar(keys %{$sn2saids{$sni}}) >= $large ) { push(@large,$sni); }
2622:         else { push(@small,$sni); }
2623:     }
2624:     else { push(@invisible,$sni); }
2625:     }
2626:     ## Mix the invisibles into the smalls
2627:     @small_and_invisible = &evenly_mix(\@small,\@invisible,\@social_network_indices);
2628:     ## Now mix the larges into the small and invisible
2629:     return(&evenly_mix(\@small_and_invisible,\@large,\@social_network_indices));
2630: }
2631:
2632:
2633: ## INSERT_NEW_SOCIAL_NETWORKS_FOR_INTERFACE
2634: ##
2635: ## Input: None
2636: ##
2637: ## Output: A list of sorted social network indices with any new social
2638: ## networks inserted into the list.
2639: ##
2640: ## Notes: Insert any new social networks into the ordered list of networks.
2641: ## There should only be one, at the most, in principle, such new
2642: ## network after one new message posted to the newsgroup.
2643: ##
2644: sub insert_new_social_networks_for_interface
2645: {
2646:     my(@social_network_indices,$sni,$esn,$i);
2647:
2648:     ## Produce a list of continuing social networks that does not include
2649:     ## any of the disappearing (i.e., old) social networks. The old
2650:     ## social networks have been taken out of the %said2sn
2651:     ## and sn2saids hashes, but are still keys in the
2652:     ## %sn2theta hash. Take them out of this hash too.
2653:     @social_network_indices = ();
2654:     foreach $sni (keys %sn2saids) {
2655:     if ( grep { $_ == $sni } (keys %old_social_network) ) {
2656:         delete($sn2theta{$sni});
2657:         next;
2658:     }
2659:     if ( grep { $_ == $sni } (keys %new_social_network) ) { next; }
2660:     else { push(@social_network_indices,$sni); }
2661:     }
2662:     ## Sort the existing social networks into counter-clockwise order.
2663:     @social_network_indices = sort { $sn2theta{$a} <=> $sn2theta{$b} } @social_network_indices;
2664:     ## Re-sort the indices so that the large social networks are equally distributed.
2665:     @social_network_indices = &redistribute_large_social_networks(@social_network_indices);
2666:     ## If there is currently only one social network, return it.
2667:     if ( not(@social_network_indices) ) { return(keys %new_social_network); }
2668:     ## Otherwise, insert any new social network indices into this ordered list using
2669:     ## &find_adjacent_social_network_for_interface to find appropriate insertion points.
2670:     foreach $new_social_network (keys %new_social_network) {
2671:     @source_smids = ();
2672:     foreach $said (keys %{$sn2saids{$new_social_network}}) {
2673:         foreach $smid (@{$said2smids{$said}}) {
2674:         if (defined($smid2sources{$smid}) ) {
2675:             push(@source_smids,@{$smid2sources{$smid}});
2676:         }
2677:         if (defined($smid2spid{$smid})) { push(@source_smids,$smid2spid{$smid}); }
2678:         }
2679:     }
2680:     $esn = &find_adjacent_social_network_for_interface(@source_smids);
2681: ## No insertion point found because current message is neither a
2682: ## reply to nor does it include a citation from another message.
2683:     if ( $esn == -1 ) { push(@social_network_indices,$new_social_network); }
2684: ## Otherwise, rotate to position the $esn at the start of the ordered list
2685: ## and insert the new social network next to it by pushing onto the
2686: ## end of the list (remember the order is circular).
2687:     else {
2688:         $i = 0;
2689:         while ( $social_network_indices[0] != $esn ) {
2690:         if ( $i > $#social_network_indices ) {
2691:             print STDOUT "\nError: Problem with the social network indices!\n";
2692:             last;
2693:         }
2694:         $sni = shift @social_network_indices;
2695:         push(@social_network_indices,$sni);
2696:         $i++;
2697:         }
2698:         push(@social_network_indices,$new_social_network);
2699:     }
2700:     }
2701:     return(@social_network_indices);
2702: }
2703:
2704: ## WRITE_HTML_FOR_LEADERS_LIST
2705: ##
2706: sub write_html_for_leaders_list
2707: {
2708:     my($current_smid) = @_;
2709:
2710:     my($html,$said,@saids,$messagesCGIScript,$addr);
2711:
2712:     $messagesCGIScript = 'http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'&id=';
2713:     $html .= "\n\t".'<TABLE align="right" style="width: 20%; text-align: right;">'."\n\t\t".'<TBODY>';
2714:     $html .= "\n\t\t".'<TR align="right">';
2715:     if ( $language_locale =~ /^EN/ ) {
2716:     $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>c u r r e n t   l e a d e r s</B></TD>';
2717:     }
2718:     elsif ( $language_locale =~ /^FR/ ) {
2719:     $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>c l a s s e m e n t</B></TD>';
2720:     }
2721:     elsif ( $language_locale =~ /^DE/ ) {
2722:     $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>L i s t e   d e r   S p i e l e r</B></TD>';
2723:     }
2724:     $html .= "\n\t\t".'</TR>';
2725:     @saids = sort { if ( $said2score{$a} == $said2score{$b} ) { lc($said2addr{$a}) cmp lc($said2addr{$b}); } else { $said2score{$b} <=> $said2score{$a} } } keys %said2score;
2726: # if ( $#saids >= 9 ) { @saids = @saids[0..9]; }
2727:     foreach $said (@saids) {
2728:     if ( defined($said2addr{$said}) and defined($said2score{$said}) ) {
2729:         $addr = HTML::Entities::encode_entities($said2addr{$said});
2730:         $html .= "\n\t\t".'<TR align="right">';
2731:         $html .= "\n\t\t\t".'<TD align="right" id="'.$said.'" class=scorelisttext onMouseOver="turnLabelOnFromLeadersList(this,event);"><B><A href="'.$messagesCGIScript.$said.'" target="messages_posted_by_'.$said2addr{$said}.'">'.$addr.'</A> '.$said2score{$said}.'</B></TD>';
2732:         $html .= "\n\t\t".'</TR>';
2733:     }
2734:     }
2735:     $html .= "\n\t\t".'</TBODY>';
2736:     $html .= "\n\t".'</TABLE>';
2737:     return($html);
2738: }
2739:
2740:
2741: ## WRITE_HTML_FOR_WINNING_SENTENCES
2742: ##
2743: sub write_html_for_winning_sentences
2744: {
2745:     my($current_smid) = @_;
2746:
2747:     my($html,@stems,$stem,$theme,$ssid,@winning_ssids,$i,$j,$ws,$said,@saids);
2748:     my($messagesCGIScript,$messagesURL,$addr,$sentence_text);
2749:
2750:     $messagesCGIScript = 'http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'&id=';
2751:     $messagesURL = 'http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/Messages/';
2752:     $html .= "\n\t".'<TABLE align="left" style="width: 20%; text-align: left;" >'."\n\t\t".'<TBODY>';
2753:     $html .= "\n\t\t".'<TR align="left">';
2754:     if ( $language_locale =~ /^FR/ ) {
2755:     $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>t h è m e s   f r é q u e n t s</B></TD>';
2756:     }
2757:     elsif ( $language_locale =~ /^EN/ ) {
2758:     $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>s t a t e m e n t s   a b o u t<BR>f r e q u e n t   t h e m e s</B></TD>';
2759:     }
2760:     elsif ( $language_locale =~ /^DE/ ) {
2761:     $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>T h e m e n   d e r<BR>D i s k u s s i o n</B></TD>';
2762:     }
2763:     $html .= "\n\t\t".'</TR>';
2764:     @stems = sort { $#{@{$winning_sentences{$b}}} <=> $#{@{$winning_sentences{$a}}} } keys %winning_sentences;
2765:     @saids = sort { $said2score{$b} <=> $said2score{$a} } keys %said2score;
2766:     $i = 0;
2767:     foreach $stem (@stems) {
2768:     if ( $i > 2 ) { last; }
2769:     @winning_ssids = ();
2770:     $j = 0;
2771:     foreach $said (@saids) {
2772:         @{$winning_sentences{$stem}} = sort { $smid2sdt{$ssid2smid{$b}} <=> $smid2sdt{$ssid2smid{$a}} } @{$winning_sentences{$stem}};
2773:         foreach $ws (@{$winning_sentences{$stem}}) {
2774:         if ($smid2said{$ssid2smid{$ws}} == $said) {
2775:             push(@winning_ssids,$ws);
2776:             $j++;
2777:         }
2778:         if ( $j >= 10 ) { last; }
2779:         }
2780:         if ( $j >= 10 ) { last; }
2781:     }
2782:     if ( not(@winning_ssids) ) { next; }
2783:     $i++;
2784:     if ( defined($stem2tokens{$stem}) ) { $theme = ${@{$stem2tokens{$stem}}}[0]; }
2785:     else { $theme = $stem; }
2786:     $theme = HTML::Entities::encode_entities($theme);
2787:     $html .= "\n\t\t".'<TR align="left">';
2788:     $html .= "\n\t\t\t".'<TD valign="top" align="left" class=pastthemetext><B>'.$theme.'</B></TD>';
2789:     $html .= "\n\t\t".'</TR>';
2790:     foreach $ssid (@winning_ssids) {
2791:         $html .= "\n\t\t".'<TR align="left">';
2792:         if ( defined($ssid2smid{$ssid})
2793:          and defined($smid2said{$ssid2smid{$ssid}})
2794:          and defined($said2addr{$smid2said{$ssid2smid{$ssid}}})
2795:          and defined($ssid2sentence{$ssid}) ) {
2796:         $addr = HTML::Entities::encode_entities($said2addr{$smid2said{$ssid2smid{$ssid}}});
2797:         $sentence_text = HTML::Entities::encode_entities($ssid2sentence{$ssid});
2798:         $html .= "\n\t\t\t".'<TD align="left"><B class=pastspeakertext><A href="'.$messagesCGIScript.$smid2said{$ssid2smid{$ssid}}.'" target="'.$said2addr{$smid2said{$ssid2smid{$ssid}}}.'">'.$addr.'</A>:</B> <B class=pastsentencetext><A href="'.$messagesURL.$ssid2smid{$ssid}.'.html#'.$ssid2start{$ssid}.'" target="message '.$ssid2smid{$ssid}.'">'.$sentence_text.'</A></B></TD>';
2799:         }
2800:         else { $html .= "\n\t\t\t".'<TD> </TD>'; }
2801:         $html .= "\n\t\t".'</TR>';
2802:     }
2803:     }
2804:     $html .= "\n\t\t".'</TBODY>';
2805:     $html .= "\n\t".'</TABLE>';
2806:     return($html);
2807: }
2808:
2809:
2810: ## ASSIGN_THETAS_FOR_INTERFACE
2811: ##
2812: sub assign_thetas_for_interface
2813: {
2814:     my (@social_network_indices) = @_;
2815:
2816:     my($i,$sni,$last_angle,$index_of_largest_network_with_theta);
2817:     my($size_of_largest_network_with_theta);
2818:
2819:     ## If there is only one social network currently, then assign
2820:     ## position it at 0.
2821:     if ( $#social_network_indices == 0 ) { $sn2theta{$social_network_indices[0]} = 0; }
2822:     ## Otherwise, first rotate the @social_network_indices until a social network
2823:     ## with an existing theta angle is found. Find the largest social network with
2824:     ## an assigned theta to be the anchor point.
2825:     else {
2826:     $index_of_largest_network_with_theta = -1;
2827:     $size_of_largest_network_with_theta = -1;
2828:     for ($i = 0; $i <= $#social_network_indices; $i++) {
2829:         $sni = $social_network_indices[$i];
2830:         if ( defined($social_network_indices[$i])
2831:          and defined($sn2theta{$sni})
2832:          and defined($sn2saids{$sni})
2833:          and (keys %{$sn2saids{$sni}})
2834:          and ( scalar(keys %{$sn2saids{$sni}}) > $size_of_largest_network_with_theta ) ) {
2835:         $size_of_largest_network_with_theta = scalar(keys %{$sn2saids{$sni}});
2836:         $index_of_largest_network_with_theta = $sni;
2837:         }
2838:     }
2839:     if ( $index_of_largest_network_with_theta != -1 ) {
2840:         $sni = -1;
2841:         while ( $sni != $index_of_largest_network_with_theta ) {
2842:         $sni = shift @social_network_indices;
2843:         push(@social_network_indices,$sni);
2844:         }
2845:         $last_angle = $sn2theta{$index_of_largest_network_with_theta};
2846:     }
2847:     else { $last_angle = 0; }
2848:     $angle_increment = ( $radius_of_circle / scalar(@social_network_indices) );
2849:     undef %sn2theta;
2850:     foreach $sni (@social_network_indices) {
2851:         $sn2theta{$sni} = $last_angle;
2852:         $last_angle = &angle_add($last_angle,$angle_increment);
2853:     }
2854:     }
2855:     return;
2856: }
2857:
2858:
2859: ## DISTRIBUTE_PLAYERS_FOR_INTERFACE
2860: ##
2861: sub distribute_players_for_interface
2862: {
2863:     my(@social_network_indices) = @_;
2864:
2865:     foreach $sni (@social_network_indices) {
2866: ## Update the theta and rho positions for each player.
2867:     foreach $said (keys %{$sn2saids{$sni}}) {
2868:         if ( not(defined($said2theta{$said})) ) { $said2previous_theta{$said} = $sn2theta{$said}; }
2869:         else { $said2previous_theta{$said} = $said2theta{$said}; }
2870:         $said2theta{$said} = $sn2theta{$sni};
2871:         $said2theta_movement{$said} = &angle_subtract($said2theta{$said},$said2previous_theta{$said});
2872:         if ( not(defined($said2rho{$said})) ) { $said2previous_rho{$said} = $said2percentage_score{$said}; }
2873:         else { $said2previous_rho{$said} = $said2rho{$said}; }
2874:         $said2rho{$said} = $said2percentage_score{$said};
2875:         if ( defined($said2rho{$said}) and defined($said2previous_rho{$said}) ) {
2876:         $said2rho_movement{$said} = $said2rho{$said} - $said2previous_rho{$said};
2877:         }
2878:         else { $said2rho_movement{$said} = 0; }
2879:     }
2880:     }
2881:     return;
2882: }
2883:
2884:
2885: ## SELECT_FACES_AND_PROFILES_FOR_INTERFACE
2886: ##
2887: sub select_faces_and_profiles_for_interface
2888: {
2889:     my($current_smid) = @_;
2890:
2891:     my($pi,$speaker_said,%x_pos,$said);
2892:
2893:     $speaker_said = $smid2said{$current_smid};
2894:     ## Determine the horizontal position of the author of the current_smid
2895:     $pi = atan2(1,1) * 4;
2896:     $x_pos{$speaker_said} = ( ( $radius_of_circle - $said2rho{$speaker_said} )
2897:                   * cos(($said2theta{$speaker_said}/$radius_of_circle) * 2 * $pi) );
2898:     ## To pick an image and a profile for players, determine whether they are to the left or the
2899:     ## right of the speaker.
2900:     foreach $said (sort {$a <=> $b} keys %said2smids) {
2901:     if ( defined($said2rho{$said}) and defined($said2theta{$said}) ) {
2902:         $x_pos{$said} = ( ( $radius_of_circle - $said2rho{$said} )
2903:                   * cos(($said2theta{$said}/$radius_of_circle) * 2 * $pi) );
2904:     }
2905:     else { $x_pos{$said} = $x_pos{$speaker_said}; }
2906:     if ( $x_pos{$said} == $x_pos{$speaker_said} ) { $said2profile{$said} = ''; }
2907:     elsif ( $x_pos{$said} > $x_pos{$speaker_said} ) { $said2profile{$said} = 'l'; }
2908:     else { $said2profile{$said} = 'r'; }
2909: ## Assign each new image id randomly.
2910:     if (not defined($said2image_id{$said})) {
2911:         $said2image_id{$said} = int(rand($number_of_faces));
2912:         if ( $said2image_id{$said} == 0 ) { $said2image_id{$said} = 1; }
2913:     }
2914:     }
2915: }
2916:
2917:
2918: ## PLAYER_IS_VISIBLE
2919: ##
2920: sub player_is_visible
2921: {
2922:     my($said) = @_;
2923:
2924:     my(@smids);
2925:
2926:     @smids = @{$said2smids{$said}};
2927:     if ( $smids[$#smids] >= ( $current_simplified_message_id - $recency ) ) { return(1); }
2928:     ## Even if the current winner has not posted within $recency messages, keep the winner visible.
2929:     if ( $current_winner == $said ) { return(1); }
2930:     else { return(0); }
2931: }
2932:
2933:
2934: ## WRITE_HTML_FOR_INTERFACE
2935: ##
2936: sub write_html_for_interface
2937: {
2938:     my($current_smid,@source_smids) = @_;
2939:
2940:     my(@social_network_indices,$html_front_divs,$html_profile_divs,$html_label_divs,$html);
2941:     my(@player_defs,$player_def,$said,$speaker_said,@js_code,$js_code,$sni);
2942:     my($html_theme_divs,$theme_defs,$i,$subject_line_div,$shared_stem,@shared_stems);
2943:     my($sentence_div,$sentence_text,$subject_line_text,$start_date,$end_date,$newsgroup_html);
2944:     my($newsgroup_div,$newsgroup_def,$start_date_html,$start_date_div,$start_date_def);
2945:     my($end_date_html,$end_date_div,$end_date_def,$post_button_label);
2946:
2947:     @social_network_indices = &insert_new_social_networks_for_interface();
2948:     undef %new_social_network;
2949:     undef %old_social_network;
2950:     &assign_thetas_for_interface(@social_network_indices);
2951:     &distribute_players_for_interface(@social_network_indices);
2952:     &select_faces_and_profiles_for_interface($current_smid,@social_network_indices);
2953:
2954:     ## Now write the HTML
2955:     $html = '<HTML>
2956: <HEAD>
2957: <META http-equiv="Refresh" content="'.$pause_between_frames.'; URL=agon'.($frame_number+1).'.html">
2958: <TITLE>agonistics</TITLE>
2959: <LINK type="text/css" rel="stylesheet" href="interface_style.css">
2960: </HEAD>
2961: <BODY>';
2962:     $html .= &write_html_for_leaders_list($current_smid);
2963:     $html .= &write_html_for_winning_sentences($current_smid);
2964:     $html_front_divs = '';
2965:     $html_profile_divs = '';
2966:     $html_label_divs = '';
2967:     $html_theme_divs = '';
2968:     @player_defs = ();
2969:     $speaker_said = $smid2said{$current_smid};
2970:     $speaker_sni = $said2sn{$speaker_said};
2971:     foreach $said (sort {$said2theta{$a} <=> $said2theta{$b}} keys %said2smids) {
2972:     if ( not(&player_is_visible($said)) ) { next; }
2973:     if ( $said == $speaker_said ) { $said_is_speaker{$said} = 1; }
2974:     else { $said_is_speaker{$said} = 0; }
2975:     $html_front_divs .= '<DIV id="'.$said.'_front" style="position:absolute; visibility:hidden" onMouseDown="beginDrag(this, event);" onMouseOver="turnLabelOnFromProfile(this, event);"></DIV>'."\n";
2976:     $html_profile_divs .= '<DIV id="'.$said.'_profile" style="position:absolute; visibility:hidden"></DIV>'."\n";
2977:     $html_label_divs .= '<DIV id="'.$said.'_label" style="position:absolute; visibility:hidden"></DIV>'."\n";
2978:     if ( defined($said2addr{$said}) ) {
2979:         $player_def = '{ addr:"'.$said2addr{$said}.'"';
2980:         $player_def .= ', initialRho:'.$said2previous_rho{$said}.', finalRho:'.$said2rho{$said};
2981:         $player_def .= ', rhoMovement:'.$said2rho_movement{$said};
2982:         $player_def .= ', initialTheta:'.$said2previous_theta{$said}.', finalTheta:'.$said2theta{$said};
2983:         $player_def .= ', thetaMovement:'.$said2theta_movement{$said};
2984:         $player_def .= ', profile:"'.$said2profile{$said}.'", imageID:'.$said2image_id{$said};
2985:         $player_def .= ', said:'.$said.', isSpeaker:'.$said_is_speaker{$said};
2986:         $player_def .= ', score:'.$said2score{$said}.', previousScore:'.$said2previous_score{$said};
2987:         if ( $said == $current_winner ) { $player_def .= ', isWinner: 1 }'; }
2988:         else { $player_def .= ', isWinner: 0 }'; }
2989:         push(@player_defs,$player_def);
2990:     }
2991:     }
2992:     $subject_line_div = '<DIV id="subject_line_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
2993:     $sentence_div = '<DIV id="sentence_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
2994:     $newsgroup_html = '<B class=labeltext>n e w s g r o u p   </B>';
2995:     $newsgroup_html .= '<B class=scorelisttext>'.$news_group.'   </B>';
2996:     $newsgroup_div = '<DIV id="newsgroup_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
2997:     $newsgroup_def = 'var newsgroupHTML = "'.$newsgroup_html.'";'."\n";
2998:     $end_date_html = '<B class=labeltext>t  i m e   </B>';
2999:     $end_date_html .= '<B class=scorelisttext>'.&code_the_date($smid2date{$current_smid}).'</B>';
3000:     $end_date_div = '<DIV id="end_date_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3001:     $end_date_def = 'var endDateHTML = "'.$end_date_html.'";'."\n";
3002:     ## add forward and rewind buttons
3003:     $rewind_button_html = '<A href="http://'.$cgi_url.'/Agonistics/rewind_to_first_frame.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'"><IMG src="rewind.gif"></A>';
3004:     $rewind_button_div = '<DIV id="rewind_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3005:     $rewind_button_def = "var rewindButtonHTML = '".$rewind_button_html."';\n";
3006:     $forward_button_html = '<A href="http://'.$cgi_url.'/Agonistics/forward_to_latest_frame.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'"><IMG src="forward.gif"></A>';
3007:     $forward_button_div = '<DIV id="forward_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3008:     $forward_button_def = "var forwardButtonHTML = '".$forward_button_html."';\n";
3009:     ## add help and post buttons
3010:     if ( $language_locale =~ /^EN/ ) {
3011:     $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-en.html" TARGET="agonistics / help">';
3012:     $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="help "></FORM>';
3013:     $post_button_label = "write";
3014:     }
3015:     elsif ( $language_locale =~ /^FR/ ) {
3016:     $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-fr.html" TARGET="agonistics / aide">';
3017:     $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="aide "></FORM>';
3018:     $post_button_label = "écrire";
3019:     }
3020:     elsif ( $language_locale =~ /^DE/ ) {
3021:     $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-de.html" TARGET="agonistics / hilfe">';
3022:     $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="hilfe    "></FORM>';
3023:     $post_button_label = "schreiben";
3024:     }
3025:     $help_button_div = '<DIV id="help_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3026:     $help_button_def = "var helpButtonHTML = '".$help_button_html."';\n";
3027:     ## Post button for newsgroup discussions
3028:     if ( $nntp_server ) {
3029:     $post_button_html = '<FORM METHOD="POST" ACTION="http://'.$cgi_url.'/Agonistics/write_message.pl" ENCTYPE="multipart/form-data" TARGET="write_a_message">';
3030:     $post_button_html .= '<INPUT TYPE="hidden" NAME="url" VALUE="'.$documents_url.'">';
3031:     $post_button_html .= '<INPUT TYPE="hidden" NAME="cgi_url" VALUE="'.$cgi_url.'">';
3032:     $post_button_html .= '<INPUT TYPE="hidden" NAME="archive" VALUE="'.$archive_name.'">';
3033:     $post_button_html .= '<INPUT TYPE="hidden" NAME="news_group" VALUE="'.$news_group.'">';
3034:     $post_button_html .= '<INPUT TYPE="hidden" NAME="language_locale" VALUE="'.$language_locale.'">';
3035:     }
3036:     ## Post button for email-based discussions
3037:     else { $post_button_html = '<FORM ACTION="mailto:'.$mailing_list_address.'">'; }
3038:     $post_button_html .= '<INPUT name="post_message" class=buttonstyle type="SUBMIT" value="'.$post_button_label.'">';
3039:     $post_button_div = '<DIV id="post_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3040:     $post_button_def = "var postButtonHTML = '".$post_button_html."';\n";
3041:     $def_button_html = '';
3042:     $def_button_div = '<DIV id="def_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n";
3043:     $def_button_def = "var defButtonHTML = '".$def_button_html."';\n";
3044:     ## Player definitions
3045:     $player_defs = join ','."\n",@player_defs;
3046:     $player_defs = 'var players = [ '.$player_defs.'];';
3047:     $current_winning_theme = HTML::Entities::encode_entities($current_winning_theme);
3048:     $theme_defs = '{ network:'.$speaker_sni.', text:"'.$current_winning_theme.'", theta:'.$sn2theta{$speaker_sni}.' }';
3049:     $theme_defs = 'var networkThemes = [ '.$theme_defs.' ];';
3050:     $html_theme_divs .= '<DIV id="theme_'.$speaker_sni.'" style="position:absolute; visibility:hidden"></DIV>'."\n";
3051:     $html .= $html_front_divs."\n\n".$html_profile_divs."\n\n".$html_label_divs."\n\n".$subject_line_div."\n\n".$sentence_div."\n\n".$html_theme_divs."\n\n".$newsgroup_div."\n\n".$end_date_div."\n\n".$rewind_button_div."\n\n".$forward_button_div."\n\n".$help_button_div."\n\n".$post_button_div."\n\n".$def_button_div."\n\n".'<SCRIPT>'."\n".$player_defs."\n\n".$theme_defs."\n\n".$newsgroup_def."\n\n".$end_date_def."\n\n".$rewind_button_def."\n\n".$forward_button_def."\n\n".$help_button_def."\n\n".$post_button_def."\n\n".$def_button_def."\n\n";
3052:     $html .= 'var dir = "Messages";'."\n";
3053:     $html .= 'var messagesCGIScript = "http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'";'."\n";
3054:     $subject_line_text = HTML::Entities::encode_entities($smid2nsl{$current_smid});
3055:     $html .= 'var subjectLineText = "'.$subject_line_text.'";'."\n";
3056:     if ( $current_winning_sentence ) {
3057:     $current_winning_sentence =~ s/"//g;
3058:     $current_winning_sentence = HTML::Entities::encode_entities($current_winning_sentence);
3059:     $html .= 'var sentenceText = "'.$current_winning_sentence.'";'."\n";
3060:     }
3061:     else { $html .= 'var sentenceText = "";'."\n"; }
3062:     if ( defined($said2addr{$current_winner}) ) { $html .= 'var currentWinner = "'.$said2addr{$current_winner}.'";'."\n"; }
3063:     else { $html .= 'var currentWinner = "";'."\n"; }
3064:     if ( defined($said2addr{$speaker_said}) ) { $html .= 'var currentSpeaker = "'.$said2addr{$speaker_said}.'";'."\n"; }
3065:     else { $html .= 'var currentSpeaker = "";'."\n"; }
3066:     open(JS,'Resources'.$slash.'javascript_interface_code.txt');
3067:     @js_code = <JS>;
3068:     close(JS);
3069:     $js_code = join '',@js_code;
3070:     $html .= $js_code;
3071:     open(AGON,'>'.$interface_directory.$slash.'agon'.$frame_number.'.html');
3072:     print AGON $html;
3073:     close(AGON);
3074: }
3075:
3076:
3077: ## CODE_THE_DATE
3078: ##
3079: sub code_the_date
3080: {
3081:     my($date) = @_;
3082:
3083:     return(HTML::Entities::encode_entities(lc(UnixDate($date,"%a %b %e %H:%M:%S %Y"))));
3084: }
3085:
3086:
3087: ## WRITE_MESSAGE_LIST_ID_ENTRY
3088: ##
3089: sub write_message_list_id_entry
3090: {
3091:     my($smid,$is_hyperlinked) = @_;
3092:
3093:     if ( $is_hyperlinked ) { $smid = '<A href="'.$smid.'.html">'.$smid.'</A>'; }
3094:     return("\n\t\t\t".'<TD class=listtext>'.$smid.'</TD>');
3095: }
3096:
3097:
3098: ## WRITE_MESSAGE_LIST_DATE_ENTRY
3099: ##
3100: sub write_message_list_date_entry
3101: {
3102:     my($smid,$is_hyperlinked) = @_;
3103:
3104:     my($date);
3105:
3106:     if (defined $smid2date{$smid}) {
3107:     $date = $smid2date{$smid};
3108:     $date = UnixDate($date,"%d %b %y at %H:%M");
3109:     $date = HTML::Entities::encode_entities($date);
3110:     $date =~ s/\s/\&nbsp\;/g;
3111:     }
3112:     else { $date = ''; }
3113:     if ( $is_hyperlinked ) { $date = '<A href="'.$smid.'.html">'.$date.'</A>'; }
3114:     return("\n\t\t\t".'<TD class=listtext>'.$date.'</TD>');
3115: }
3116:
3117:
3118: ## WRITE_MESSAGE_LIST_SUBJECT_ENTRY
3119: ##
3120: sub write_message_list_subject_entry
3121: {
3122:     my($smid,$is_hyperlinked) = @_;
3123:
3124:     my($subject);
3125:
3126:     if (defined $smid2subject{$smid}) {
3127:     $subject = $smid2subject{$smid};
3128:     $subject = HTML::Entities::encode_entities($subject);
3129:     $subject =~ s/\s/\&nbsp\;/g;
3130:     }
3131:     else { $subject = ''; }
3132:     if ( length($subject) > 60 ) { $subject = substr($subject,0,60); }
3133:     if ( $is_hyperlinked ) { $subject = '<A href="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/Messages/'.$smid.'.html">'.$subject.'</A>'; }
3134:     return("\n\t\t\t".'<TD class=listtext>'.$subject.'</TD>');
3135: }
3136:
3137:
3138: ## WRITE_MESSAGE_LIST_SENDER_ENTRY
3139: ##
3140: sub write_message_list_sender_entry
3141: {
3142:     my($smid,$is_hyperlinked) = @_;
3143:
3144:     my($sender);
3145:
3146:     if ( (defined $smid2said{$smid}) and (defined $said2ana{$smid2said{$smid}} ) ) {
3147:     $sender = $said2ana{$smid2said{$smid}};
3148:     $sender = HTML::Entities::encode_entities($sender);
3149:     $sender =~ s/\s/\&nbsp\;/g;
3150:     }
3151:     else { $sender = ''; }
3152:     if ( $is_hyperlinked ) { $sender = '<A href="'.$smid.'.html">'.$sender.'</A>'; }
3153:     return("\n\t\t\t".'<TD class=listtext>'.$sender.'</TD>');
3154: }
3155:
3156:
3157: ## WRITE_ENTRY_IN_MESSAGE_INDEX
3158: ##
3159: sub write_entry_in_message_index
3160: {
3161:     my($smid) = @_;
3162:
3163:     my($html,$entry);
3164:
3165:     if ( defined($smid2sdt{$smid})
3166:   ## Do not include references to messages that are not in the archive.
3167:      and ( $smid2sdt{$smid} != -1 )
3168:      and defined($smid2said{$smid})
3169:      and defined($said2addr{$smid2said{$smid}})
3170:      and defined($smid2nsl{$smid}) ) {
3171:     $entry = $smid."\t".$said2addr{$smid2said{$smid}}."\t".$smid2sdt{$smid}."\t".$smid2nsl{$smid}."\t";
3172:     if ( ( $smid2spid{$smid} )
3173:          and ( $smid2sdt{$smid2spid{$smid}} != -1 )
3174:          and defined($smid2said{$smid2spid{$smid}})
3175:          and defined($said2addr{$smid2said{$smid2spid{$smid}}})
3176:          and defined($smid2nsl{$smid2spid{$smid}}) ) {
3177:         $entry .= $smid2spid{$smid}."\n";
3178:     }
3179:     else { $entry .= "no_parent\n"; }
3180:     $html .= "\t\t".'<TR align="left">';
3181:     $html .= &write_message_list_id_entry($smid,0);
3182:     $html .= &write_message_list_date_entry($smid,0);
3183:     $html .= &write_message_list_subject_entry($smid,1);
3184:     $html .= &write_message_list_sender_entry($smid,0);
3185:     $html .= "\n\t\t".'</TR>'."\n";
3186:     $entry .= $html;
3187:     $message_index_for_said = $message_index_file_name.'_for_'.$smid2said{$smid}.'.txt';
3188: ## Initialize the file for the player if this is the player's first message posted.
3189:     if ( not(-e $message_index_for_said) ) {
3190:         open(MSGINDEX,'>'.$message_index_for_said);
3191:         close(MSGINDEX);
3192:     }
3193:     open(MSGINDEX,'>>'.$message_index_for_said);
3194:     flock(MSGINDEX, LOCK_EX); ## exclusive lock, use LOCK_SH to read from the file
3195:     print MSGINDEX $entry;
3196:     close(MSGINDEX);
3197:     }
3198: }
3199:
3200: ## CLEAN_RAW_MESSAGE
3201: ##
3202: sub clean_raw_message
3203: {
3204:     my($next_line,$next_msg,$msg_header,$msg_body,%fields_and_values,$pos_next_msg,$boundary);
3205:     my($temp_filename);
3206:
3207:     ## Get rid of any extras carriage returns or line feeds. Windows, UNIX, and Apple
3208:     ## formats differ according to how newlines are encoded. This normalizes those
3209:     ## differences. Also, some newline characters get translated into =20 or =,
3210:     ## so these extra characters are discarded here.
3211:     $/ = "\n";
3212:     open(RAW,$raw_messages_file) || die "Can't open messages file\, $raw_messages_file\, $! \n";
3213:     my $random_number = int(rand(10000));
3214:     $temp_filename = 'temp_file_'.$random_number.'.txt';
3215:     open(TEMP,'>'.$temp_filename);
3216:     while ($next_line = <RAW>) {
3217:     if ( $next_line !~ /^From\s/ ) { ## Get rid of any leading "From " that is not an actual header.
3218:         chomp($next_line);
3219:         for ($next_line) {
3220:         s/^(.*)\s+$/$1/;
3221:         s/^(.*)\=20$/$1/;
3222:         s/^(.*)\=$/$1/;
3223:         }
3224:         print TEMP $next_line."\n";
3225:     }
3226:     }
3227:     close(RAW);
3228:     close(TEMP);
3229:     $/ = $end_of_message_marker;
3230:     open(TEMP,$temp_filename);
3231:     ## Now strip out any HTML formatting in the messages and simplify the headers.
3232:     open(ARCHIVEFILE,'>>'.$archive_file_name) || die "Can't open file for output, $archive_file_name: $!\n";
3233:     while ($next_msg = <TEMP>) {
3234:     undef %fields_and_values;
3235:     %fields_and_values = &extract_fields_and_values_from_message($next_msg);
3236: ## separate message header and body
3237:     if ( $next_msg =~ m/\n\n/g ) {
3238:         $pos_next_msg = pos($next_msg);
3239:         $msg_header = substr($next_msg,0,$pos_next_msg);
3240:         $msg_body = substr($next_msg,$pos_next_msg,length($next_msg)-$pos_next_msg);
3241:         $next_msg = 'newsgroups: '.$news_group."\n";
3242:         foreach $field (sort keys %fields_and_values) {
3243:         $next_msg .= $field.' '.$fields_and_values{$field}."\n";
3244:         }
3245:      ## If the message has been sent in plain text and HTML, strip out the HTML.
3246:         if ( (exists $fields_and_values{'content-type:'})
3247:          and ( $fields_and_values{'content-type:'} =~ /multipart/i ) ) {
3248:         $boundary = &get_boundary_from_content_type($fields_and_values{'content-type:'});
3249:         $next_msg .= "\n".&remove_html_from_message_body($msg_body,$boundary);
3250:         }
3251:         else { $next_msg .= "\n".$msg_body; }
3252:     }
3253:     print ARCHIVEFILE $next_msg;
3254:     }
3255:     $/ = "\n";
3256:     close(ARCHIVEFILE);
3257:     close(TEMP);
3258:     unlink($temp_filename);
3259: }
3260:
3261:
3262: ## GET_BOUNDARY_FROM_CONTENT_TYPE
3263: ##
3264: sub get_boundary_from_content_type
3265: {
3266:     my($content_type) = @_;
3267:
3268:     if ( $content_type =~ /boundary\=\"([^\"]+)\"/ ) { return quotemeta($1); }
3269: }
3270:
3271:
3272: ## REMOVE_HTML_FROM_MESSAGE_BODY
3273: ##
3274: ## It might be better to do this with the MIME::Parser module, especially
3275: ## if the message attachments are to be saved and analyzed, but the
3276: ## function below simply looks for the part of the message that is encoded
3277: ## as plain/text and discards all of the other parts of the message.
3278: ##
3279: sub remove_html_from_message_body
3280: {
3281:     my($msg_body,$top_level_boundary) = @_;
3282:
3283:     my($inputrecordseparator,@msg_body,$boundary,$save_lines_p);
3284:     my($msg_body_part,@lines,$line);
3285:
3286:     $inputrecordseparator = $/;
3287:     $/ = "\n";
3288:     @msg_body = &split_message_body_at_boundary($top_level_boundary,$msg_body);
3289:     $boundary = '__this_is_a_highly_unlikely_mime_message_boundary__';
3290:     $save_lines_p = 0;
3291:     foreach $msg_body_part (@msg_body) {
3292:     @msg_body_part = split /\n/,$msg_body_part;
3293:     undef @lines;
3294:     foreach $line (@msg_body_part) {
3295:         if ( not($save_lines_p) ) {
3296:         if ( $line =~ /boundary\=\"([^\"]+)\"/ ) { $boundary = quotemeta($1); }
3297:         elsif ( $line =~ /^content-type\:\s*text\/plain/mi ) { $save_lines_p = 1; }
3298:         }
3299:         else {
3300:         if ( $line =~ /$boundary/ ) { last; }
3301:         elsif ( ( $line !~ /charset\=/i )
3302:             and ( $line !~ /content\-transfer\-encoding/i ) ) {
3303:             push(@lines,$line);
3304:         }
3305:         }
3306:     }
3307:     if ( @lines ) {
3308:         $plain_text = join "\n",@lines;
3309:         last;
3310:     }
3311:     }
3312:     $/ = $inputrecordseparator;
3313:     return($plain_text."\n".$end_of_message_marker);
3314: }
3315:
3316:
3317: ## SPLIT_MESSAGE_BODY_AT_BOUNDARY
3318: ##
3319: sub split_message_body_at_boundary
3320: {
3321:     my($boundary,$msg_body) = @_;
3322:     my(@msg_body,$line,@msg_parts,$msg_part);
3323:
3324:     $msg_part = '';
3325:     @msg_body = split /\n/,$msg_body;
3326:     foreach $line (@msg_body) {
3327:     if ( $line =~ /$boundary/ ) {
3328:         push(@msg_parts,$msg_part);
3329:         $msg_part = '';
3330:     }
3331:     else { $msg_part .= "\n".$line; }
3332:     }
3333:     if ( $msg_part ) { push(@msg_parts,$msg_part); }
3334:     return @msg_parts;
3335: }
3336:
3337:
3338: ## PRINT_TO_LOG
3339: ##
3340: sub print_to_log
3341: {
3342:     my($status_message_string) = @_;
3343:
3344:     my($time);
3345:
3346:     open(LOG,'>>'.$log_file);
3347:     $time = localtime;
3348:     print LOG $time.' >> ';
3349:     print LOG $status_message_string."\n";
3350:     close(LOG);
3351:     print STDOUT $time.' >> ';
3352:     print STDOUT $status_message_string."\n";
3353: }
3354:
3355:
3356: ## CHECK_YAHOO_MAIL
3357: ##
3358: sub check_yahoo_mail
3359: {
3360:     my(@ids,$id,$msg,$msg_header);
3361:
3362:     my($yahoo_client);
3363:
3364:     $new_messages_downloaded = 0;
3365:     &print_to_log("Logging in...");
3366:     eval {
3367:     $yahoo_client = Mail::Client::Yahoo->login(username => $yahoo_uid,
3368:                            password => $yahoo_password,
3369:                            secure => 0
3370:                           )
3371:     };
3372:     if ( $@ ) {
3373:     &print_to_log("ERROR: Failed login...");
3374:     sleep($pause);
3375:     return;
3376:     }
3377:     &print_to_log("Checking Inbox...");
3378:     eval { $yahoo_client->select_folder('Inbox') };
3379:     if ( $@ ) {
3380:     &print_to_log("ERROR: Couldn't find Inbox...");
3381:     eval { $yahoo_client->logout() };
3382:     if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3383:      sleep($pause);
3384:     return;
3385:     }
3386:     eval { @ids = $yahoo_client->message_list() };
3387:     if ( $@ ) {
3388:     &print_to_log("ERROR: Couldn't find message ids in Inbox...");
3389:     eval { $yahoo_client->logout() };
3390:     if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3391:     sleep($pause);
3392:     return;
3393:     }
3394:     foreach $id (@ids) {
3395:     if ( defined($yahooid{$id}) ) { next; }
3396:     else {
3397:         $yahooid{$id} = 1;
3398:         eval { $msg = $yahoo_client->message($id) };
3399:         if ( $@ ) {
3400:         &print_to_log("ERROR: Couldn't get message $id...");
3401:         eval { $yahoo_client->logout() };
3402:         if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3403:         sleep($pause);
3404:         return;
3405:         }
3406:         $msg_header = $msg->head();
3407:      ## Messages from $news_group are selected by examining the
3408:      ## subject lines.
3409:         if ( $msg_header->get('Subject') =~ /$news_group/i ) {
3410:         open(RAW,'>'.$raw_messages_file);
3411:         $msg->print(\*RAW);
3412:         print RAW "\n".$end_of_message_marker."\n";
3413:         close(RAW);
3414:         &clean_raw_message();
3415:         $new_messages_downloaded++;
3416:         &print_to_log("Downloaded $id");
3417:         eval { $yahoo_client->move_message($id,$yahoo_outbox) };
3418:         if ( $@ ) {
3419:             &print_to_log("ERROR: Unable to move message out of Inbox into $yahoo_outbox...");
3420:             sleep($pause);
3421:             return;
3422:         }
3423:         }
3424:     }
3425:     }
3426:     eval { $yahoo_client->logout() };
3427:     if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3428:     if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); }
3429:     sleep($pause);
3430: }
3431:
3432:
3433:
3434: ## CHECK_NEWSGROUP
3435: ##
3436: sub check_newsgroup
3437: {
3438:     my($since) = @_;
3439:
3440:     my($now,$msgid,$message_ids_ptr,$since_date,$article_lines_ptr,$article);
3441:
3442:     $new_messages_downloaded = 0;
3443:     $now = $since; # or, should this be, time() ? this seems to depend on the NNTP server and the machine
3444:     ## Get a pointer to a list of message ids posted since server was last checked ($since).
3445:     $since_date = &ParseDateString("epoch $since");
3446:     $since_date = &UnixDate($since_date,"%d %b %y at %H:%M");
3447:     &print_to_log("Accessing ".$nntp_server." to download ids of messages posted to ".$news_group." since ".$since_date." GMT");
3448:     eval { $message_ids_ptr = $nntp->newnews($since,$news_group) };
3449:     if ( $@ ) {
3450:     &print_to_log("ERROR: Couldn't get message ids from NNTP server...");
3451:     sleep($pause);
3452:     return($since);
3453:     }
3454:     foreach $msgid (@{$message_ids_ptr}) {
3455:     chomp($msgid);
3456:     if ( exists($nntpid{$msgid}) ) { next; }
3457:     $nntpid{$msgid} = 1;
3458:     eval { $article_lines_ptr = $nntp->article($msgid) };
3459:     if ( $@ ) {
3460:         &print_to_log("ERROR: Couldn't get message $msgid from NNTP server...");
3461:         sleep($pause);
3462:         return($since);
3463:     }
3464:     $article = join '',@{$article_lines_ptr};
3465:     open(RAW,'>'.$raw_messages_file);
3466:     print RAW $article."\n".$end_of_message_marker."\n";
3467:     close(RAW);
3468:     &clean_raw_message();
3469:     $new_messages_downloaded++;
3470:     &print_to_log("Downloaded $msgid");
3471:     }
3472:     if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); }
3473:     sleep($pause);
3474:     return($now);
3475: }
3476:
3477:
3478: ## CHECK_IMAP_MAIL
3479: ##
3480: ## This is not debugged.
3481: ##
3482: sub check_imap_mail
3483: {
3484:     my(@ids,$id,$msg,$msg_header);
3485:
3486:     my($imap_client,$imapid,$number_of_messages,$lines_ref,$lines,$line,$from_mailing_list_p);
3487:
3488:     $new_messages_downloaded = 0;
3489:     &print_to_log("Connecting to IMAP server $imap_server...");
3490:     eval { $imap_client = new Net::IMAP::Simple::SSL($imap_server) }; ## frequently hangs; cause: unknown
3491:     if ( $@ ) {
3492:     &print_to_log("ERROR: Failed to connect to server...");
3493:     sleep($pause);
3494:     return;
3495:     }
3496:     &print_to_log("Logging in...");
3497:     eval { $imap_client->login( $imap_uid => $imap_password ) };
3498:     if ( $@ ) {
3499:     &print_to_log("ERROR: Failed log in...");
3500:     sleep($pause);
3501:     return;
3502:     }
3503:     &print_to_log("Checking Inbox...");
3504:     eval { $number_of_messages = $imap_client->select( 'Inbox' ) };
3505:     if ( $@ ) {
3506:     &print_to_log("ERROR: Couldn't find Inbox...");
3507:     eval { $imap_client->quit() };
3508:     if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3509:      sleep($pause);
3510:     return;
3511:     }
3512:     foreach $imapid ( 1..$number_of_messages ) {
3513:     if ( not($imap_client->seen($imapid)) ) {
3514:      # get the message, returned as a reference to an array of lines
3515:         $lines_ref = $imap_client->get($imapid);
3516:         $from_mailing_list_p = 0;
3517:         foreach $line (@{$lines_ref}) {
3518:         if ( $line =~ /^subject:\s+$news_group/i ) {
3519:             $from_mailing_list_p = 1;
3520:             last;
3521:         }
3522:         }
3523:         if ( $from_mailing_list_p ) {
3524:         $lines = join '',@{$lines_ref};
3525:         open(RAW,'>'.$raw_messages_file);
3526:         print RAW $lines;
3527:         print RAW "\n".$end_of_message_marker."\n";
3528:         close(RAW);
3529:         &clean_raw_message();
3530:         $new_messages_downloaded++;
3531:         &print_to_log("Downloaded $imapid");
3532:         eval { $imap_client->copy($imapid,$imap_outbox) };
3533:         if ( $@ ) {
3534:             &print_to_log("ERROR: Unable to move message out of Inbox into $imap_outbox...");
3535:             eval { $imap_client->quit() };
3536:             if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3537:             sleep($pause);
3538:             return;
3539:         }
3540:         eval { $imap_client->delete($imapid) };
3541:         if ( $@ ) {
3542:             &print_to_log("ERROR: Unable to delete message from Inbox after moving it to $imap_outbox...");
3543:             eval { $imap_client->quit() };
3544:             if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3545:             sleep($pause);
3546:             return;
3547:         }
3548:         }
3549:     }
3550:     }
3551:     eval { $imap_client->quit() };
3552:     if ( $@ ) { &print_to_log("ERROR: Failed logout..."); }
3553:     if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); }
3554:     sleep($pause);
3555:     return;
3556:     sleep($pause);
3557: }
3558:
3559:
3560: ## YESTERDAY_IN_SECONDS
3561: ##
3562: sub yesterday_in_seconds
3563: {
3564:     my($yesterday,@yesterday_as_strings,@yesterday);
3565:     my($elem,$sec,$min,$hour,$mday,$mon,$year,$time);
3566:
3567:     ## Use ParseDate to find the date and time of yesterday.
3568:     $yesterday = ParseDate('24 hours ago');
3569:     ## Simplify and format the date and time into a list of strings.
3570:     @yesterday_as_strings = UnixDate($yesterday,"%S","%M","%k","%e","%m","%y");
3571:     ## Convert the strings into integers.
3572:     @yesterday = ();
3573:     foreach $elem (@yesterday_as_strings) {
3574:     $elem += 0;
3575:     push(@yesterday,$elem);
3576:     }
3577:     ($sec,$min,$hour,$mday,$mon,$year) = @yesterday;
3578:     ## Decrement the month ($mon) to accord with the gmtime format
3579:     ## (wherein the month runs from 0..11).
3580:     $mon--;
3581:     ## Calculate the number of seconds between yesterday and 1970.
3582:     $time = timegm($sec,$min,$hour,$mday,$mon,$year);
3583:     return($time);
3584: }
3585:
3586:
3587: ## FETCH_NEW_MESSAGES_FROM_SERVER
3588: ##
3589: ## This function simply calls a more specific function.
3590: ## It is a placeholder for a function that periodically
3591: ## fetches messages off of a server. The function called
3592: ## here must address the following issues:
3593: ##
3594: ## (1) Timing: Some form of wait function should be built into the
3595: ## function so that it is not constantly pinging the
3596: ## server for new messages but, rather, polls from the
3597: ## server only once in a while. The time period to
3598: ## wait will depend on the server.
3599: ##
3600: ## (2) Message format: The fetching function must write the
3601: ## fetched messages into the file named $archive_file_name.
3602: ## These messages should be appended into that file so that
3603: ## more messages can be downloaded even if the system is not
3604: ## done processing the current messages. The messages should
3605: ## be rendered in plain text (i.e., for example, HTML should
3606: ## be stripped out of the messages). The messages should be
3607: ## formatted according to RFC-1036 conventions. One field of
3608: ## the messages must be 'newsgroups:' and must be filled
3609: ## with the name of the newsgroup specified in the agonistics
3610: ## configuration file. Finally, after the end of the
3611: ## text of each message, on a separate line, the
3612: ## $end_of_message_marker must be printed.
3613: ##
3614: ## (3) Log: It is preferrable, but not necessary, for the fetching
3615: ## function to log its errors (and successes) in the $log_file
3616: ## using the &print_to_log function.
3617: ##
3618: ## See &check_yahoo_mail and the associated functions
3619: ## (e.g., &clean_raw_message) for an example implementation of
3620: ## these criteria.
3621: ##
3622: sub fetch_new_messages_from_server
3623: {
3624:     ## Download messages from a Yahoo email account; or,
3625:     if ( $yahoo_uid and $yahoo_password ) { &check_yahoo_mail(); }
3626:     ## download messages from a Usenet newsgroup server; or,
3627:     elsif ( $nntp_server ) {
3628: ## If no messages have been downloaded yet, download all message posted since yesterday.
3629:     if ( $time_of_last_nntp_server_check == 0 ) {
3630:         $time_of_last_nntp_server_check = &yesterday_in_seconds();
3631:     }
3632:     $time_of_last_nntp_server_check = &check_newsgroup($time_of_last_nntp_server_check);
3633:     }
3634:     ## download messages from an IMAP email server.
3635:     elsif ( $imap_server and $imap_uid and $imap_password ) { &check_imap_mail(); }
3636:     else { die "No Yahoo, NNTP or IMAP server was specified for the interactive downloading of messages."; }
3637: }
3638:
3639:
3640: ## REASSEMBLE_MESSAGE
3641: ##
3642: sub reassemble_message
3643: {
3644:     my($smid) = @_;
3645:
3646:     $message_header = '';
3647:     if ( $smid2newsgroups{$smid} ) { $message_header = 'newsgroups: '.$smid2newsgroups{$smid}."\n"; }
3648:     if ( $smid2date{$smid} ) { $message_header .= 'date: '.$smid2date{$smid}."\n"; }
3649:     if ( $smid2mid{$smid} ) {
3650:     $message_header .= 'message-id: '.$smid2mid{$smid}."\n";
3651:     if ( $mid2pid{$smid2mid{$smid}} ) { $message_header .= 'references: '.$mid2pid{$smid2mid{$smid}}."\n"; }
3652:     }
3653:     if ( $smid2said{$smid} and $said2ana{$smid2said{$smid}} ) { $message_header .= 'from: '.$said2ana{$smid2said{$smid}}."\n"; }
3654:     if ( $smid2subject{$smid} ) { $message_header .= 'subject: '.$smid2subject{$smid}."\n"; }
3655:     $message_body = '';
3656:     $ln = 0;
3657:     while ( defined($smidln2line{"$smid $ln"}) ) {
3658:     $message_body .= $smidln2line{"$smid $ln"}."\n";
3659:     $ln++;
3660:     }
3661:     return($message_header."\n".$message_body);
3662: }
3663:
3664:
3665: ## REORGANIZE_ARCHIVE
3666: ##
3667: sub reorganize_archive
3668: {
3669:     my(@smids,$mid,$last_batch_archive_file_name);
3670:
3671:     ## sort the messages by date
3672:     @smids = ();
3673:     foreach $mid (keys %analyzed_p) { push(@smids,$mid2smid{$mid}); }
3674:     @smids = sort { $smid2sdt{$a} <=> $smid2sdt{$b} } @smids;
3675:     ## store the last batch of messages
3676:     $last_batch_archive_file_name = $archive_file_name;
3677:     $last_batch_archive_file_name =~ s/\./\_$batch_of_messages\./;
3678:     if ( $last_batch_archive_file_name eq $archive_file_name ) { die "archive file name not renamed!"; }
3679:     ## Save the archive into a new file and delete the existing archive file
3680:     copy($archive_file_name,$last_batch_archive_file_name);
3681:     unlink($archive_file_name);
3682:     ## print the last ( $max_frames / 2 ) into the (now empty) archive; and,
3683:     ## update %analyzed_p so that it only contains message ids from messages not in the new archive
3684:     open(ARCHIVEFILE,'>'.$archive_file_name);
3685:     $first_smid_index = $#smids / 2;
3686:     foreach $smid (@smids[$first_smid_index..$#smids]) {
3687:     $message = &reassemble_message($smid);
3688:     print ARCHIVEFILE $message.$end_of_message_marker."\n";
3689:     }
3690:     close(ARCHIVEFILE);
3691: }
3692:
3693:
3694: ## CREATE_BLANK_PAGES
3695: ##
3696: sub create_blank_pages
3697: {
3698:     my(@interface_files,$fn,$n,$html);
3699:
3700:     opendir(ID,$interface_directory);
3701:     @interface_files = readdir(ID);
3702:     closedir(ID);
3703:     $html = '<HTML>
3704: <HEAD>
3705: <META http-equiv="Refresh" content="1; URL=agon2.html">
3706: <TITLE>agonistics</TITLE>
3707: <LINK type="text/css" rel="stylesheet" href="interface_style.css">
3708: </HEAD>
3709: <BODY>
3710: </BODY>
3711: </HTML>';
3712:     open(AGON,'>'.$interface_directory.$slash.'agon1.html');
3713:     print AGON $html;
3714:     close(AGON);
3715:     $html =~ s/agon2/agon1/;
3716:     foreach $fn (@interface_files) {
3717:     if ( $fn =~ /^agon([0-9]+)\.html/ ) {
3718:         $n = $1;
3719:         if ( $n != 1 ) {
3720:         open(AGON,'>'.$interface_directory.$slash.'agon'.$n.'.html');
3721:         print AGON $html;
3722:         close(AGON);
3723:         }
3724:     }
3725:     }
3726:     return;
3727: }
3728:
3729:
3730: ## INTERACTIVE_ANALYSIS
3731: ##
3732: ## Messages are periodically pulled from a server and analyzed.
3733: ##
3734: sub interactive_analysis
3735: {
3736:     my($l,$small_pause);
3737:
3738:     while (1) {
3739:     if ( $new_messages_downloaded <= 0 ) { &fetch_new_messages_from_server(); }
3740:     if ( $new_messages_downloaded > 0 ) {
3741:         $frame_number++;
3742:         $file_pointer_in_messages_archive = &process_next_message($file_pointer_in_messages_archive);
3743:         $new_messages_downloaded--;
3744:     }
3745: # write another version of the last page and have it point back to the
3746: # current page.
3747:     if (-e $interface_directory.$slash.'agon'.$frame_number.'.html') {
3748:         open(PREVIOUSAGON,$interface_directory.$slash.'agon'.$frame_number.'.html');
3749:         @current_page_contents = <PREVIOUSAGON>;
3750:         close(PREVIOUSAGON);
3751:         open(NEXTAGON,'>'.$interface_directory.$slash.'agon'.($frame_number + 1).'.html');
3752:         foreach $l (@current_page_contents) {
3753:         if ( $l =~ /content\=\"$pause_between_frames\;\s+URL\=agon([0-9]+)/ ) {
3754:             $l =~ s/agon[0-9]+/agon$frame_number/i;
3755:             $small_pause = int($pause/2);
3756:             $l =~ s/content\=\"$pause_between_frames\;/content\=\"$small_pause\;/i;
3757:         }
3758:         print NEXTAGON $l;
3759:         }
3760:         close(NEXTAGON);
3761:     }
3762:     if ( $frame_number > $max_frames ) { last; }
3763:     }
3764:     ## If max frames reached, restart the analysis
3765:     &analyze_messages();
3766: }
3767:
3768:
3769: ## STATIC_ANALYSIS
3770: ##
3771: ## Messages are analyzed from a static archive of messages.
3772: ##
3773: sub static_analysis
3774: {
3775:     my($l,$small_pause,@interface_files,$fn,$n);
3776:
3777:     while ($file_pointer_in_messages_archive < (-s $archive_file_name) ) {
3778:     $frame_number++;
3779:     $file_pointer_in_messages_archive = &process_next_message($file_pointer_in_messages_archive);
3780:     }
3781:     # write another version of the last page and have it point back to the
3782:     # current page.
3783:     if (-e $interface_directory.$slash.'agon'.$frame_number.'.html') {
3784:     open(PREVIOUSAGON,$interface_directory.$slash.'agon'.$frame_number.'.html');
3785:     @current_page_contents = <PREVIOUSAGON>;
3786:     close(PREVIOUSAGON);
3787:     open(NEXTAGON,'>'.$interface_directory.$slash.'agon'.($frame_number + 1).'.html');
3788:     foreach $l (@current_page_contents) {
3789:         if ( $l =~ /content\=\"$pause_between_frames\;\s+URL\=agon([0-9]+)/ ) {
3790:         $l =~ s/agon[0-9]+/agon$frame_number/i;
3791:         $small_pause = int($pause/2);
3792:         $l =~ s/content\=\"$pause_between_frames\;/content\=\"$small_pause\;/i;
3793:         }
3794:         print NEXTAGON $l;
3795:     }
3796:     close(NEXTAGON);
3797:     }
3798:     ## Erase any blank interface pages still remaining.
3799:     opendir(ID,$interface_directory);
3800:     @interface_files = readdir(ID);
3801:     closedir(ID);
3802:     foreach $fn (@interface_files) {
3803:     if ( $fn =~ /^agon([0-9]+)\.html/ ) {
3804:         $n = $1;
3805:         if ( $n > ($frame_number + 1) ) {
3806:         unlink($interface_directory.$slash.'agon'.$n.'.html');
3807:         }
3808:     }
3809:     }
3810:     ## If max frames reached, restart the analysis
3811:     if ( $frame_number > $max_frames ) { &analyze_messages(); }
3812: }
3813:
3814:
3815: ## ANALYZE_MESSAGES
3816: ##
3817: sub analyze_messages
3818: {
3819:     &initialize_global_variables();
3820:     $batch_of_messages++;
3821:     ## Make sure that clients viewing interface pages are moved to a blank page
3822:     ## before any pages are deleted.
3823:     sleep($pause_between_frames+2);
3824:     ## If the archive does not yet exist, create it.
3825:     if ( not(-e $archive_file_name) ) {
3826:     open(ARCHIVEFILE,'>'.$archive_file_name);
3827:     close(ARCHIVEFILE);
3828:     }
3829:     $file_pointer_in_messages_archive = 0;
3830:     ## case 1: The messages to be analyzed are to be fetched periodically from
3831:     ## a server. This is the interactive mode.
3832:     ## case 2: Messages are analyzed from a static archive of messages.
3833:     ## case 3: There is already an archive messages to be analyzed and then --
3834:     ## after the archive is analyzed -- messages will be pulled from the server and
3835:     ## analyzed interactively.
3836:     if ( (-e $archive_file_name) and ( $is_interactive_p eq 'YES' ) ) {
3837:     &static_analysis();
3838:     &interactive_analysis();
3839:     }
3840:     elsif ( $is_interactive_p eq 'YES' ) { &interactive_analysis(); }
3841:     else { &static_analysis(); }
3842: }
3843:
3844:
3845: &analyze_messages();