agonistics: a language game w a r r e n s a c k <wsack@media.mit.edu> In the spirit of two exhibitions curated by Christiane Paul,
CODeDOC 2: 3: ######################################################################### 4: ## 5: ## RUN_AGONISTICS 6: ## 7: ## Warren Sack (wsack@media.mit.edu) 8: ## 9: ## February 2005 10: ## 11: ## Usage: perl run_agonistics.pl <configuration file> 12: ## 13: ## Preconditions: 13: ## (1) The file configuration file needs to exist and contain 14: ## a correct set of values for all of the necessary 15: ## parameters. 16: ## 17: ## (2) The sub-directory "Resources" needs to be exist within 18: ## the directory that contains this script. The Resources 19: ## sub-directory contains a number of images and CGI scripts. 20: ## 21: ## (3) If this script is going be run by polling new messages 22: ## periodically from a server, a means to connect and 23: ## download those messages needs to be arranged before the 24: ## script is run. See the example configuration files 25: ## *.conf for details concerning, for example, the use of 26: ## a Yahoo email account as an means to poll and archive 27: ## new messages from a mailing list. 28: ## 29: ######################################################################### 30: ## 31: ## Copyright (c) 2005 by Warren Sack 32: ## 33: ## This work is copyrighted with a Creative Commons 34: ## (http://creativecommons.org) Attribution-NonCommercial-NoDerivs 2.0 35: ## License. 36: ## 37: ## Here is a short summary of the license: 38: ## 39: ## You are free to copy, distribute, display, and perform the work 40: ## under the following conditions: 41: ## 42: ## Attribution: You must give the original author (Warren Sack) credit. 43: ## 44: ## Noncommercial: You may not use this work for commercial purposes. 45: ## 46: ## No Derivative Works: You may not alter, transform, or build upon this work. 47: ## 48: ## * For any reuse or distribution, you must make clear to others the 49: ## license terms of this work. 50: ## * Any of these conditions can be waived if you get permission from 51: ## the copyright holder (Warren Sack). 52: ## * Your fair use and other rights are in no way affected by the above. 53: ## * The details and full text of the license can be found at this URL: 54: ## http://creativecommons.org/licenses/by-nc-nd/2.0/legalcode 55: ## 56: ######################################################################### 57: 58: if ( < 0 ) { 59: die "Usage: run_agonistics.pl <configuration file>\n". 60: "This script requires one argument.\n"; 61: } 62: elsif ( not(-e $ARGV[0]) ) { 63: die "Usage: run_agonistics.pl <configuration file>\n". 64: "This script requires a configuration file. The given configuration file cannot be found.\n"; 65: } 66: 67: 68: use utf8; 69: use Unicode::Normalize; 70: use HTML::Entities(); 71: use File::Temp qw(tempfile); 72: use File::Path; 73: use Date::Manip qw(ParseDate ParseDateString UnixDate); 74: use Time::Local; 75: use File::Copy; 76: use MIME::WordDecoder; 77: use Lingua::Stem; 78: use Fcntl qw(:flock); 79: use Crypt::SSLeay; 80: use Mail::Client::Yahoo; 81: use Digest::MD5; 82: use Net::NNTP; 83: use Net::SSLeay; 84: use IO::Socket::SSL; 85: use Net::IMAP::Simple::SSL; 86: 87: $batch_of_messages = 0; 88: 89: ## READ_CONFIG_FILE 90: ## 91: sub read_config_file 92: { 93: my($key,$value); 94: 95: open(CONFIG,$ARGV[0]) || die "Can't find configuration file: $ARGV[0]\n"; 96: while(<CONFIG>) { 97: chomp; 98: if ( /^\#/ ) { next; } 99: if ( /^\s*$/ ) { next; } 100: ($key,$value) = $_ =~ /^(\S+)\s+(\S+)/; 101: $agonistics_config{$key} = $value; 102: } 103: close(CONFIG); 104: } 105: 106: 107: ## INITIALIZE_GLOBAL_VARIABLES 108: ## 109: ## Input: None 110: ## 111: ## Effects: Global variables associated with this package are initialized. 112: ## 113: ## Output: None 114: ## 115: sub initialize_global_variables 116: { 117: ## initializations done only before the first batch of messages is processed 118: if ( $batch_of_messages == 0 ) { 119: ## Open and read the config file into a hash. 120: %agonistics_config = (); 121: &read_config_file(); 122: ## Before setting slash, the machine's OS is checked. 123: if ( $ENV{'OSTYPE'} =~ /^win/i ) { $slash = '\\'; } 124: else { $slash = '/'; } 125: $news_group = $agonistics_config{'Newsgroup'}; 126: $archive_file_name = $agonistics_config{'FileNameOfArchive'}; 127: $language_locale = $agonistics_config{'LanguageTag'}; 128: if ( defined($agonistics_config{'MaxFrames'}) ) { 129: $max_frames = $agonistics_config{'MaxFrames'}; 130: } 131: else { $max_frames = 1000; } 132: if ( defined($agonistics_config{'PauseBetweenFrames'}) ) { 133: $pause_between_frames = $agonistics_config{'PauseBetweenFrames'}; 134: } 135: else { $pause_between_frames = 7; } 136: ## Load the correct end-of-sentence tagger. English and French 137: ## texts both use the English tagger. German texts use a 138: ## different tagger. 139: if ($language_locale eq 'DE') { 140: require Lingua::DE::Sentence; 141: Lingua::DE::Sentence->import( qw(get_sentences) ); 142: } 143: else { 144: require Lingua::EN::Sentence; 145: Lingua::EN::Sentence->import( qw(get_sentences) ); 146: } 147: $archive_name = $agonistics_config{'DirectoryForOutput'}; 148: $recency = $agonistics_config{'Recency'}; 149: $documents_url = $agonistics_config{'DocumentsURL'}; 150: $cgi_url = $agonistics_config{'CGIURL'}; 151: $web_server_directory = $agonistics_config{'WebServerDirectory'}; 152: $web_server_cgi_directory = $agonistics_config{'WebServerCGIDirectory'}; 153: $is_interactive_p = $agonistics_config{'InteractiveMode'}; 154: if ( $is_interactive_p =~ /n/i ) { $is_interactive_p = 'NO'; } 155: else { $is_interactive_p = 'YES'; } 156: ## Record the address to be used for posting messages to the list analyzed. 157: $mailing_list_address = $agonistics_config{'MailingListAddress'}; 158: ## Note information about the Yahoo mail account, if it is to be used. 159: $yahoo_uid = $agonistics_config{'YahooUID'}; 160: $yahoo_password = $agonistics_config{'YahooPassword'}; 161: $yahoo_outbox = $agonistics_config{'YahooOutbox'}; 162: ## Note information about the IMAP mail server and account, if it is to be used. 163: $imap_server = $agonistics_config{'IMAPServer'}; 164: $imap_uid = $agonistics_config{'IMAPUID'}; 165: $imap_password = $agonistics_config{'IMAPPassword'}; 166: $imap_outbox = $agonistics_config{'IMAPOutbox'}; 167: ## Note information about the NNTP account and server, if it is to be used. 168: $nntp_server = $agonistics_config{'NNTPServer'}; 169: $nntp_uid = $agonistics_config{'NNTPUID'}; 170: $nntp_password = $agonistics_config{'NNTPPassword'}; 171: if ( ( $nntp_server and ( $yahoo_uid or $imap_server ) ) 172: or ( $yahoo_uid and ( $nntp_server or $imap_server ) ) 173: or ( $imap_server and ( $nntp_server or $yahoo_uid ) ) ) { 174: die "Only one of the following may be defined: (a) NNTP server; (b) Yahoo UID; (c) IMAP server". 175: "\nTwo of the three need to be commented out in the configuration file $ARGV[0]\n"; 176: } 177: ## How many seconds should the script wait between tries to download 178: ## messages from the server? 179: $pause = $agonistics_config{'PauseBetweenFetches'}; 180: $output_directory = $web_server_directory.$slash.'Agonistics'.$slash.$archive_name; 181: $cgi_directory = $web_server_cgi_directory.$slash.'Agonistics'; 182: ## create the directory to house the CGI scripts 183: mkdir($cgi_directory); 184: ## copy the CGI scripts into the CGI directory 185: my $cgi_script_file; 186: my $send_script_file_found_p = 0; 187: opendir CGISCRIPTS, 'Resources'.$slash.'CGIScripts' or die "Cannot open CGIScripts directory: $!"; 188: foreach $cgi_script_file (readdir CGISCRIPTS) { 189: if ( $cgi_script_file eq 'send_message.pl' ) { 190: &rewrite_send_script_file('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file); 191: $send_script_file_found_p = 1; 192: } 193: else { copy('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file); } 194: chmod(0777,$cgi_directory.$slash.$cgi_script_file); 195: } 196: closedir CGISCRIPTS; 197: if ( $send_script_file_found_p == 0 ) { die "Can't find send_message.pl CGI script\n"; } 198: ## Alternatively load the German or English+French end-of-sentence 199: ## tagger depending upon the language specified on the command line. 200: if ( $language_locale =~ /^DE/ ) { 201: require Lingua::DE::Sentence; 202: Lingua::DE::Sentence->import( qw(get_sentences) ); 203: } 204: else { 205: require Lingua::EN::Sentence; 206: Lingua::EN::Sentence->import( qw(get_sentences) ); 207: } 208: $stemmer = Lingua::Stem->new({-locale => $language_locale}); 209: $stemmer->stem_caching({ -level => 2 }); 210: $log_file = $agonistics_config{'FileNameOfLog'}; 211: open(LOG,'>'.$log_file); 212: close(LOG); 213: $radius_of_circle = 100000; 214: my $random_number = int(rand(10000)); 215: $raw_messages_file = 'raw_messages_file_'.$random_number.'.txt'; 216: $end_of_message_marker = '__end_of_message_marker__'; 217: ## English stop words 218: @english_stop_words = ("a", "about", "above", "according", "across", "actually", "adj", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren", "around", "as", "at", "b", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "behind", "being", "below", "beside", "besides", "between", "beyond", "billion", "both", "but", "by", "c", "can", "can", "cannot", "caption", "co", "could", "couldn", "d", "did", "didn", "do", "does", "doesn", "don", "down", "during", "e", "each", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "f", "few", "fifty", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "further", "g", "h", "had", "has", "hasn", "have", "haven", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "instead", "into", "is", "isn", "it", "its", "itself", "j", "k", "l", "last", "later", "latter", "latterly", "least", "less", "let", "like", "likely", "ll", "ltd", "m", "made", "make", "makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million", "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my", "myself", "n", "namely", "neither", "never", "nevertheless", "next", "nine", "ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "overall", "own", "p", "per", "perhaps", "q", "r", "rather", "re", "recent", "recently", "s", "same", "seem", "seemed", "seeming", "seems", "seven", "seventy", "several", "she", "should", "shouldn", "since", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "stop", "such", "t", "taking", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thirty", "this", "those", "though", "thousand", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "trillion", "twenty", "two", "u", "under", "unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using", "v", "ve", "very", "via", "w", "was", "wasn", "we", "well", "were", "weren", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "with", "within", "without", "would", "wouldn", "wrote", "x", "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "http", "www", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "com", "edu", "org", "just", "net", "ftp", "nntp", "http", "html"); 219: @english_present_tense_of_to_be = ( "am", "are", "is" ); 220: ## French stop words 221: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks). 222: ## This appears to be necessary since the diacritical marks are not always employed by the participants 223: ## and they are oftentimes lost between the client and server. 224: @french_stop_words = ( "a", "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "elles", "en", "et", "eux", "il", "ils", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "m\x{EA}me", "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "plus", "pour", "qu", "que", "qui", "bien", "bon", "bonne", "bonnes", "faire", "fais", "fait", "faisons", "font", "sa", "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "c", "d", "j", "l", "\x{E0}", "m", "n", "s", "t", "y", "\x{E9}t\x{E9}", "ete", "\x{E9}t\x{E9}e", "etee", "\x{E9}t\x{E9}es", "etees", "\x{E9}t\x{E9}s", "etes", "\x{E9}tant", "etant", "\x{E9}tante", "etante", "\x{E9}tants", "etants", "\x{E9}tantes", "etantes", "suis", "es", "est", "sommes", "\x{EA}tes", "etes", "sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient", "\x{E9}tais", "etais", "\x{E9}tait", "etait", "\x{E9}tions", "etions", "\x{E9}tiez", "etiez", "\x{E9}taient", "etaient", "fus", "fut", "f\x{FB}mes", "fumes", "f\x{FB}tes", "futes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "f\x{FB}t", "fut", "fussions", "fussiez", "fussent", "ayant", "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient", "eut", "e\x{FB}mes", "eumes", "e\x{FB}tes", "eutes", "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "e\x{FB}t", "eut", "eussions", "eussiez", "eussent", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "http", "message", "jamais", "fr", "sans", "non", "oui", "dire", "dis", "dit", "disez", "disons", "disent", "faut", "toujours", "que", "quel", "quelle", "quelles", "quelqu", "quelque", "quelques", "quels", "qui", "quiconque", "quoi", "quoiqu", "quoique", "\x{E7}a", "ca", "car", "ce", "ceci", "cela", "celle", "l\x{E0}", "la", "celles", "ci", "celles", "celui", "ces", "cet", "cette", "ceux", "comme", "vrai", "\x{EA}tre", "etre", "\x{E9}crit", "ecrit", "rien", "tout", "tous", "toute", "toutes", "si", "fr", "com", "edu", "org", "autres", "autre", "sauf", "vais", "vas", "va", "venons", "venez", "vont", "aller", "aimer", "aiment", "aimons", "aime", "aimes", "mal", "news", "nntp", "html", "ftp", "net", "part", "puis", "voir", "quand", "tant", "autant", "leur", "leurs", "tien", "tienne", "tiens", "tiennes", "mien", "mienne", "miens", "miennes", "soi", "aussi", "ailleurs", "moins", "alors", "passe", "pass\x{E9}", "avoir" ); 225: @french_present_tense_of_to_be = ( "suis", "es", "est", "\x{EA}tes", "etes", "sommes", "sont" ); 226: ## German stop words 227: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks). 228: ## This appears to be necessary since the diacritical marks are not always employed by the participants 229: ## and they are oftentimes lost between the client and server. 230: @german_stop_words = ( "aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann", "der", "den", "des", "dem", "die", "das", "da\x{DF}", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dir", "du", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "er", "ihn", "ihm", "es", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures", "f\x{FC}r", "fur", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier", "hin", "hinter", "ich", "mich", "mir", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "im", "in", "indem", "ins", "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "k\x{F6}nnen", "konnen", "k\x{F6}nnte", "konntemachen", "man", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "sie", "ihnen", "sind", "so", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "\x{FC}ber", "uber", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "vom", "von", "vor", "w\x{E4}hrend", "wahrend", "war", "waren", "warst", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wollen", "wollte", "w\x{FC}rde", "wurde", "w\x{FC}rden", "wurden", "zu", "zum", "zur", "zwar", "zwischen", "dk", "com", "edu", "org", "news", "http", "ftp", "html", "com", "org", "net", "nntp"); 231: @german_present_tense_of_to_be = ( "bin", "bist", "ist", "sind", "seid" ); 232: @stop_words = (); 233: if ( $language_locale =~ /^EN/ ) { @stop_words = @english_stop_words; } 234: ## Many French and German discussions include English posts, so English stop words are used for them too. 235: elsif ( $language_locale =~ /^FR/ ) { push(@stop_words,@french_stop_words,@english_stop_words); } 236: elsif ( $language_locale =~ /^DE/ ) { push(@stop_words,@german_stop_words,@english_stop_words); } 237: %isa_stop_word = (); 238: my $stop_word; 239: foreach $stop_word (@stop_words) { $isa_stop_word{NFD($stop_word)} = 1; } 240: ## Connect to NNTP server if specified. Note that this connection is made once and only once. 241: ## Attempts to connect and the reconnect to the NNTP server can result in multiple, hanging 242: ## connections and, thus, ultimately a refusal by the server to download new messages. 243: if ( $nntp_server and ( $is_interactive_p eq 'YES' ) ) { 244: &print_to_log("Connecting to NNTP server $nntp_server..."); 245: eval { $nntp = Net::NNTP->new($nntp_server) }; 246: if ( $@ ) { 247: &print_to_log("ERROR: Couldn't connect to NNTP server..."); 248: return; 249: } 250: &print_to_log("Logging in to NNTP server $nntp_server..."); 251: if ( $nntp_uid and $nntp_password ) { 252: eval { $nntp->authinfo($nntp_uid,$nntp_password) }; 253: if ( $@ ) { 254: &print_to_log("ERROR: Couldn't log in to NNTP server..."); 255: return; 256: } 257: } 258: } 259: ## create the output directory 260: if (-d $output_directory) { rmtree($output_directory,0,1); } 261: mkdir($output_directory); 262: ## create a place to put the interface 263: $interface_directory = $output_directory.$slash.'Interface'; 264: mkdir($interface_directory); 265: ## copy the face images into the interface subdirectory 266: $face_images_directory = $interface_directory.$slash.'FaceImages'; 267: mkdir($face_images_directory); 268: my $fimage_file; 269: opendir FID, 'Resources'.$slash.'FaceImages' or die "Cannot open FaceImages: $!"; 270: my $number_of_face_images = 0; 271: foreach $fimage_file (readdir FID) { 272: $number_of_face_images++; 273: copy('Resources'.$slash.'FaceImages'.$slash.$fimage_file,$face_images_directory.$slash.$fimage_file); 274: } 275: closedir FID; 276: ## Three images needed for every face (left, right and front); 277: $number_of_faces = $number_of_face_images / 3; 278: ## Copy the forward and rewind button images into the Interface directory. 279: if (-e 'Resources'.$slash.'forward.gif') { 280: copy('Resources'.$slash.'forward.gif',$interface_directory.$slash.'forward.gif'); 281: } 282: else { die "Can't find GIF file for forward button\n"; } 283: if (-e 'Resources'.$slash.'rewind.gif') { 284: copy('Resources'.$slash.'rewind.gif',$interface_directory.$slash.'rewind.gif'); 285: } 286: else { die "Can't find GIF file for rewind button\n"; } 287: ## Copy the help files into the Interface directory. 288: if ( (-e 'Resources'.$slash.'help-en.html') 289: and (-e 'Resources'.$slash.'help-fr.html') 290: and (-e 'Resources'.$slash.'help-de.html') ) { 291: &rewrite_help_file('Resources'.$slash.'help-en.html',$interface_directory.$slash.'help-en.html'); 292: &rewrite_help_file('Resources'.$slash.'help-fr.html',$interface_directory.$slash.'help-fr.html'); 293: &rewrite_help_file('Resources'.$slash.'help-de.html',$interface_directory.$slash.'help-de.html'); 294: } 295: else { die "Can't find help files\n"; } 296: ## create a place to put the messages and message indices 297: $message_directory = $interface_directory.$slash.'Messages'; 298: mkdir($message_directory); 299: ## create a file name for a message index to be consulted by CGI scripts 300: $message_index_file_name = $message_directory.$slash.'message_index'; 301: ## Copy the style sheet into the output directory and the CGI directory 302: if (-e 'Resources'.$slash.'message_style.css') { 303: copy('Resources'.$slash.'message_style.css',$message_directory.$slash.'message_style.css'); 304: copy('Resources'.$slash.'message_style.css',$cgi_directory.$slash.'message_style.css'); 305: } 306: else { die "Can't find message_style.css file\n"; } 307: ## Copy the style sheet into the interface directory. 308: if (-e 'Resources'.$slash.'interface_style.css') { 309: copy('Resources'.$slash.'interface_style.css',$interface_directory.$slash.'interface_style.css'); 310: } 311: else { die "Can't find interface_style.css file\n"; } 312: } 313: ## initializations done only before the second (or later) batch of messages is processed 314: else { 315: ## Replace the interface webpages of the last batch with blank pages. 316: &create_blank_pages(); 317: ## Remove the messages from the last batch. 318: if (-d $message_directory) { rmtree($message_directory,0,1); } 319: ## Recreate a place to put the messages and message indices. 320: mkdir($message_directory); 321: copy('Resources'.$slash.'message_style.css',$message_directory.$slash.'message_style.css'); 322: &reorganize_archive(); 323: } 324: undef %analyzed_p; 325: $time_of_last_nntp_server_check = 0; 326: $current_simplified_message_id = 0; 327: $current_simplified_author_id = 0; 328: $frame_number = 0; 329: $current_simplified_sentence_id = 0; 330: $max_score = 1; 331: $min_score = 1; 332: $current_winner = 0; 333: $previous_winner = 0; 334: $previous_winning_theme = 'previous'; 335: $current_winning_theme = 'current'; 336: $current_winning_sentence = ''; 337: $social_network_index = 0; 338: $new_messages_downloaded = 0; 339: %new_social_network = (); 340: %old_social_network = (); 341: %link = (); 342: %sn2saids = (); 343: %said2sn = (); 344: %previous_sn2saids = (); 345: %said2previous_sn = (); 346: %sn2theta = (); 347: %ssid2start = (); 348: %ssid2end = (); 349: %ssid2sentence = (); 350: %ssid2smid = (); 351: %smid2ssids = (); 352: %token2stem = (); 353: %stem2tokens = (); 354: %stem2ssids = (); 355: %ssid2stems = (); 356: %citers = (); 357: %repliers = (); 358: %mid2smid = (); 359: %smid2mid = (); 360: %smid2sdt = (); 361: %ana2said = (); 362: %said2ana = (); 363: %smid2said = (); 364: %said2smids = (); 365: %smid2nsl = (); 366: %nsl2smids = (); 367: %nsl2stems = (); 368: %smid2newsgroups = (); 369: %smid2date = (); 370: %smid2subject = (); 371: %mid2pid = (); 372: %smid2spid = (); 373: %spid2smids = (); 374: %ancestors = (); 375: %smid2fptrs = (); 376: %said2lines = (); 377: %tline2smidslns = (); 378: %smidln2tline = (); 379: %signature = (); 380: %line_numbers_of_signature = (); 381: %source = (); 382: %smidln2source = (); 383: %smid2sources = (); 384: %prefix = (); 385: %said2addr = (); 386: @stop_words = (); 387: %said2theta = (); 388: %said2previous_theta = (); 389: %said2theta_movement = (); 390: %said2rho = (); 391: %said2previous_rho = (); 392: %said2rho_movement = (); 393: %said2image_id = (); 394: %said_is_speaker = (); 395: %isa_quotation = (); 396: %said2percentage_score = (); 397: %said2previous_percentage_score = (); 398: %said2score = (); 399: %said2previous_score = (); 400: %winning_sentences = (); 401: %yahooid = (); 402: 403: return; 404: } 405: 406: 407: ## REWRITE_HELP_FILE 408: ## 409: sub rewrite_help_file 410: { 411: my($template_file,$specific_file) = @_; 412: 413: my(@line,@new_line,$new_line,$word); 414: 415: open(TEMPLATE,$template_file); 416: open(HELP,'>'.$specific_file); 417: while($line = <TEMPLATE>) { 418: chomp($line); 419: @line = split /\s+/,$line; 420: @new_line = (); 421: foreach $word (@line) { 422: if ( $word =~ /(.*)Newsgroup(.*)/ ) { push(@new_line,$1.$news_group.$2); } 423: elsif ( $word =~ /(.*)PauseBetweenFetches(.*)/ ) { push(@new_line,$1.$pause.$2); } 424: else { push(@new_line,$word); } 425: } 426: $new_line = join ' ',@new_line; 427: print HELP $new_line."\n"; 428: } 429: close(TEMPLATE); 430: close(HELP); 431: } 432: 433: 434: ## REWRITE_SEND_SCRIPT_FILE 435: ## 436: sub rewrite_send_script_file 437: { 438: my($template_file,$specific_file) = @_; 439: 440: my(@line,@new_line,$new_line,$word); 441: 442: open(TEMPLATE,$template_file); 443: open(SEND,'>'.$specific_file); 444: while($line = <TEMPLATE>) { 445: chomp($line); 446: if ( $line =~ /NNTPServer/ ) { $new_line = '$nntp_server = '."'".$nntp_server."';"; } 447: elsif ( $line =~ /NNTPUID/ ) { $new_line = '$nntp_uid = '."'".$nntp_uid."';"; } 448: elsif ( $line =~ /NNTPPassword/ ) { $new_line = '$nntp_password = '."'".$nntp_password."';"; } 449: else { $new_line = $line; } 450: print SEND $new_line."\n"; 451: } 452: close(TEMPLATE); 453: close(SEND); 454: } 455: 456: 457: ## PROCESS_NEXT_MESSAGE 458: ## 459: ## Input: An integer: a pointer indicating the start of the current message in the archive. 460: ## 461: ## Effects: Several hashes are created. 462: ## 463: ## Output: An integer: a pointer indicating the current end of the file 464: ## containing the messages. 465: ## 466: ## Notes: The file handle is for a file of messages separated by __end_of_message__marker__ 467: ## Important: no whitespace can follow the __end_of_message_marker_! 468: ## The file pointer indicates where, in the file of message, reading 469: ## of the messages should begin. 470: ## 471: sub process_next_message 472: { 473: my($start) = @_; 474: 475: my($message,$message_body,$end,%field_values,$field); 476: my($marker_length,$message_id,$source_smid,@source_smids,$simplified_message_id); 477: my($simplified_author_id); 478: 479: open(ARCHIVEFILE,$archive_file_name) || die "Can't open archive file\, $archive_file_name\, $! \n"; 480: $marker_length = length($end_of_message_marker); 481: seek(ARCHIVEFILE,$start,0); 482: $/ = $end_of_message_marker; 483: $message = <ARCHIVEFILE>; 484: $end = tell(ARCHIVEFILE); 485: close(ARCHIVEFILE); 486: if ( $message ) { 487: undef %field_values; 488: %field_values = &extract_fields_and_values_from_message($message); 489: $message_body = ''; 490: if ( $message =~ m/\n\n/g ) { 491: $message_body = substr($message,pos($message),length($message)-(pos($message)+$marker_length)); 492: } 493: ## If the message is a cancel message or if it does not have an id, then skip it. 494: if ( ( defined($field_values{'control:'}) 495: and ( $field_values{'control:'} =~ /cancel/i ) ) 496: or (not defined($field_values{'message-id:'}) ) ) { 497: $frame_number--; 498: return($end); 499: } 500: ## Downcase the message-id and replace any spaces with underscores 501: $field_values{'message-id:'} = lc($field_values{'message-id:'}); 502: $field_values{'message-id:'} =~ tr/ /_/; 503: ## Make sure the message is not a duplicate of one already in the database. 504: if ( defined($analyzed_p{$field_values{'message-id:'}}) ) { 505: $frame_number--; 506: return($end); 507: } 508: $analyzed_p{$field_values{'message-id:'}} = 1; 509: $simplified_message_id = &assert_simplified_message_id($field_values{'message-id:'}); 510: $message_id = $field_values{'message-id:'}; 511: &assert_simplified_date_and_time($simplified_message_id,$field_values{'date:'}); 512: $simplified_author_id = &assert_simplified_author_id($field_values{'from:'}); 513: &assert_message_authorship($simplified_message_id,$simplified_author_id); 514: &assert_normalized_subject_line($simplified_message_id,$field_values{'subject:'}); 515: &assert_message_header($simplified_message_id, 516: $field_values{'newsgroups:'}, 517: $field_values{'date:'}, 518: $field_values{'subject:'}); 519: if (not(defined $field_values{'references:'})) { $field_values{'references:'} = ' '; } 520: if (not(defined $field_values{'in-reply-to:'})) { $field_values{'in-reply-to:'} = ' '; } 521: &assert_reference_relation($field_values{'message-id:'}, 522: $field_values{'references:'}, 523: $field_values{'in-reply-to:'}); 524: &assert_file_pointers($simplified_message_id,$start,$end); 525: &assert_message_body_authorship($simplified_author_id,$simplified_message_id,$message_body); 526: &assert_threading_relations($message_id); 527: &assert_line_positions($simplified_message_id); 528: &assert_signature($simplified_author_id); 529: &assert_sources($simplified_message_id); 530: &assert_sentences($simplified_message_id); 531: &write_html_for_message($simplified_message_id); 532: if (defined($smid2sources{$simplified_message_id}) ) { 533: @source_smids = @{$smid2sources{$simplified_message_id}}; 534: } 535: else { @source_smids = (); } 536: push(@source_smids,$smid2spid{$simplified_message_id}); 537: foreach $source_smid (@source_smids) { &write_html_for_message($source_smid); } 538: &update_social_networks($simplified_message_id,@source_smids); 539: &update_scores(); 540: &update_themes($simplified_message_id); 541: &write_html_for_interface($simplified_message_id,@source_smids); 542: &write_entry_in_message_index($simplified_message_id); 543: return($end); 544: } 545: else { return($start); } 546: } 547: 548: 549: ## UPDATE_SOCIAL_NETWORKS 550: ## 551: ## Input: A simplified message id (integer) and a list of other 552: ## simplified message ids (integers) that represent either 553: ## the messages quoted or replied to by the first message. 554: ## 555: ## Effects: Three hashes are updates: %link, %social_network2saids, 556: ## and %said2sn. The elements $link(A,B) and 557: ## $link(B,A) are added if B previously cited or replied to 558: ## A, and now, in the current message A has either cited or 559: ## replied to B. (This information is stored in the 560: ## hashes %repliers and %citers updated in other functions.) 561: ## If any new links are added, the social networks for A and 562: ## B are checked. If they are the same, then nothing is done. 563: ## If they are different (and, thus, the new link constitutes 564: ## a bridge between the two social networks), the smaller of 565: ## the two social networks is dissolved and its elements are 566: ## added into the larger social network. 567: ## 568: ## Output: none 569: ## 570: sub update_social_networks 571: { 572: my($smid,@source_smids) = @_; 573: 574: my($said,$s_said,$weight_from_said,$weight_from_s_said,@new_links); 575: my($acquiring_social_network,$disappearing_social_network,$ss); 576: my($isa_member_of_new_social_network,$sni); 577: 578: ## For each player, save their current social network. The update 579: ## may change some of all of the players' social networks. The 580: ## previous positions of the players is necessary for updating 581: ## the interface. 582: undef %said2previous_sn; 583: undef %previous_sn2saids; 584: foreach $ss (keys %said2sn) { 585: $said2previous_sn{$ss} = $said2sn{$ss}; 586: ${%{$previous_sn2saids{$said2sn{$ss}}}}{$ss} = 1; 587: } 588: $said = $smid2said{$smid}; 589: ## If the author of the current message is appearing for the first 590: ## time, s/he will constitute a separate new social network. 591: ## Since it is not possible for one to join another 592: ## network with just one post, the new network will not be 593: ## joined to any existing networks and, consequently, the subsequent 594: ## code can be skipped. 595: $isa_member_of_new_social_network = 0; 596: foreach $sni (keys %new_social_network) { 597: if ( ${%{$sn2saids{$sni}}}{$said} ) { 598: $isa_member_of_new_social_network = 1; 599: last; 600: } 601: } 602: if ( $isa_member_of_new_social_network ) { return; } 603: ## Otherwise, the new message comes from someone who is already in an 604: ## existing social network. Test to see if the message constitutes 605: ## a new link between two, as yet separate, social networks. 606: else { 607: @new_links = (); 608: if ( defined($smid2said{$smid}) 609: and defined($said2sn{$said}) ) { 610: foreach $s_smid (@source_smids) { 611: $s_said = $smid2said{$s_smid}; 612: ## If the author of the cited or replied to message 613: ## has ever cited or replied to the author of the 614: ## current message, then the two authors should be 615: ## linked together in a social network. 616: if ( defined($smid2said{$s_smid}) 617: and ( defined(${%{$citers{$said}}}{$s_said}) 618: or defined(${%{$repliers{$said}}}{$s_said}) ) ) { 619: if ( not(defined($link{$said,$s_said})) ) { push(@new_links,$s_said); } 620: $weight_from_s_said = 0; 621: if ( defined(${%{$citers{$said}}}{$s_said}) ) { 622: $weight_from_s_said += ${%{$citers{$said}}}{$s_said}; 623: } 624: if ( defined(${%{$repliers{$said}}}{$s_said}) ) { 625: $weight_from_s_said += ${%{$repliers{$said}}}{$s_said}; 626: } 627: $weight_from_said = 0; 628: if ( defined(${%{$citers{$s_said}}}{$said}) ) { 629: $weight_from_said += ${%{$citers{$s_said}}}{$said}; 630: } 631: if ( defined(${%{$repliers{$s_said}}}{$said}) ) { 632: $weight_from_said += ${%{$repliers{$s_said}}}{$said}; 633: } 634: if ( $weight_from_s_said < $weight_from_said ) { 635: $link{$said,$s_said} = $weight_from_s_said; 636: $link{$s_said,$said} = $weight_from_s_said; 637: } 638: else { 639: $link{$said,$s_said} = $weight_from_said; 640: $link{$s_said,$said} = $weight_from_said; 641: } 642: } 643: } 644: foreach $s_said (@new_links) { 645: if ( defined($said2sn{$s_said}) 646: and ( $said2sn{$s_said} != $said2sn{$said} ) ) { 647: ## The smaller social network joins the larger one and the small 648: ## network is dissolved. 649: if ( scalar(keys %{$sn2saids{$said2sn{$said}}}) 650: > scalar(keys %{$sn2saids{$said2sn{$s_said}}}) ) { 651: $acquiring_social_network = $said2sn{$said}; 652: $disappearing_social_network = $said2sn{$s_said}; 653: } 654: else { 655: $acquiring_social_network = $said2sn{$s_said}; 656: $disappearing_social_network = $said2sn{$said}; 657: } 658: foreach $ss (keys %{$sn2saids{$disappearing_social_network}}) { 659: $said2sn{$ss} = $acquiring_social_network; 660: ${%{$sn2saids{$acquiring_social_network}}}{$ss} = 1; 661: } 662: delete($sn2saids{$disappearing_social_network}); 663: ## Note: these might chain: e.g., network 1 might disappear into 664: ## network 2; and, then network 2 disappear into network 3; etc. 665: ## all in one move (i.e., all after the posting of just one 666: ## message if it cites and/or replies to multiple messages 667: ## from multiple social networks. 668: $old_social_network{$disappearing_social_network} = $acquiring_social_network; 669: } 670: } 671: &update_old_social_networks(); 672: } 673: return; 674: } 675: } 676: 677: 678: ## UPDATE_OLD_SOCIAL_NETWORKS 679: ## 680: sub update_old_social_networks 681: { 682: my($acquiring_social_network,$osn); 683: 684: ## If any social networks are being merged into other social networks, 685: ## make sure that none of the acquiring networks (i.e., those networks 686: ## that will absorb the old networks) are, themselves old networks. This 687: ## might happen, because the merging process can "chain." see the note in the 688: ## update_social_networks subroutine. Thus, the ultimate acquiring network 689: ## will be that one that does not appear as an old network. 690: if (keys %old_social_network) { 691: $acquiring_social_network = -1; 692: foreach $osn (keys %old_social_network) { 693: if ( not(defined($old_social_network{$old_social_network{$osn}})) ) { 694: $acquiring_social_network = $old_social_network{$osn}; 695: last; 696: } 697: } 698: if ( $acquiring_social_network != -1 ) { 699: ## Rewrite the %old_social_network hash so that 700: ## every old network points to the final acquiring network. 701: ## Note that, since this is updated after every message 702: ## posted to the group, we need only be concerned with what 703: ## can happen to the social networks with the addition of 704: ## one message and one author (i.e., the player posting the 705: ## message). Since a player can only be a member of one 706: ## social network there can only be, in principle, one 707: ## acquiring network, even if the one post dissolves 708: ## the distinctions between several existing networks. 709: foreach $osn (keys %old_social_network) { 710: $old_social_network{$osn} = $acquiring_social_network; 711: } 712: } 713: } 714: return; 715: } 716: 717: 718: ## UPDATE_THEMES 719: ## 720: sub update_themes 721: { 722: my($current_smid) = @_; 723: 724: my($current_said,$sni,%subject_stem2saids,$smid,$said,$stem,%saids,$ssid); 725: my(@shared_subject_stems,%sentence_stem2saids,@shared_sentence_stems); 726: 727: $current_said = $smid2said{$current_smid}; 728: $sni = $said2sn{$current_said}; 729: ## If the poster of the current message is not a member of 730: ## a social network, then return. 731: if ( scalar(keys %{$sn2saids{$sni}}) <= 1 ) { 732: $current_winning_theme = ''; 733: $current_winning_sentence = ''; 734: return; 735: } 736: undef %subject_stem2saids; 737: ## Otherwise, examine all of the subject lines used in the thread from the 738: ## current message back to the first post in the thread. 739: foreach $smid (&smids_of_ancestors($current_smid)) { 740: if ( defined($smid2said{$smid}) ) { 741: $said = $smid2said{$smid}; 742: if ( defined($smid2nsl{$smid}) 743: and defined ($nsl2stems{$smid2nsl{$smid}}) ) { 744: foreach $stem (@{$nsl2stems{$smid2nsl{$smid}}}) { 745: push(@{$subject_stem2saids{$stem}},$said); 746: } 747: } 748: } 749: } 750: ## Remove the duplicates in the %subject_stem2saids entries. 751: foreach $stem (keys %subject_stem2saids) { 752: undef %saids; 753: foreach $said (@{$subject_stem2saids{$stem}}) { $saids{$said} = 1; } 754: @{$subject_stem2saids{$stem}} = keys %saids; 755: } 756: @shared_subject_stems = (); 757: foreach $stem (keys %subject_stem2saids) { push(@shared_subject_stems,$stem); } 758: ## After the sort below, the first shared stem should be the stem employed by the 759: ## greatest number of people in the thread in the subject line of their messages. 760: @shared_subject_stems = sort { $#{$subject_stem2saids{$b}} <=> $#{$subject_stem2saids{$a}} } @shared_subject_stems; 761: ## Hunt for a sentence in the current post that contains one of the frequent 762: ## terms from the subject lines of the thread. 763: foreach $stem (@shared_subject_stems) { 764: foreach $ssid (@{$smid2ssids{$current_smid}}) { 765: if (not $isa_quotation{$ssid}) { 766: if ( grep { $_ eq $stem } @{$ssid2stems{$ssid}} ) { 767: if ( defined($stem2tokens{$stem}) ) { 768: $current_winning_theme = ${@{$stem2tokens{$stem}}}[0]; 769: } 770: else { $current_winning_theme = $stem; } 771: $current_winning_sentence = $ssid2sentence{$ssid}; 772: push(@{$winning_sentences{$stem}},$ssid); 773: return; 774: } 775: } 776: } 777: } 778: ## If no winning sentence was found that might represent one of the themes 779: ## mentioned in the subject lines, then look into the body of the messages 780: ## for themes not mentioned in the subject lines and try again to find a 781: ## winning sentence. 782: undef %sentence_stem2saids; 783: ## Examine all of the words used in the thread from the 784: ## current message back to the first post in the thread. 785: foreach $smid (&smids_of_ancestors($current_smid)) { 786: if ( defined($smid2said{$smid}) ) { 787: $said = $smid2said{$smid}; 788: foreach $ssid (@{$smid2ssids{$smid}}) { 789: if (not $isa_quotation{$ssid}) { 790: foreach $stem (@{$ssid2stems{$ssid}}) { 791: push(@{$sentence_stem2saids{$stem}},$said); 792: } 793: } 794: } 795: } 796: } 797: ## Remove the duplicates in the %sentence_stem2saids entries. 798: foreach $stem (keys %sentence_stem2saids) { 799: undef %saids; 800: foreach $said (@{$sentence_stem2saids{$stem}}) { $saids{$said} = 1; } 801: @{$sentence_stem2saids{$stem}} = keys %saids; 802: } 803: @shared_sentence_stems = (); 804: foreach $stem (keys %sentence_stem2saids) { push(@shared_sentence_stems,$stem); } 805: ## After the sort below, the first shared stem should be the stem employed by the 806: ## greatest number of people in the thread in their own sentences. 807: @shared_sentence_stems = sort { $#{$sentence_stem2saids{$b}} <=> $#{$sentence_stem2saids{$a}} } @shared_sentence_stems; 808: ## Hunt for a sentence in the current post that contains one of the frequent 809: ## terms from the subject lines of the thread. 810: foreach $stem (@shared_sentence_stems) { 811: foreach $ssid (@{$smid2ssids{$current_smid}}) { 812: if (not $isa_quotation{$ssid}) { 813: if ( grep { $_ eq $stem } @{$ssid2stems{$ssid}} ) { 814: if ( defined($stem2tokens{$stem}) ) { 815: $current_winning_theme = ${@{$stem2tokens{$stem}}}[0]; 816: } 817: else { $current_winning_theme = $stem; } 818: $current_winning_sentence = $ssid2sentence{$ssid}; 819: push(@{$winning_sentences{$stem}},$ssid); 820: return; 821: } 822: } 823: } 824: } 825: ## Otherwise, nothing was found and the winning theme and sentence are 826: ## set to nil. 827: $current_winning_theme = ''; 828: $current_winning_sentence = ''; 829: return; 830: } 831: 832: 833: ## ASSERT_SENTENCES 834: ## 835: sub assert_sentences 836: { 837: my($smid) = @_; 838: 839: my(@signature_lines,$message_text,$ln,$sentences,$sentence); 840: my($token,@sentence,@modified_sentence,@line_numbers,$said,$stem); 841: my(%stem_of_modified_sentence,$ssid); 842: 843: ## Find which lines are signature lines: these will be skipped. 844: @signature_lines = sort {$a <=> $b} &find_line_numbers_of_signature($smid); 845: $message_text = ''; 846: $ln = 0; 847: while (exists $smidln2line{"$smid $ln"}) { 848: if ( not(grep {$_ == $ln} @signature_lines) ) { 849: $message_text .= ' __line_number_'.$ln.'__ '; 850: if (defined($smidln2tline{"$smid $ln"})) { 851: $message_text .= $smidln2tline{"$smid $ln"}; 852: } 853: } 854: $ln++; 855: } 856: ## Get the sentences from the text of the message body. 857: $sentences=get_sentences($message_text); 858: $ln = 0; 859: foreach $sentence (@{$sentences}) { 860: $current_simplified_sentence_id++; 861: $ssid = $current_simplified_sentence_id; 862: $ssid2smid{$ssid} = $smid; 863: push(@{$smid2ssids{$smid}},$ssid); 864: @sentence = split /\s+/,$sentence; 865: @modified_sentence = (); 866: @line_numbers = $ln; 867: foreach $token (@sentence) { 868: if ( $token =~ /__line_number_(\d+)__/ ) { 869: push(@line_numbers,$1); 870: } 871: else { push(@modified_sentence,$token); } 872: } 873: $ssid2sentence{$ssid} = join ' ',@modified_sentence; 874: @line_numbers = sort {$a <=> $b} @line_numbers; 875: ## assert on which lines the sentence starts and ends 876: $ssid2start{$ssid} = $line_numbers[0]; 877: $ssid2end{$ssid} = $line_numbers[]; 878: ## determine whether or not the sentence is a quotation 879: $isa_quotation{$ssid} = 0; 880: foreach $ln (@line_numbers) { 881: if ( $smidln2source{"$smid $ln"} ne "$smid $ln" ) { $isa_quotation{$ssid} = 1; } 882: } 883: $ln = $ssid2end{$ssid}; 884: @modified_sentence = &normalize_sentence(join ' ',@modified_sentence); 885: $said = $smid2said{$smid}; 886: undef %stem_of_modified_sentence; 887: foreach $stem (@modified_sentence) { 888: ## Only count a stem once per sentence. 889: if ( not(defined($stem_of_modified_sentence{$stem})) ) { 890: $stem_of_modified_sentence{$stem} = 1; 891: push(@{$stem2ssids{$stem}},$ssid); 892: push(@{$ssid2stems{$ssid}},$stem); 893: } 894: } 895: } 896: return; 897: } 898: 899: 900: ## EXTRACT_FIELDS_AND_VALUES_FROM_MESSAGE 901: ## 902: ## Input: A string containing a message formatted according to 903: ## RFC1036. 904: ## 905: ## Effects: Fill the hash %field_values; the keys of the hash are 906: ## RFC1036-defined fields. 907: ## 908: ## Output: A hash: %field_values 909: ## 910: sub extract_fields_and_values_from_message 911: { 912: my($message) = @_; 913: 914: my($field,$message_header,%field_values,$input_record_separator); 915: my($message_header_line,@message_header_lines); 916: my(@rewritten_message_header_lines,$last_line); 917: my @rfc1036_fields = ( 918: 'from:', 919: 'subject:', 920: 'date:', 921: 'message-id:', 922: 'control:', 923: 'references:', 924: 'in-reply-to:', 925: 'organization:', 926: 'newsgroups:', 927: 'content-type:' 928: ); 929: $message_header = ''; 930: if ( $message =~ m/\n\n/g ) { $message_header = substr($message,0,pos($message)); } 931: ## Make sure that the input record separator is set to "\n" 932: $input_record_separator = $/; 933: $/ = "\n"; 934: ## Remove any newlines in the values 935: @message_header_lines = split /\n/,$message_header; 936: $last_line = ''; 937: foreach $message_header_line (@message_header_lines) { 938: if ( $message_header_line =~ /^(\S+\:).*$/ ) { 939: if ( $last_line ) { push(@rewritten_message_header_lines,$last_line); } 940: $last_line = $message_header_line; 941: } 942: else { $last_line .= $message_header_line; } 943: } 944: if ( $last_line ) { push(@rewritten_message_header_lines,$last_line); } 945: foreach $field (@rfc1036_fields) { 946: foreach $message_header_line (@rewritten_message_header_lines) { 947: if ($message_header_line =~ /^$field\s*(.*)\s*$/i) { 948: $field_values{$field} = $1; 949: last; 950: } 951: } 952: } 953: $/ = $input_record_separator; 954: return (%field_values); 955: } 956: 957: 958: ## ASSERT_SIMPLIFIED_MESSAGE_ID 959: ## 960: ## Input: A string: message_id 961: ## 962: ## Effects: Update the two hashes %mid2smid and %smid2mid 963: ## message_id <--> simplified_message_id 964: ## message_ids are the ones that appear in the messages; 965: ## simplified_message_ids are integers 966: ## If message_id is a new id, then the variable, 967: ## $current_simplified_message_id, is incremented. 968: ## 969: ## Output: An integer: the current value of the variable 970: ## $current_simplified_message_id 971: ## 972: sub assert_simplified_message_id 973: { 974: my($message_id) = @_; 975: 976: if (not(defined $mid2smid{$message_id})) { 977: $current_simplified_message_id++; 978: $mid2smid{$message_id} = $current_simplified_message_id; 979: $smid2mid{$current_simplified_message_id} = $message_id; 980: } 981: return($current_simplified_message_id); 982: } 983: 984: 985: ## ASSERT_SIMPLIFIED_DATE_AND_TIME 986: ## 987: ## Input: Two strings: simplified_message_id 988: ## and unsimplified_date_and_time 989: ## 990: ## Effects: Update the hash %smid2sdt 991: ## simplified_message_id --> simplified_date_and_time 992: ## simplified_date_and_time is an integer indicating 993: ## the date and time when the message was posted 994: ## (measured in seconds since the epoch, where the 995: ## epoch is 1/1/1970 GMT). 996: ## 997: ## Output: None 998: ## 999: ## Notes: This function uses the Date::Manip package; 1000: ## specifically the methods ParseDate() and UnixDate(). 1001: ## 1002: sub assert_simplified_date_and_time 1003: { 1004: my($simplified_message_id,$unsimplified_date_and_time) = @_; 1005: 1006: my($date,$seconds); 1007: 1008: $seconds = -1; 1009: if (not(exists $smid2sdt{$simplified_message_id})) { 1010: $date = ParseDate($unsimplified_date_and_time); 1011: if (!$date) { 1012: print STDERR "Unparsable date: $unsimplified_date_and_time\n"; 1013: } 1014: else { 1015: $seconds = &UnixDate($date,"%s"); 1016: $smid2sdt{$simplified_message_id} = $seconds; 1017: } 1018: } 1019: return; 1020: } 1021: 1022: 1023: ## ASSERT_SIMPLIFIED_AUTHOR_ID 1024: ## 1025: ## Input: A string: an author name and/or email address; 1026: ## 1027: ## Effects: Update the %ana2said and %said2ana hashes: 1028: ## author_name_and_address <--> simplified_author_id 1029: ## simplified_author_ids are integers 1030: ## If author is not in the database, then the variable, 1031: ## $current_simplified_author_id, is incremented. 1032: ## 1033: ## Output: An integer: the current value of the variable 1034: ## $current_simplified_author_id, or the 1035: ## simplified author id previously assigned to the 1036: ## given author name. 1037: ## 1038: ## Notes: The author name and/or email address is unmimed using 1039: ## the method unmime from the package MIME::WordDecoder. 1040: ## Thus, any subsequent fetches from the database assume 1041: ## that all RFC-1522 encoded words in the author name and address 1042: ## have been decoded to the local representation (which 1043: ## should be UTF-8 characters). Note also that all quotation 1044: ## marks are removed from the author's name and address. 1045: ## Also, some ISPs or email clients add a unique id to the 1046: ## author's email address with each post 1047: ## (e.g., wsack+123456@media.mit.edu). The unique id is 1048: ## stripped out of the email address. 1049: ## 1050: sub assert_simplified_author_id 1051: { 1052: my($author_name_and_address) = @_; 1053: 1054: my($unmimed_author_name_and_address); 1055: 1056: $unmimed_author_name_and_address = unmime($author_name_and_address); 1057: ## Delete quotation marks. 1058: $unmimed_author_name_and_address =~ s/"//g; 1059: ## Delete any trailing whitespace. 1060: $unmimed_author_name_and_address =~ s/\p{IsSpace}+$//; 1061: ## Delete any unique id added by ISP 1062: if ( $unmimed_author_name_and_address =~ /^([^+]+)\+[0-9]+@(.+)$/ ) { 1063: $unmimed_author_name_and_address = $1.'@'.$2; 1064: } 1065: if (not(exists $ana2said{$unmimed_author_name_and_address})) { 1066: $current_simplified_author_id++; 1067: $ana2said{$unmimed_author_name_and_address} = $current_simplified_author_id; 1068: $said2ana{$current_simplified_author_id} = $unmimed_author_name_and_address; 1069: &assert_said2addr($current_simplified_author_id,$unmimed_author_name_and_address); 1070: return($current_simplified_author_id); 1071: } 1072: else { return($ana2said{$unmimed_author_name_and_address}); } 1073: } 1074: 1075: 1076: ## ASSERT_SAID2ADDR 1077: ## 1078: ## Input: An email name and address 1079: ## 1080: ## Effects: Store an abbreviated form of the address in the hash %said2addr 1081: ## 1082: sub assert_said2addr 1083: { 1084: my($said,$ana) = @_; 1085: 1086: my($user_name); 1087: 1088: ($user_name) = split /@/,$ana; 1089: $user_name =~ /^(.*)(<)(.*)$/; ## Find the last (rightmost) '<' on the line. 1090: if ($2) { $user_name = $1; } ## Delete everything after the last '<'; and, 1091: $said2addr{$said} = $user_name; 1092: return; 1093: } 1094: 1095: 1096: ## ASSERT_MESSAGE_AUTHORSHIP 1097: ## 1098: ## Input: Two strings: (1) a simplified message id; and, 1099: ## (2) a simplified author id 1100: ## 1101: ## Effects: Update the hashes %smid2said and %said2smids 1102: ## [simplified_message_id]* <--> simplified_author_id 1103: ## These hashes keep track of (a) the author of any given 1104: ## message; and, (b) the ids of all of the messages 1105: ## authored by any given poster. If the author 1106: ## is new, assign the author to a social_network. 1107: ## 1108: ## Output: None 1109: ## 1110: sub assert_message_authorship 1111: { 1112: my($simplified_message_id,$simplified_author_id) = @_; 1113: 1114: if (not(exists $smid2said{$simplified_message_id})) { 1115: $smid2said{$simplified_message_id} = $simplified_author_id; 1116: push(@{$said2smids{$simplified_author_id}},$simplified_message_id); 1117: } 1118: if (not(defined($said2sn{$simplified_author_id}))) { 1119: $social_network_index++; 1120: $said2sn{$simplified_author_id} = $social_network_index; 1121: ${%{$sn2saids{$social_network_index}}}{$simplified_author_id} = 1; 1122: $new_social_network{$social_network_index} = 1; 1123: } 1124: return; 1125: } 1126: 1127: 1128: ## ASSERT_MESSAGE_HEADER 1129: ## 1130: ## Input: One integer and three strings: a simplified message id, 1131: ## and the newsgroups, date, and subject fields 1132: ## from a message header. These are the fields unrecorded 1133: ## in other hashes. 1134: ## 1135: ## Effects: Parts of the text of the message header is saved so 1136: ## that the message can be more easily printed. 1137: ## 1138: ## Output: None 1139: ## 1140: sub assert_message_header 1141: { 1142: my($smid,$newsgroups,$date,$subject) = @_; 1143: 1144: $smid2newsgroups{$smid} = $newsgroups; 1145: $smid2date{$smid} = $date; 1146: $smid2subject{$smid} = unmime($subject); 1147: return; 1148: } 1149: 1150: 1151: ## DELETE_PREFIXES_FROM_SUBJECT_LINE 1152: ## 1153: ## Input: String in which all characters have been normalized and downcased. 1154: ## 1155: ## Output: String in which all prefixing "re: " and "fwd: " substrings have 1156: ## been deleted. 1157: ## 1158: sub delete_prefixes_from_subject_line 1159: { 1160: my($subject_line) = @_; 1161: 1162: $subject_line =~ s/re\: //gi; 1163: $subject_line =~ s/re\://gi; 1164: $subject_line =~ s/fw\: //gi; 1165: $subject_line =~ s/fw\://gi; 1166: $subject_line =~ s/fwd\: //gi; 1167: $subject_line =~ s/fwd\://gi; 1168: return $subject_line; 1169: } 1170: 1171: 1172: ## TOKENIZE 1173: ## 1174: ## Input: A string 1175: ## 1176: ## Output: An array of words. Words, contractions and hyphenations 1177: ## in the input string are separated and punctuation is removed. 1178: ## 1179: sub tokenize 1180: { 1181: my($s) = @_; 1182: 1183: $s =~ s/\P{Alnum}/ /g; 1184: return(split /\P{Alnum}+/,$s); 1185: } 1186: 1187: 1188: ## NORMALIZE_SENTENCE 1189: ## 1190: ## Input: A string 1191: ## 1192: ## Output: A string of stemmed, tokenized words with all punctuation 1193: ## and stop words removed. 1194: ## 1195: ## Notes: Words in the input string are stemmed using the Lingua::Stem 1196: ## package. The language assumed by the stemmer must be 1197: ## specified on the command line: EN-UK, EN-US, FR, and DE are 1198: ## all possibilities. The language specified on the command 1199: ## line also determines which stop-word list is used. 1200: ## 1201: sub normalize_sentence 1202: { 1203: my($sentence) = @_; 1204: 1205: my($stemmed_words,$stem,@normalized_sentence,$normalized_sentence); 1206: my(@words,$i); 1207: 1208: foreach $token (&tokenize($sentence)) { 1209: $token = lc($token); 1210: if (not(exists $isa_stop_word{NFD($token)})) { push(@words,$token); } 1211: } 1212: $stemmed_words = $stemmer->stem(@words); 1213: $i = 0; 1214: foreach $stem (@{$stemmed_words}) { 1215: if ($stem !~ /^\s*$/) { 1216: push(@normalized_sentence,$stem); 1217: $token2stem{$words[$i]} = $stem; 1218: if ( not(grep {$_ eq $words[$i]} @{$stem2tokens{$stem}}) ) { 1219: push(@{$stem2tokens{$stem}},$words[$i]); 1220: } 1221: } 1222: $i++; 1223: } 1224: return(@normalized_sentence); 1225: } 1226: 1227: 1228: ## ASSERT_NORMALIZED_SUBJECT_LINE 1229: ## 1230: ## Input: An integer and a string: (1) a simplified message id; and, 1231: ## (2) the subject line of the message. 1232: ## 1233: ## Effects: Update the hashes %smid2nsl and %nsl2smids 1234: ## [simplified_message_id]* <--> normalized_subject_line 1235: ## simplified_message_ids map to one normalized_subject_line; 1236: ## normalized_subject_lines map to a list of simplified_message_ids 1237: ## in the normalized_subject_lines all characters are normalized 1238: ## to UTF8 format; all the words have been stemmed; 1239: ## all punctuation marks and stopwords have been removed; 1240: ## and, all "RE: ", "FWD: ", etc. prefixes have been removed. 1241: ## Also update the hash %nsl2stems so that the contents of 1242: ## the subject lines can be compared to the tokenized and stemmed 1243: ## contents of the message bodies. 1244: ## 1245: ## Output: None 1246: ## 1247: ## Notes: "RE: " and "FWD: " prefixes are removed and the unmime method 1248: ## from the MIME::WordDecoder package is applied to the words in 1249: ## the subject line before the words are tokenized and stemmed by 1250: ## the &normalize_sentence function. 1251: ## 1252: sub assert_normalized_subject_line 1253: { 1254: my($simplified_message_id,$subject_line) = @_; 1255: 1256: if (not(exists $smid2nsl{$simplified_message_id})) { 1257: $subject_line = &delete_prefixes_from_subject_line(unmime($subject_line)); 1258: $smid2nsl{$simplified_message_id} = $subject_line; 1259: push(@{$nsl2smids{$subject_line}},$simplified_message_id); 1260: if ( not(defined($nsl2stems{$subject_line})) ) { 1261: @{$nsl2stems{$subject_line}} = &normalize_sentence($subject_line); 1262: } 1263: } 1264: return; 1265: } 1266: 1267: 1268: ## ASSERT_REFERENCE_RELATION 1269: ## 1270: ## Input: Three strings: one (unsimplified) message id and a space- or comma-delimited 1271: ## list of unsimplified message ids (the contents of the 'references:' field); 1272: ## and/or a single message id (the contents of the 'in-reply-to:' field) 1273: ## 1274: ## Effects: One hash is updated, %mid2pid, message_id --> parent message id. If the 1275: ## first message id is not the id of a message that is a reply to another 1276: ## message, then its parent message id will be 'no_parent'. 1277: ## 1278: ## Output: None 1279: ## 1280: ## Notes: Since some messages may be replies to others not in the corpus, the 1281: ## current message might be a reply to a message not yet processed. So, 1282: ## we assert the threading information here with the full message ids and 1283: ## then later, in &assert_threading_relations, the same information is 1284: ## reasserted using simplified message ids. Note that some messages use the 1285: ## 'in-reply-to:' field instead of the 'references:' field to record threading 1286: ## information. All message ids are coerced to lowercase before they are 1287: ## recorded. 1288: ## 1289: sub assert_reference_relation 1290: { 1291: my($message_id,$references_list,$in_reply_to) = @_; 1292: 1293: my(@references,$parent_id); 1294: 1295: $parent_id = 'no_parent'; 1296: if ( ( $references_list =~ /^\s*$/ ) and ( $in_reply_to =~ /^\s*$/ ) ) { 1297: $mid2pid{$message_id} = $parent_id; 1298: } 1299: elsif ( $references_list !~ /^\s*$/ ) { 1300: @references = split /\s+/,$references_list; 1301: ## Some reference lists are comma- rather than space-delimited 1302: if ( == 0 ) { 1303: $comma = ','; 1304: @references = split /$comma/,$references_list; 1305: } 1306: $parent_id = $references[]; 1307: if ( $parent_id =~ /^\s*$/ ) { $parent_id = 'no_parent'; } 1308: $mid2pid{$message_id} = lc($parent_id); 1309: } 1310: ## else if there is a 'in-reply-to:' field, but not a 'references:' field, 1311: ## fill in the references with the contents of the 'in-reply-to:' field. 1312: else { 1313: ## Assume that the parent_id is that token in the field that is 1314: ## bracketed by '<' and '>'. If this is not the case, this 1315: ## will not yield the right result. 1316: $in_reply_to =~ /(\<[^\>]+\>)/; 1317: $parent_id = $1; 1318: if ( $parent_id =~ /^\s*$/ ) { 1319: print STDERR "Could not find message id in in-reply-to field: $in_reply_to.\n"; 1320: $parent_id = 'no_parent'; 1321: } 1322: $mid2pid{$message_id} = lc($parent_id); 1323: } 1324: return; 1325: } 1326: 1327: 1328: ## ASSERT_THREADING_RELATIONS 1329: ## 1330: ## Input: A string: a message id 1331: ## 1332: ## Effects: The hashes %smid2spid and %spid2smid are updated 1333: ## [simplified_message_id]* <--> simplified_parent_id 1334: ## simplified_parent_ids are simplified_message_ids 1335: ## each simplified_message_id has zero or one parent 1336: ## according to whether the message was (or was not) 1337: ## written in-reply-to another message. 1338: ## Each simplified_parent_id maps to a list of 1339: ## simplified_message_ids according to the number of 1340: ## replies written in response to the posted parent 1341: ## message. A separate hash that aggregates the smid2spid 1342: ## relations by author id is also updated here: %repliers, 1343: ## records which authors have replied to a given author. 1344: ## 1345: ## Output: None 1346: ## 1347: ## Notes: This function re-writes the information stored in 1348: ## the hash %mid2pid and computed in the function 1349: ## assert_reference_relation. 1350: ## 1351: sub assert_threading_relations 1352: { 1353: my($mid) = @_; 1354: 1355: my($smid,$pid,$spid,$said,$psaid); 1356: 1357: $smid = $mid2smid{$mid}; 1358: $pid = $mid2pid{$mid}; 1359: if ($pid ne 'no_parent') { 1360: ## Note: some parent messages may not be included in the processed corpus. 1361: ## The following line creates a simplified message id for such unincluded 1362: ## parent messages. 1363: if (not (exists $mid2smid{$pid})) { 1364: $spid = &assert_simplified_message_id($pid); 1365: $smid2sdt{$spid} = -1; 1366: $smid2said{$spid} = -1; 1367: $said2ana{-1} = ''; 1368: $smid2nsl{$spid} = ''; 1369: } 1370: $spid = $mid2smid{$pid}; 1371: if (not (exists $smid2spid{$smid})) { 1372: $smid2spid{$smid} = $spid; 1373: push(@{$spid2smids{$spid}},$smid); 1374: } 1375: if ( defined($smid2said{$spid}) 1376: and defined ($smid2said{$smid}) ) { 1377: $said = $smid2said{$smid}; 1378: $psaid = $smid2said{$spid}; 1379: if ( $said != $psaid ) { ${%{$repliers{$smid2said{$spid}}}}{$smid2said{$smid}}++; } 1380: } 1381: } 1382: ## If a message has no parent, it's entry in %smid2spid is ''. 1383: else { $smid2spid{$smid} = ''; } 1384: return; 1385: } 1386: 1387: 1388: ## ASSERT_FILE_POINTERS 1389: ## 1390: ## Input: Three integers: a simplified message id and 1391: ## two file pointers (a start and and end). 1392: ## 1393: ## Effects: The hash %smid2fptrs is updated: 1394: ## simpified_message_id --> start_and_end_file_pointers 1395: ## Since the original text of each message is stored in a file, 1396: ## each simplified_message_id has associated start and 1397: ## end file pointers according to where, in the file, 1398: ## the text of the message is located. 1399: ## 1400: ## Output: None 1401: ## 1402: sub assert_file_pointers 1403: { 1404: my($simplified_message_id,$start_pointer,$end_pointer) = @_; 1405: 1406: if (not (exists $smid2fptrs{$simplified_message_id})) { 1407: push(@{$smid2fptrs{$simplified_message_id}},$start_pointer,$end_pointer); 1408: } 1409: return; 1410: } 1411: 1412: 1413: ## ASSERT_MESSAGE_BODY_AUTHORSHIP 1414: ## 1415: ## Input: Two integers (a simplified author id and a simplified message id) 1416: ## and a string (the lines of the body of a message) 1417: ## 1418: ## Effects: One hash of hashes (%said2lines) is updated; and, another hash 1419: ## is updated: %smidln2line (this allows future, quick access to each 1420: ## of the lines of a message); 1421: ## For each author a separate hash is created to record 1422: ## every line of every message posted by the author. 1423: ## Each such hash has the following form: %line2smidslns 1424: ## line_of_text --> [simplified_message_id and line_number]* 1425: ## The text of each line of each message is stored as a key 1426: ## in this hash. The values of the hash are arrays of strings. 1427: ## Each string consists of a simplified_message_id and a line 1428: ## number concatenated together, indicting where the line of text 1429: ## appeared in the corpus of messages. These hashes are used to 1430: ## identify the signature (if any) of each author. Entries for 1431: ## each of the lines of the message_body are entered in the 1432: ## authors' hash. 1433: ## 1434: ## Output: None 1435: ## 1436: sub assert_message_body_authorship 1437: { 1438: my($said,$smid,$body) = @_; 1439: 1440: my(%line2smidslns,$line,$ln); 1441: 1442: if (defined $said2lines{$said}) { %line2smidslns = %{$said2lines{$said}}; } 1443: else { %line2smidslns = (); } 1444: $ln = 0; 1445: foreach $line (split /\n/,$body) { 1446: push(@{$line2smidslns{$line}},"$smid $ln"); 1447: $smidln2line{"$smid $ln"} = $line; 1448: $ln++; 1449: } 1450: $said2lines{$said} = \%line2smidslns; 1451: return; 1452: } 1453: 1454: 1455: ## ASSERT_LINE_POSITIONS 1456: ## 1457: ## Input: An integer: a simplified author id 1458: ## 1459: ## Effects: The hash %tline2smidslns is created from the hash %smidln2line 1460: ## truncated_line_of_text --> [simplified_message_id and line_number]* 1461: ## Each line of text is truncated at the beginning to remove any 1462: ## leading whitespace and/or punctuation, and then its position is 1463: ## recorded in a list of simplified_message_id and line_number pairs 1464: ## (each pair is recorded as a string with a space between the two 1465: ## integers). This hash created is later used to identify quotations. 1466: ## hashes and is used to identify quotations. Its inverse is also 1467: ## created: %smidln2tline. 1468: ## The hash %source is also created. The keys of %source are truncated 1469: ## lines of text (where all of the quoting prefixes have been removed). 1470: ## The values are pairs of simplified message ids and line numbers. 1471: ## This hash is also used to identify quotations. 1472: ## 1473: ## Output: None 1474: ## 1475: sub assert_line_positions 1476: { 1477: my($smid) = @_; 1478: 1479: my($said,%line2smidslns,$line,$tline,@smidslns,$smidln); 1480: my($prefix,$ln); 1481: 1482: $ln = 0; 1483: while (defined $smidln2line{"$smid $ln"}) { 1484: $line = $smidln2line{"$smid $ln"}; 1485: $tline = $line; 1486: ## Delete any leading punctuation and any trailing whitespace. 1487: $tline =~ /^(.*)(>)(.*)$/; ## Find the last (rightmost) '>' on the line. 1488: if ($2) { 1489: $tline = $3; ## Delete everything up to the last '>'; and, 1490: $prefix = $1.'>'; ## store the prefix under the smid and line number. 1491: $prefix{$smid,$ln} = $prefix; 1492: } 1493: $tline =~ s/^\p{IsSpace}+//; ## Delete any whitespace at the beginning of the line. 1494: $tline =~ s/\p{IsSpace}+$//; ## Delete any trailing whitespace. 1495: ## Repeatedly delete any whitespace or punctuation at the (possibly new) beginning of the line. 1496: for ($tline) { 1 while s/^\P{IsWord}+//; } 1497: ## If there are any non-whitespace characters in the truncated line, 1498: ## assert its line positions. 1499: if ( $tline =~ /\S/ ) { 1500: ## Use the edited line as a key to the new hash. Note that the 1501: ## hash might already contain a list for the edited line, since 1502: ## many different lines might reduce to the same truncated form. 1503: push(@{$tline2smidslns{$tline}},"$smid $ln"); 1504: $smidln2tline{"$smid $ln"} = $tline; 1505: ## Assume that the oldest messages are processed first. 1506: if (not defined($source{$tline})) { $source{$tline} = "$smid $ln"; } 1507: } 1508: $ln++; 1509: } 1510: return; 1511: } 1512: 1513: 1514: ## BY_REPETITIONS 1515: ## 1516: sub by_repetitions 1517: { 1518: $#{@{$tline2smidslns{$a}}} <=> $#{@{$tline2smidslns{$b}}}; 1519: } 1520: 1521: 1522: ## SMIDS_OF_ANCESTORS 1523: ## 1524: ## Input: An integer: a simplified message id. 1525: ## 1526: ## Effects: Cache the results in the hash %ancestors. 1527: ## 1528: ## Output: A list: the simplified message id of the 1529: ## parent, the parent's parent, etc. The hash 1530: ## %smid2spid is traversed from the input id to 1531: ## the root of the thread containing the message. 1532: ## 1533: sub smids_of_ancestors 1534: { 1535: my($smid) = @_; 1536: 1537: if ( not($smid) ) { return(); } 1538: if (not exists($ancestors{$smid})) { 1539: push(@{$ancestors{$smid}},&smids_of_ancestors($smid2spid{$smid}),$smid); 1540: } 1541: return(@{$ancestors{$smid}}); 1542: } 1543: 1544: 1545: ## ASSERT_SOURCES 1546: ## 1547: ## Input: An integer: a simplified message id 1548: ## 1549: ## Effects: Create the hash %smidln2source. 1550: ## The keys of %smidln2source are simplified message ids and line 1551: ## numbers, as are the values. %smidln2source contains the same 1552: ## information as %source (see above, in the documentation for 1553: ## the function &assert_line_positions) except it also corrects 1554: ## some of the information in %source. Aggregated counts are 1555: ## also updated in this function: %citers records which authors 1556: ## have cited others; and, %smid2sources records all of the 1557: ## messages cited in the current message. 1558: ## 1559: ## Output: None 1560: ## 1561: sub assert_sources 1562: { 1563: my($smid) = @_; 1564: 1565: my($tline,$smidln,$ln,$earliest,$i,$source_smid,$source_ln); 1566: my($i_minus_one,$i_plus_one,$prefix_above,$source_smid_above,$source_ln_above); 1567: my($prefix_below,$source_smid_below,$source_ln_below,@tline,%prefix2sourcesmid,@ancestors); 1568: my($start,%block_starts,$last_source_smid,$last_source_ln,$source,%current_sources); 1569: my(@signature_lines,$source_author,$said,$source_said); 1570: 1571: ## Correct the %source hash by examining the sources of each line and 1572: ## its surrounding context. Process the corpus message-by-message. 1573: ## Save the result in the hash %smidln2source. 1574: $ln = 0; 1575: ## find the source of each line; 1576: while (defined $smidln2line{"$smid $ln"}) { 1577: if (defined $smidln2tline{"$smid $ln"}) { $tline = $smidln2tline{"$smid $ln"}; } 1578: ## if no tline is associated with the smid-ln pair, then assume the line is blank. 1579: else { $tline = ''; } 1580: if (defined $source{$tline}) { 1581: ($source_smid,$source_ln) = $source{$tline} =~ /^(\d+) (\d+)$/; 1582: ## If a line has been assigned as a source another line in the 1583: ## the same message, then reset its source to itself. 1584: if ( $source_smid eq $smid ) { $smidln2source{"$smid $ln"} = "$smid $ln"; } 1585: else { $smidln2source{"$smid $ln"} = $source{$tline}; } 1586: } 1587: ## if no source exists, assign the source of a line to itself 1588: else { $smidln2source{"$smid $ln"} = "$smid $ln"; } 1589: $ln++; 1590: } 1591: ## Re-examine the sources of each line using the prefixes. 1592: ## If a line has a prefix and has no source, 1593: ## and the line has the same prefix as the line above, or the line below, 1594: ## and the line above or below has a source that is another message, 1595: ## then reassign the source of the line. 1596: undef %prefix2sourcesmid; 1597: for ($i = 0; $i < $ln; $i++) { 1598: $i_minus_one = $i-1; 1599: $i_plus_one = $i+1; 1600: if (defined $prefix{$smid,$i}) { 1601: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1602: if ( $source_smid == $smid ) { 1603: ## Get the prefix (if any) of the line above the current line. 1604: if ( ( $i > 0 ) and (defined $prefix{$smid,$i_minus_one}) ) { 1605: $prefix_above = $prefix{$smid,$i_minus_one}; 1606: ($source_smid_above,$source_ln_above) = $smidln2source{"$smid $i_minus_one"} =~ /^(\d+) (\d+)$/; 1607: } 1608: else { $prefix_above = ''; } 1609: ## Get the prefix (if any) of the line below the current line. 1610: if ( ( $i < ($ln - 1) ) and (defined $prefix{$smid,$i_plus_one}) ) { 1611: $prefix_below = $prefix{$smid,$i_plus_one}; 1612: ($source_smid_below,$source_ln_below) = $smidln2source{"$smid $i_plus_one"} =~ /^(\d+) (\d+)$/; 1613: } 1614: else { $prefix_below = ''; } 1615: if ( ( $prefix_above eq $prefix{$smid,$i} ) and ( $source_smid_above != $smid ) ) { 1616: $smidln2source{"$smid $i"} = $smidln2source{"$smid $i_minus_one"}; 1617: } 1618: elsif ( ( $prefix_below eq $prefix{$smid,$i} ) and ( $source_smid_below != $smid ) ) { 1619: $smidln2source{"$smid $i"} = $smidln2source{"$smid $i_plus_one"}; 1620: } 1621: } 1622: ## Keep track of which prefixes point to which cited messages. 1623: ## Only assign a prefix to source_smid if the source_smid is not 1624: ## equal to the current smid. 1625: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1626: if ( $source_smid != $smid ) { 1627: $prefix2sourcesmid{$prefix{$smid,$i}} = $source_smid; 1628: } 1629: } 1630: } 1631: ## Go through the lines again. If a line has a known prefix with a known source, 1632: ## but the line has not been assigned a source (other than itself), then assign it the 1633: ## source message associated with its prefix. Match the line against every line in the 1634: ## source message to determine the exact line in the source message. Sometimes this 1635: ## won't be possible to do, in which case the line source is simply set to 0. 1636: ## This procedure is necessary to correctly re-assign the source of lines that 1637: ## have wrapped or been truncated. 1638: for ($i = 0; $i < $ln; $i++) { 1639: if (defined $prefix{$smid,$i}) { 1640: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1641: if ( ( $source_smid == $smid ) 1642: and ( defined($prefix2sourcesmid{$prefix{$smid,$i}}) ) 1643: and ( $prefix2sourcesmid{$prefix{$smid,$i}} != $smid ) ) { 1644: $source_smid = $prefix2sourcesmid{$prefix{$smid,$i}}; 1645: ## Get lines of source message and match them against the 1646: ## current line (if the current line is long enough). 1647: $source_ln = 0; 1648: ## First, make the current line into a pattern that can be matched against. 1649: if (defined $smidln2tline{"$smid $i"}) { 1650: $tline = $smidln2tline{"$smid $i"}; 1651: @tline = split /\s+/,$tline; 1652: ## Only attempt to match the line if it is at least five tokens in length. 1653: if ( >= 5 ) { 1654: ## Delete the first two tokens from the front of the line. 1655: for ($tline) { 1 while s/^\P{IsSpace}+//; } 1656: for ($tline) { 1 while s/^\p{IsSpace}+//; } 1657: for ($tline) { 1 while s/^\P{IsSpace}+//; } 1658: for ($tline) { 1 while s/^\p{IsSpace}+//; } 1659: ## quote all the meta characters 1660: $tline = quotemeta($tline); 1661: ## Now try matching the line 1662: while (defined $smidln2line{"$source_smid $source_ln"}) { 1663: if ( $smidln2line{"$source_smid $source_ln"} =~ /$tline/ ) { last; } 1664: else { $source_ln++; } 1665: } 1666: } 1667: } 1668: ## Reassign the source 1669: $smidln2source{"$smid $i"} = "$source_smid $source_ln"; 1670: } 1671: } 1672: } 1673: ## Go through the lines one more time. If the source of a given line is 1674: ## a signature line in the sourced message, then reassign the source of 1675: ## the line to itself. 1676: for ($i = 0; $i < $ln; $i++) { 1677: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1678: @signature_lines = sort {$b <=> $a} &find_line_numbers_of_signature($source_smid); 1679: if ( grep {$_ == $source_ln} @signature_lines ) { $smidln2source{"$smid $i"} = "$smid $i"; } 1680: } 1681: ## If the current message contains at least one line, then 1682: ## gather the lines into blocks with the same source. 1683: if ( not(exists($smidln2source{"$smid 0"})) ) { return; } 1684: ($last_source_smid,$last_source_ln) = $smidln2source{"$smid 0"} =~ /^(\d+) (\d+)$/; 1685: if ( not($last_source_smid or $last_source_ln) ) { return; } 1686: undef %block_starts; 1687: $start = 0; 1688: push(@{$block_starts{$start}},0); 1689: for ($i = 1; $i < $ln; $i++) { 1690: if (defined $smidln2source{"$smid $i"}) { 1691: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1692: if ( $source_smid or $source_ln ) { 1693: if ( $source_smid == $last_source_smid ) { push(@{$block_starts{$start}},$i); } 1694: else { $start = $i; } 1695: $last_source_smid = $source_smid; 1696: $last_source_ln = $source_ln; 1697: } 1698: } 1699: } 1700: ## find the ids of the ancestors of the message; 1701: @ancestors = &smids_of_ancestors($smid); 1702: ## Look through the lines again. Only recognize non-ancestor sources if 1703: ## they are assigned to a block of lines because single, isolated lines 1704: ## with non-ancestor sources are likely to be incorrectly assigned. 1705: foreach $start (keys %block_starts) { 1706: if ( defined($smidln2source{"$smid $start"}) ) { 1707: ($source_smid,$source_ln) = $smidln2source{"$smid $start"} =~ /^(\d+) (\d+)$/; 1708: if ( $source_smid or $source_ln ) { 1709: ## if the source is itself or an ancestor, leave it; 1710: if ( grep {$_ == $source_smid} @ancestors ) { next; } 1711: ## else, if a line is by itself and has a non-ancestor source, change the source to itself. 1712: elsif ( $#{@{$block_starts{$start}}} == 0 ) { $smidln2source{"$smid $start"} = "$smid $start"; } 1713: } 1714: } 1715: } 1716: @signature_lines = sort {$b <=> $a} &find_line_numbers_of_signature($smid); 1717: ## Finally, record all of the messages cited by the current message. 1718: for ($i = 0; $i < $ln; $i++) { 1719: if ( defined($smidln2source{"$smid $i"}) ) { 1720: ($source_smid,$source_ln) = $smidln2source{"$smid $i"} =~ /^(\d+) (\d+)$/; 1721: ## A source is not counted if the source is the same as the message, 1722: ## if the line is a signature line, or if the line follows a signature line. 1723: if ( ( $source_smid != $smid ) 1724: and ( not(@signature_lines) or ( $i < $signature_lines[0] ) ) 1725: and ( not(grep {$_ == $i} @signature_lines) ) ) { 1726: push(@{$smid2sources{$smid}},$source_smid); 1727: } 1728: } 1729: } 1730: ## Remove any duplicates from the entry in %smid2sources 1731: ## Also, remove any self-citations. 1732: undef %current_sources; 1733: foreach $source (@{$smid2sources{$smid}}) { 1734: if ($smid2said{$source} != $smid2said{$smid}) { 1735: $current_sources{$source} = 1; 1736: } 1737: } 1738: @{$smid2sources{$smid}} = keys %current_sources; 1739: ## Invert %smid2sources to create %source2smids 1740: foreach $source (@{$smid2sources{$smid}}) { 1741: if ( not(grep {$_ == $smid} @{$source2smids{$source}}) ) { 1742: push(@{$source2smids{$source}},$smid); 1743: } 1744: } 1745: ## Keep track of all of the people who cite a given author 1746: ## (but ignore places where a player cites themself). 1747: if ( defined($smid2said{$smid}) ) { 1748: $said = $smid2said{$smid}; 1749: foreach $source (@{$smid2sources{$smid}}) { 1750: if ( defined($smid2said{$source}) ) { 1751: $source_said = $smid2said{$source}; 1752: if ( $said != $source_said ) { ${%{$citers{$source_said}}}{$said}++; } 1753: } 1754: } 1755: } 1756: return; 1757: } 1758: 1759: 1760: ## FIND_POTENTIAL_SIGNATURE_LINES 1761: ## 1762: ## Input: Two integers: a simplified author id and the number of 1763: ## messages posted by that author. 1764: ## 1765: ## Effects: None 1766: ## 1767: ## Output: A list of strings each of which are potentially 1768: ## lines of the author's signature. 1769: ## 1770: ## Notes: A line is a potential signature line if it occurs 1771: ## in $messages_posted messages. Normally, $messages_posted 1772: ## is equal to the total number of messages posted by the 1773: ## author, but may be a lower number if lines which only 1774: ## appear in a significant number (rather than all) of the 1775: ## author's messages are to be considered as potential 1776: ## signature lines. 1777: ## 1778: sub find_potential_signature_lines 1779: { 1780: my($said,$messages_posted) = @_; 1781: 1782: my(%line2smidslns,$line,@possible_signature_lines,@potential_signature_lines); 1783: my(%smid2lns,$smidln,$smid,$ln,@smids); 1784: 1785: %line2smidslns = %{$said2lines{$said}}; 1786: ## First find all lines with at least $messages_posted appearances. 1787: foreach $line (keys %line2smidslns) { 1788: if ( (1 + $#{@{$line2smidslns{$line}}}) >= $messages_posted) { 1789: push(@possible_signature_lines,$line); 1790: } 1791: } 1792: ## Next, for each of the possible lines, examine the message ids 1793: ## associated with it to make sure that every message posted by the 1794: ## author is represented. This subset of possible signature lines 1795: ## is stored in the variable @potential_signature_lines. 1796: foreach $line (@possible_signature_lines) { 1797: undef %smid2lns; 1798: foreach $smidln (@{$line2smidslns{$line}}) { 1799: ($smid,$ln) = $smidln =~ /^(\d+) (\d+)$/; 1800: push(@{$smid2lns{$smid}},$ln); 1801: } 1802: @smids = keys %smid2lns; 1803: if ( $messages_posted <= (1 + ) ) { 1804: push(@potential_signature_lines,$line); 1805: } 1806: } 1807: return(@potential_signature_lines); 1808: } 1809: 1810: 1811: ## FIND_SIGNATURE_LINES_IN_FIRST_MESSAGE_OF_AUTHOR 1812: ## 1813: ## Input: An integer and a list of strings: an author's simplified 1814: ## id and a list of potential signature lines. 1815: ## 1816: ## Output: A pointer to a hash containing the positions (line numbers) 1817: ## of each of the potential signature lines in the author's 1818: ## first posted message. 1819: ## 1820: sub find_signature_lines_in_first_message_of_author 1821: { 1822: my($said,@potential_signature_lines) = @_; 1823: 1824: my(%line2smidslns,$first_smid,%first_smid_lines,$line,$smidln,$smid,$ln); 1825: 1826: if ( defined($said2lines{$said}) 1827: and defined($said2smids{$said}[0]) ) { 1828: %line2smidslns = %{$said2lines{$said}}; 1829: $first_smid = $said2smids{$said}[0]; 1830: foreach $line (@potential_signature_lines) { 1831: if (defined $line2smidslns{$line}) { 1832: foreach $smidln (@{$line2smidslns{$line}}) { 1833: ($smid,$ln) = $smidln =~ /^(\d+) (\d+)$/; 1834: if ( $smid == $first_smid ) { 1835: $first_smid_lines{$ln} = $line; 1836: } 1837: } 1838: } 1839: } 1840: } 1841: return(\%first_smid_lines); 1842: } 1843: 1844: 1845: ## FIND_LINE_NUMBERS_OF_SIGNATURE 1846: ## 1847: ## Input: An integer: a simplified message id 1848: ## 1849: ## Output: An array of integers: a list of line numbers 1850: ## 1851: ## Notes: The signatures line stored under the id of the 1852: ## author of the message are matched against all 1853: ## of the lines of the message. 1854: ## 1855: sub find_line_numbers_of_signature 1856: { 1857: my($smid) = @_; 1858: 1859: my(@signature,@signature_lines,$ln,$sig_index); 1860: 1861: if (defined $line_numbers_of_signature{$smid}) { 1862: return(@{$line_numbers_of_signature{$smid}}); 1863: } 1864: $ln = 0; 1865: $sig_index = 0; 1866: if ( (defined $smid2said{$smid}) 1867: and (defined $signature{$smid2said{$smid}}) ) { 1868: @signature = @{$signature{$smid2said{$smid}}}; 1869: while (defined $smidln2line{"$smid $ln"}) { 1870: if ( $sig_index > ) { last; } 1871: elsif ( defined($smidln2line{"$smid $ln"}) 1872: and $signature[$sig_index] 1873: and ( $smidln2line{"$smid $ln"} eq $signature[$sig_index] ) ) { 1874: push(@signature_lines,$ln); 1875: $sig_index++; 1876: } 1877: else { 1878: undef @signature_lines; 1879: $sig_index = 0; 1880: } 1881: $ln++; 1882: } 1883: } 1884: ## Pad out the signature with a line above and a line below. 1885: ## This is necessary to cover cases when a line is used 1886: ## repeatedly but not always by an author. 1887: if (@signature_lines) { 1888: unshift(@signature_lines,$signature_lines[0]-1); 1889: push(@signature_lines,$signature_lines[]+1); 1890: } 1891: @{$line_numbers_of_signature{$smid}} = @signature_lines; 1892: return(@signature_lines); 1893: } 1894: 1895: 1896: ## ASSERT_SIGNATURE 1897: ## 1898: ## Input: An integer: a simplified author id 1899: ## 1900: ## Effects: The hash %signatures is built: simplified_message_id --> [line of text]* 1901: ## Signature lines are identified through an examination of the 1902: ## hashes of the hash %said2lines. The lines posted by each author are 1903: ## examined to see if any sequence of lines is repeated in every message 1904: ## posted by the author. These repeated lines, included with every message, 1905: ## are identified to be the author's signature. If there are several such 1906: ## sequences, then the longest contiguous sequences of repeated lines is 1907: ## chosen as the author's likely signature. The signature of the author 1908: ## is stored as an array of the repeated lines. 1909: ## 1910: ## Output: None 1911: ## 1912: sub assert_signature 1913: { 1914: my($said) = @_; 1915: 1916: my(%line2smidslns,$messages_posted,$line); 1917: my($smidln,$smid,$ln,@smids,@potential_signature_lines,$first_smid,@lns); 1918: my(%first_smid_lines,$first_smid_lines_ptr,%sig_starts,$start,$i,$max_score,$score,$winning_start); 1919: my($lines_in_first_message,$lines_in_signature,@signature,@truncated_signature,$longest); 1920: 1921: undef $signature{$said}; 1922: %line2smidslns = %{$said2lines{$said}}; 1923: $messages_posted = 1 + $#{@{$said2smids{$said}}}; 1924: if ( $messages_posted == 1 ) { return; } 1925: undef @potential_signature_lines; 1926: @potential_signature_lines = &find_potential_signature_lines($said,$messages_posted); 1927: ## If no potential signature lines are found, then try to find at least 1928: ## one line that appears in 40% of the author's messages, if the author 1929: ## has posted at least 10 messages. 1930: if ( ( not(@potential_signature_lines) 1931: or not(grep { $_ =~ /\S/ } @potential_signature_lines) ) 1932: and ( $messages_posted >= 10 ) ) { 1933: $messages_posted = 0.4 * $messages_posted; 1934: $longest = 0; 1935: foreach $line (&find_potential_signature_lines($said,$messages_posted)) { 1936: $line =~ s/^\P{IsWord}+//; ## Delete any whitespace beginning of the line. 1937: $line =~ s/\P{IsWord}+$//; ## Delete any trailing whitespace. 1938: if ( length($line) > $longest ) { @{$signature{$said}} = ($line); } 1939: } 1940: return; 1941: } 1942: $first_smid_lines_ptr = &find_signature_lines_in_first_message_of_author($said,@potential_signature_lines); 1943: if (not $first_smid_lines_ptr) { return; } 1944: %first_smid_lines = %{$first_smid_lines_ptr}; 1945: ## Cluster the line numbers of the potential signature lines into continguous groups of lines. 1946: ## Clusters are indexed in a hash keyed to the line number of the start of the sequence. 1947: @lns = sort {$a <=> $b} keys %first_smid_lines; 1948: undef %sig_starts; 1949: $start = $lns[0]; 1950: for ($i = 1; $i <= ; $i++) { 1951: if ( $lns[$i] != (1 + $lns[$i-1]) ) { $start = $lns[$i]; } 1952: push(@{$sig_starts{$start}},$lns[$i]); 1953: } 1954: ## For the author's signature, select the longest contiguous sequence of lines that 1955: ## contains the greatest number of lines with visible (i.e., non-whitespace) characters. 1956: $max_score = 0; 1957: $winning_start = -1; 1958: foreach $start (keys %sig_starts) { 1959: $score = 0; 1960: foreach $ln (@{$sig_starts{$start}}) { 1961: if (defined($first_smid_lines{$ln}) and ($first_smid_lines{$ln} =~ /\S/)) { $score++; } 1962: } 1963: if ( $score > $max_score ) { 1964: $max_score = $score; 1965: $winning_start = $start; 1966: } 1967: } 1968: $lines_in_signature = 1 + $#{$sig_starts{$winning_start}}; 1969: $lines_in_first_message = 0; 1970: $first_smid = $said2smids{$said}[0]; 1971: while (defined $smidln2line{"$first_smid $lines_in_first_message"}) { 1972: $lines_in_first_message++; 1973: } 1974: ## If a signature is the same length as a message, then the author is 1975: ## simply re-posting exactly the same message and does not have a 1976: ## valid signature. 1977: if ( $lines_in_signature < $lines_in_first_message ) { 1978: foreach $ln (@{$sig_starts{$winning_start}}) { 1979: push(@{$signature{$said}},$first_smid_lines{$ln}); 1980: } 1981: ## Remove any leading blank lines from the signature 1982: if (defined $signature{$said}) { 1983: @signature = @{$signature{$said}}; 1984: @truncated_signature = @{$signature{$said}}; 1985: for ($i = 0; $i <= ; $i++) { 1986: if ( $signature[$i] =~ /\S/ ) { 1987: @{$signature{$said}} = @truncated_signature; 1988: last; 1989: } 1990: else { shift(@truncated_signature); } 1991: } 1992: } 1993: ## Remove any trailing blank lines from the signature 1994: if (defined $signature{$said}) { 1995: @signature = @{$signature{$said}}; 1996: @truncated_signature = reverse(@{$signature{$said}}); 1997: for ($i = ; $i >= 0; $i--) { 1998: if ( $signature[$i] =~ /\S/ ) { 1999: @{$signature{$said}} = reverse(@truncated_signature); 2000: last; 2001: } 2002: else { shift(@truncated_signature); } 2003: } 2004: } 2005: } 2006: return; 2007: } 2008: 2009: 2010: ## UPDATE_SCORES_CENTRALITY_BY_PROXIMITY_VERSION 2011: ## 2012: ## Input: none 2013: ## 2014: ## Effects: Update every player's %said2score, and the $min_score, 2015: ## the $max_score, and $current_winner variables. 2016: ## 2017: ## Output: none 2018: ## 2019: ## Notes: Players are scored according to a social network 2020: ## measure based on proximity. The best score is 2021: ## acheived by the player who is cited by or replied 2022: ## to by every other player _and_ who, reciprocates, by 2023: ## replying to or citing every other player. 2024: ## In the social network, two players are linked if 2025: ## they have reciprocally cited and/or replied to one 2026: ## another. Counts of reciprocated quotes and replies 2027: ## are recorded and weight the links between actors in 2028: ## in the social networks. For the purposes of the scoring 2029: ## algorithm, the inverse of the link weight is used. 2030: ## Thus, a player can increase their centrality rating by 2031: ## corresponding frequently with only a few other players, 2032: ## but -- by so doing -- they increase everyone else's score 2033: ## who is connect to them or their correspondent. Also, since 2034: ## one's global score depends upon the size of one's social 2035: ## network correspondence with only one other person does not 2036: ## produce a high score. 2037: ## 2038: ## Ref: "Centralie et pourvoir," Alain Degenne et Michel Forse, 2039: ## _Les Reseaux Sociaux_ (Paris: Armand Colin, 1994). 2040: ## 2041: ## NB: This subroutine has not yet been debugged and may contain 2042: ## errors. 2043: ## 2044: sub update_scores_centrality_by_proximity_version 2045: { 2046: my($sni,$graph,@graphs,@graph_sizes,@social_network_indices,$i); 2047: my($s_said,$e_said,$sssp,$said,$total_distance,@es); 2048: 2049: @social_network_indices = sort { $a <=> $b } keys %sn2saids; 2050: ## Build a graph for every social network. 2051: foreach $sni (@social_network_indices) { 2052: $graph = new Graph::Undirected; 2053: push(@graph_sizes,scalar(keys %{$sn2saids{$sni}})); 2054: $graph->add_vertices(keys %{$sn2saids{$sni}}); 2055: foreach $s_said (keys %{$sn2saids{$sni}}) { 2056: foreach $e_said (keys %{$sn2saids{$sni}}) { 2057: if ( defined($link{$s_said,$e_said}) ) { 2058: ## A link's distance is inversely proportional to the 2059: ## number of exchanges (citations or replies) between 2060: ## the two players. 2061: $graph->add_weighted_edge($s_said,(1.0/$link{$s_said,$e_said}),$e_said); 2062: } 2063: } 2064: } 2065: push(@graphs,$graph); 2066: } 2067: ## Score each player in each social network. 2068: $i = 0; 2069: foreach $graph (@graphs) { 2070: foreach $said ($graph->vertices()) { 2071: ## Use Dijkstra's algorithm to compute the single-source shortest paths 2072: ## from each player in the social network to every other player in 2073: ## the network. 2074: $sssp = $graph->SSSP_Dijkstra($said); 2075: $total_distance = 0; 2076: @es = $sssp->edges(); 2077: while (@es) { 2078: ($s_said,$e_said,@es) = @es; 2079: $total_distance += $sssp->get_attribute('weight',$s_said,$e_said); 2080: } 2081: ## Multiply the total distance my the size of the social network so that 2082: ## centrality measures from the different social networks can be compared 2083: ## and so that centrality in a large network counts for more than centrality 2084: ## in a small network. 2085: if ( $total_distance > 0 ) { 2086: $said2score{$said} = ( ( ($graph_sizes[$i] * $graph_sizes[$i]) - $graph_sizes[$i] ) / $total_distance ); 2087: } 2088: else { $said2score{$said} = 0; } 2089: if ( $said2score{$said} > $max_score ) { 2090: $max_score = $said2score{$said}; 2091: $current_winner = $said; 2092: } 2093: if ( $said2score{$said} < $min_score ) { $min_score = $said2score{$said}; } 2094: } 2095: $i++; 2096: } 2097: } 2098: 2099: 2100: ## UPDATE_SCORES_CENTRALITY_BY_DEGREE_VERSION 2101: ## 2102: ## Input: none 2103: ## 2104: ## Effects: Update every player's %said2score, and the $min_score, 2105: ## the $max_score, and $current_winner variables. 2106: ## 2107: ## Output: none 2108: ## 2109: ## Notes: This version computes the simplest version of 2110: ## centrality for social networks: centrality is 2111: ## computed as the mnumber of other players each 2112: ## player is connected to. Consequently, with this 2113: ## version, players win points by corresponding 2114: ## with new people in the discussion, but gain no 2115: ## points for continuing to correspond with people 2116: ## they have already been talking with. 2117: ## Actually, this implementation is a bit more 2118: ## nuanced. The first time a pair of players 2119: ## cites and/or replies to one another, they 2120: ## received full credit; the second time they do so, 2121: ## they receive an additional 90% of the initial 2122: ## points; the third time, 80% of the initial points; 2123: ## etc., until the tenth and subsequent exchanges 2124: ## yield them no points. However, since posts to the 2125: ## group will "fade out" after N (e.g., 500) new 2126: ## subsequent posts have been made, a pair of players 2127: ## will need to sustain their exchange in order to 2128: ## keep all of their possible points. 2129: ## 2130: sub update_scores_centrality_by_degree_version 2131: { 2132: my($sni,@social_network_indices,$s_said,$e_said,$i,$said,@saids_with_max_score); 2133: 2134: undef %said2score; 2135: @social_network_indices = sort { $a <=> $b } keys %sn2saids; 2136: ## Find the degree of every player. Record the degree as the player's 2137: ## score. Note that the use of the link weight in calculation (as 2138: ## explained in the comments above.) 2139: foreach $sni (@social_network_indices) { 2140: foreach $s_said (keys %{$sn2saids{$sni}}) { 2141: $said2score{$s_said} = 0; 2142: foreach $e_said (keys %{$sn2saids{$sni}}) { 2143: if ( defined($link{$s_said,$e_said}) ) { 2144: $link_weight = $link{$s_said,$e_said}; 2145: for ( $i = 10; $i > 0; $i--) { 2146: if ( $link_weight > 0 ) { 2147: $said2score{$s_said} += $i; 2148: $link_weight--; 2149: } 2150: else { last; } 2151: } 2152: } 2153: } 2154: if ( $said2score{$s_said} > $max_score ) { 2155: $max_score = $said2score{$s_said}; 2156: $current_winner = $s_said; 2157: } 2158: if ( $said2score{$s_said} < $min_score ) { $min_score = $said2score{$s_said}; } 2159: } 2160: } 2161: ## If several players are tied for the winning score, pick the player 2162: ## who was first on the field (i.e., the one with the lowest integer as 2163: ## a simplified author id). 2164: @saids_with_max_score = (); 2165: foreach $said (keys %said2score) { 2166: if ( $said2score{$said} == $max_score ) { push(@saids_with_max_score,$said); } 2167: } 2168: @saids_with_max_score = sort { $a <=> $b } @saids_with_max_score; 2169: $current_winner = $saids_with_max_score[0]; 2170: } 2171: 2172: 2173: ## UPDATE_SCORES 2174: ## 2175: ## Input: none 2176: ## 2177: ## Effects: Update every player's %said2score, and the $min_score, 2178: ## the $max_score, and $current_winner variables. 2179: ## Also, record every player's previous_score before 2180: ## updating the %said2score hash. Convert all scores to 2181: ## percentages and save in the hash %said2percentage_score. 2182: ## 2183: ## Output: none 2184: ## 2185: ## Notes: This subroutine is a wrapper for the actual 2186: ## score calculating subroutines. The design is meant 2187: ## to facilitate the experimentation with different 2188: ## scoring algorithms. Currently, two possible 2189: ## scoring algorithms based on the measurement of 2190: ## "centrality" in the social networks literature 2191: ## have been implemented. 2192: ## 2193: sub update_scores 2194: { 2195: my($said,$score_range,$log_score_range); 2196: 2197: ## Save the previous scores for each author. 2198: undef %said2previous_score; 2199: foreach $said (keys %said2smids) { 2200: if ( defined($said2score{$said}) ) { $said2previous_score{$said} = $said2score{$said}; } 2201: else { $said2previous_score{$said} = 0; } 2202: } 2203: ## Calculate the new scores. 2204: undef %said2score; 2205: $max_score = 0; 2206: $min_score = 1; 2207: $previous_winner = $current_winner; 2208: $previous_winning_theme = $current_winning_theme; 2209: ## If desired, replace this with a call to a different scoring subroutine. 2210: &update_scores_centrality_by_degree_version(); 2211: ## Save the previous percentage scores for each author. 2212: undef %said2previous_percentage_score; 2213: foreach $said (keys %said2score) { 2214: if (defined($said2percentage_score{$said})) { 2215: $said2previous_percentage_score{$said} = $said2percentage_score{$said}; 2216: } 2217: } 2218: ## Convert each score to a percentage. Scores are plotted in a log scale 2219: ## to allow for the usual Zipf distribution of citations and replies (i.e., 2220: ## only a handful of people are frequently cited and/or replied to and the 2221: ## majority are rarely cited or replied to; cf., Lodka's Law in the information 2222: ## science literature. 2223: undef %said2percentage_score; 2224: $score_range = 1 + $max_score - $min_score; 2225: $log_score_range = log($score_range); 2226: foreach $said (sort {$a <=> $b} keys %said2smids) { 2227: if ( $log_score_range != 0 ) { 2228: # $said2percentage_score{$said} = int( 100 * ( log(1 + $said2score{$said}) / $log_score_range) ); 2229: $said2percentage_score{$said} = int( 100 * ( $said2score{$said} / $score_range ) ); 2230: } 2231: else { $said2percentage_score{$said} = 0; } 2232: } 2233: return; 2234: } 2235: 2236: 2237: ## ANGLE_DIFFERENCE 2238: ## 2239: sub angle_difference 2240: { 2241: my($angle1,$angle2) = @_; 2242: 2243: ## It is assumed that a circle has $radius_of_circle degrees. Find the 2244: ## least number of degrees between the two angles. This 2245: ## entails deciding whether to go clockwise or counter- 2246: ## clockwise. 2247: if ( ( $angle1 - $angle2 ) > ( ( $angle2 + $radius_of_circle ) - $angle1 ) ) { 2248: return(($angle2 + $radius_of_circle) - $angle1); 2249: } 2250: else { return($angle1 - $angle2); } 2251: } 2252: 2253: 2254: ## ANGLE_SUBTRACT 2255: ## 2256: sub angle_subtract 2257: { 2258: my($angle1,$angle2) = @_; 2259: 2260: if ( $angle1 >= $angle2 ) { return($angle1-$angle2); } 2261: else { return(($angle1-$angle2)+$radius_of_circle); } 2262: } 2263: 2264: 2265: ## ANGLE_ADD 2266: ## 2267: sub angle_add 2268: { 2269: my($angle1,$angle2) = @_; 2270: 2271: return(($angle1+$angle2) % $radius_of_circle); 2272: } 2273: 2274: 2275: ## WRITE_HTML_HEADER_FOR_MESSAGE 2276: ## 2277: sub write_html_header_for_message 2278: { 2279: my($smid) = @_; 2280: 2281: my($html,$from,$newsgroups,$date,$subject,$mid,$parent,$parent_smid,@replies,$replies); 2282: my($reply_smid,@cites,$cites,$cite_smid,@cited_bys,$cited_bys,$cited_by_smid); 2283: 2284: ## HTML HEAD 2285: $html = '<HTML>'."\n\t".'<HEAD>'."\n\t\t".'<TITLE>message '.$smid.'</TITLE>'; 2286: $html .= "\n\t\t".'<LINK type="text/css" rel="stylesheet" href="message_style.css">'."\n\t".'</HEAD>'; 2287: $html .= "\n\t".'<BODY>'."\n"; 2288: $html .= "\n\t".'<SCRIPT>window.resizeTo((screen.availWidth/2),screen.availHeight);</SCRIPT>'."\n"; 2289: ## Table containing fields of message header 2290: $html .= "\n\t".'<TABLE>'."\n\t\t".'<TBODY>'; 2291: ## Subject 2292: $html .= "\n\t\t".'<TR align="left">'; 2293: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>SUBJECT: </B></TD>'; 2294: if (defined $smid2subject{$smid}) { 2295: $subject = $smid2subject{$smid}; 2296: $subject = HTML::Entities::encode_entities($subject); 2297: $subject =~ s/\s/\ \;/g; 2298: } 2299: else { $subject = ''; } 2300: $html .= "\n\t\t\t".'<TD class=headertext>'.$subject.'</TD>'; 2301: $html .= "\n\t\t".'</TR>'; 2302: ## From 2303: $html .= "\n\t\t".'<TR align="left">'; 2304: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>SENDER: </B></TD>'; 2305: if ( (defined $smid2said{$smid}) 2306: and (defined $said2ana{$smid2said{$smid}}) ) { 2307: $from = $said2ana{$smid2said{$smid}}; 2308: $from = HTML::Entities::encode_entities($from); 2309: $from =~ s/\s/\ \;/g; 2310: } 2311: else { $from = ''; } 2312: $html .= "\n\t\t\t".'<TD class=headertext>'.$from.'</TD>'; 2313: $html .= "\n\t\t".'</TR>'; 2314: ## Newsgroups 2315: $html .= "\n\t\t".'<TR align="left">'; 2316: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>NEWSGROUPS: </B></TD>'; 2317: if (defined $smid2newsgroups{$smid}) { 2318: $newsgroups = $smid2newsgroups{$smid}; 2319: $newsgroups = HTML::Entities::encode_entities($newsgroups); 2320: $newsgroups =~ s/\s/\ \;/g; 2321: } 2322: else { $newsgroups = ''; } 2323: $html .= "\n\t\t\t".'<TD class=headertext>'.$newsgroups.'</TD>'; 2324: $html .= "\n\t\t".'</TR>'; 2325: ## Date 2326: $html .= "\n\t\t".'<TR align="left">'; 2327: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>DATE: </B></TD>'; 2328: if (defined $smid2date{$smid}) { 2329: $date = $smid2date{$smid}; 2330: $date = HTML::Entities::encode_entities($date); 2331: $date =~ s/\s/\ \;/g; 2332: } 2333: else { $date = ''; } 2334: $html .= "\n\t\t\t".'<TD class=headertext>'.$date.'</TD>'; 2335: $html .= "\n\t\t".'</TR>'; 2336: ## Message-ID 2337: $html .= "\n\t\t".'<TR align="left">'; 2338: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>ID: </B></TD>'; 2339: if (defined $smid2mid{$smid}) { 2340: $mid = $smid2mid{$smid}; 2341: $mid = HTML::Entities::encode_entities($mid); 2342: $mid =~ s/\s/\ \;/g; 2343: $mid = '['.$smid.'] '.$mid; 2344: } 2345: else { $mid = ''; } 2346: $html .= "\n\t\t\t".'<TD class=headertext>'.$mid.'</TD>'; 2347: $html .= "\n\t\t".'</TR>'; 2348: ## References 2349: $html .= "\n\t\t".'<TR align="left">'; 2350: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>PREVIOUS: </B></TD>'; 2351: if ( (defined $smid2spid{$smid}) and $smid2spid{$smid} ) { 2352: $parent_smid = $smid2spid{$smid}; 2353: $parent = '['.$parent_smid.']'; 2354: $parent = '<A href="'.$parent_smid.'.html">'.$parent.'</A>'; 2355: } 2356: else { $parent = ''; } 2357: $html .= "\n\t\t\t".'<TD class=headertext>'.$parent.'</TD>'; 2358: $html .= "\n\t\t".'</TR>'; 2359: ## Replies 2360: $html .= "\n\t\t".'<TR align="left">'; 2361: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>NEXT: </B></TD>'; 2362: undef @replies; 2363: if (defined $spid2smids{$smid}) { 2364: foreach $reply_smid (sort { $a <=> $b } @{$spid2smids{$smid}}) { 2365: push(@replies,'<A href="'.$reply_smid.'.html">['.$reply_smid.']</A>'); 2366: } 2367: } 2368: $replies = join ', ',@replies; 2369: $html .= "\n\t\t\t".'<TD class=headertext>'.$replies.'</TD>'; 2370: $html .= "\n\t\t".'</TR>'; 2371: ## Cites 2372: $html .= "\n\t\t".'<TR align="left">'; 2373: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>CITES: </B></TD>'; 2374: undef @cites; 2375: if (defined $smid2sources{$smid}) { 2376: foreach $cite_smid (sort { $a <=> $b } @{$smid2sources{$smid}}) { 2377: push(@cites,'<A href="'.$cite_smid.'.html">['.$cite_smid.']</A>'); 2378: } 2379: } 2380: $cites = join ', ',@cites; 2381: $html .= "\n\t\t\t".'<TD class=headertext>'.$cites.'</TD>'; 2382: $html .= "\n\t\t".'</TR>'; 2383: 2384: ## Cited-by 2385: $html .= "\n\t\t".'<TR align="left">'; 2386: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>CITED BY: </B></TD>'; 2387: undef @cited_bys; 2388: if (defined $source2smids{$smid}) { 2389: foreach $cited_by_smid (sort { $a <=> $b } @{$source2smids{$smid}}) { 2390: push(@cited_bys,'<A href="'.$cited_by_smid.'.html">['.$cited_by_smid.']</A>'); 2391: } 2392: } 2393: $cited_bys = join ', ',@cited_bys; 2394: $html .= "\n\t\t\t".'<TD class=headertext>'.$cited_bys.'</TD>'; 2395: $html .= "\n\t\t".'</TR>'; 2396: 2397: ## Indices 2398: $html .= "\n\t\t".'<TR align="left">'; 2399: $html .= "\n\t\t\t".'<TD class=headerfieldtext><B>MESSAGES BY: </B></TD>'; 2400: $html .= "\n\t\t\t".'<TD class=headertext>'; 2401: $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_id.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">id</A>, '; 2402: $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_date.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">date</A>, '; 2403: $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_subject.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">subject</A>, '; 2404: $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_sender.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">sender</A>, '; 2405: $html .= '<A href="http://'.$cgi_url.'/Agonistics/list_messages_by_thread.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'">thread</A>'; 2406: $html .= '</TD>'."\n\t\t".'</TR>'; 2407: 2408: ## Reply button for newsgroup discussions 2409: if ( $nntp_server ) { 2410: $html .= "\n\t".'<FORM METHOD="POST" ACTION="http://'.$cgi_url.'/Agonistics/write_message.pl" ENCTYPE="multipart/form-data" TARGET="reply_to_message_'.$smid.'">'; 2411: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="url" VALUE="'.$documents_url.'">'; 2412: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="cgi_url" VALUE="'.$cgi_url.'">'; 2413: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="archive" VALUE="'.$archive_name.'">'; 2414: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="smid" VALUE="'.$smid.'">'; 2415: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="id" VALUE="'.$smid2mid{$smid}.'">'; 2416: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="subject" VALUE="'.$smid2subject{$smid}.'">'; 2417: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="news_group" VALUE="'.$news_group.'">'; 2418: $html .= "\n\t\t".'<INPUT TYPE="hidden" NAME="language_locale" VALUE="'.$language_locale.'">'; 2419: } 2420: ## Reply button for email-based discussions 2421: ## This should also contain in-reply-to and subject fields, but Mozilla 2422: ## seems to have a hard time making sense of these other fields. 2423: else { $html .= "\n\t".'<FORM ACTION="mailto:'.$mailing_list_address.'">'; } 2424: $html .= "\n\t\t".'<TR align="left">'; 2425: $html .= "\n\t\t\t".'<TD class=headerfieldtext>'; 2426: my $button_value = ''; 2427: if ( $language_locale =~ /^EN/ ) { $button_value = "REPLY"; } 2428: elsif ( $language_locale =~ /^FR/ ) { $button_value = "R\x{c9}PONDRE"; } 2429: elsif ( $language_locale =~ /^DE/ ) { $button_value = "ANTWORTEN"; } 2430: $html .= "\n\t\t\t\t".'<INPUT name="reply_to_message" class=buttonstyle type="SUBMIT" value="'.$button_value.'">'; 2431: $html .= "\n\t\t\t".'</TD>'; 2432: $html .= "\n\t\t\t".'<TD> </TD>'; 2433: $html .= "\n\t\t".'</TR>'; 2434: $html .= "\n\t".'</FORM>'; 2435: 2436: ## End of table 2437: $html .= "\n\t\t".'</TBODY>'."\n\t".'</TABLE>'."\n\t".'<BR><BR>'; 2438: 2439: return($html); 2440: } 2441: 2442: 2443: ## WRITE_HTML_FOR_MESSAGE 2444: ## 2445: sub write_html_for_message 2446: { 2447: my($smid) = @_; 2448: 2449: my($ln,$header,$body,$next_line,$line_prefix,$end); 2450: my(@signature_lines,$first); 2451: 2452: ## First, find which lines are signature lines. 2453: @signature_lines = sort {$a <=> $b } &find_line_numbers_of_signature($smid); 2454: ## Correct for the padding done in &find_line_numbers_of_signature; 2455: pop(@signature_lines); 2456: shift(@signature_lines); 2457: ## Next, write the head and body of the message as HTML. 2458: $ln = 0; 2459: $header = &write_html_header_for_message($smid); 2460: $body = ''; 2461: while (defined $smidln2line{"$smid $ln"}) { 2462: $next_line = $smidln2line{"$smid $ln"}; 2463: $next_line = HTML::Entities::encode_entities($next_line); 2464: $next_line =~ s/\s/\ \;/g; 2465: ## Every line of every message gets an anchor name so that 2466: ## it can be linked to, if necessary, from another message. 2467: $line_prefix = '<A name="'.$ln.'"'; 2468: ## If a line is a signature line, then it should be highlighted, 2469: ## but not hyperlinked. 2470: if ( ( >= 0) and ( $ln == $signature_lines[0] ) ) { 2471: $line_prefix .= ' class=signaturetext'; 2472: ($first,@signature_lines) = @signature_lines; 2473: } 2474: ## Else if a line is not its own source, then it is a quote from elsewhere 2475: ## and will be hyperlinked. 2476: elsif ( defined($smidln2source{"$smid $ln"}) 2477: and ( $smidln2source{"$smid $ln"} ne "$smid $ln" ) ) { 2478: ($source_smid,$source_ln) = $smidln2source{"$smid $ln"} =~ /^(\d+) (\d+)$/; 2479: if ( $source_smid or $source_ln ) { 2480: $line_prefix .= ' class=linktext'; 2481: $line_prefix .= ' href="'.$source_smid.'.html#'.$source_ln.'"'; 2482: } 2483: } 2484: ## Otherwise, the line is a line of text original to the message. 2485: else { $line_prefix .= ' class=messagetext'; } 2486: $line_prefix .= '>'; 2487: $next_line = "\n\t".$line_prefix.$next_line.'</A><BR>'; 2488: $body .= $next_line; 2489: $ln++; 2490: } 2491: $end = "\n\t".'</BODY>'."\n".'</HTML>'; 2492: open(OUT,'>'.$message_directory.$slash.$smid.'.html'); 2493: print OUT $header.$body.$end; 2494: close(OUT); 2495: return; 2496: } 2497: 2498: 2499: ## FIND_ADJACENT_SOCIAL_NETWORK_FOR_INTERFACE 2500: ## 2501: ## Input: A list of simplified message ids (integers) that represent 2502: ## either the messages quoted or replied to by the current 2503: ## message. 2504: ## 2505: ## Output: An integer: a social network index. If the message posted 2506: ## by the member of the new social network, is a message that 2507: ## replies to or cites the message of an author already in 2508: ## another social network, return the index of the other 2509: ## social network; otherwise, return 0. 2510: ## 2511: ## Notes: The purpose of this function is to try to find a likely 2512: ## place to position the player in a new social network in 2513: ## the interface. The player in the new social network is 2514: ## assumed to be the network's sole member and is the 2515: ## author of the message just posted to the newsgroup. 2516: ## 2517: sub find_adjacent_social_network_for_interface 2518: { 2519: my(@source_smids) = @_; 2520: 2521: my($source_smid,$sn); 2522: 2523: ## The following is necessary because some authors don't 2524: ## belong to social networks because they are authors 2525: ## of messages that are referenced, but not included 2526: ## in the archive being analyzed. 2527: $sn = -1; 2528: foreach $source_smid (@source_smids) { 2529: if ( defined($smid2said{$source_smid}) 2530: and defined($said2sn{$smid2said{$source_smid}}) ) { 2531: $sn = $said2sn{$smid2said{$source_smid}}; 2532: last; 2533: } 2534: } 2535: return($sn); 2536: } 2537: 2538: 2539: ## EVENLY_MIX 2540: ## 2541: sub evenly_mix 2542: { 2543: my($ptr_to_part1,$ptr_to_part2,$ptr_to_whole) = @_; 2544: 2545: my(@part1,@part2,@whole,@new_whole,$w,$p1,$p2,$i,$j,$part1_after_part2); 2546: 2547: @part1 = @{$ptr_to_part1}; 2548: @part2 = @{$ptr_to_part2}; 2549: if ( scalar(@part2) > scalar(@part1) ) { 2550: @part1 = @{$ptr_to_part2}; ## assumed to be more numerous 2551: @part2 = @{$ptr_to_part1}; 2552: } 2553: @whole = @{$ptr_to_whole}; 2554: if ( scalar(@part2) <= 1 ) { return(@whole); } 2555: ## Rotate the members of part1 until the one that used to 2556: ## follow the first elements of part2 is at the front 2557: ## of @part1. 2558: ## First, rotate @whole until the first element of @part2 2559: ## is at the front. 2560: while ( $whole[0] != $part2[0] ) { 2561: $w = shift @whole; 2562: push(@whole,$w); 2563: } 2564: $part1_after_part2 = 'undefined'; 2565: ## Second, find the element of @part1 that comes 2566: ## after the first element opf @part2 2567: for ($i = 1; $i <= ; $i++) { 2568: if ( grep { $_ == $whole[$i] } @part1 ) { 2569: $part1_after_part2 = $whole[$i]; 2570: last; 2571: } 2572: } 2573: ## Third, rotate @part1 around until the 2574: ## element following the first of @part2 2575: ## is at its head. 2576: if ( $part1_after_part2 ne 'undefined' ) { 2577: while ( $part1[0] != $part1_after_part2 ) { 2578: $p1 = shift @part1; 2579: push(@part1,$p1); 2580: } 2581: } 2582: ## Now, interlace @part1 and @part2 2583: $increment = int(scalar(@part1) / scalar(@part2)); 2584: @new_whole = (); 2585: foreach $p2 (@part2) { 2586: push(@new_whole,$p2); 2587: for ( $j = 0; $j < $increment; $j++ ) { 2588: if ( @part1 ) { 2589: push(@new_whole,$part1[0]); 2590: shift @part1; 2591: } 2592: } 2593: } 2594: if ( @part1 ) { push(@new_whole,@part1); } 2595: return(@new_whole); 2596: } 2597: 2598: 2599: ## REDISTRIBUTE_LARGE_SOCIAL_NETWORKS 2600: ## 2601: sub redistribute_large_social_networks 2602: { 2603: my(@social_network_indices) = @_; 2604: 2605: my($large,@large,@small,@invisible,@small_and_invisible,$is_visible); 2606: my($sni,$said); 2607: 2608: $large = 2; ## any network with two or more players is considered "large." 2609: @large = (); 2610: @small = (); 2611: @invisible = (); 2612: foreach $sni (@social_network_indices) { 2613: $is_visible = 0; 2614: foreach $said (keys %{$sn2saids{$sni}}) { 2615: if ( &player_is_visible($said) ) { 2616: $is_visible = 1; 2617: last; 2618: } 2619: } 2620: if ( $is_visible ) { 2621: if ( scalar(keys %{$sn2saids{$sni}}) >= $large ) { push(@large,$sni); } 2622: else { push(@small,$sni); } 2623: } 2624: else { push(@invisible,$sni); } 2625: } 2626: ## Mix the invisibles into the smalls 2627: @small_and_invisible = &evenly_mix(\@small,\@invisible,\@social_network_indices); 2628: ## Now mix the larges into the small and invisible 2629: return(&evenly_mix(\@small_and_invisible,\@large,\@social_network_indices)); 2630: } 2631: 2632: 2633: ## INSERT_NEW_SOCIAL_NETWORKS_FOR_INTERFACE 2634: ## 2635: ## Input: None 2636: ## 2637: ## Output: A list of sorted social network indices with any new social 2638: ## networks inserted into the list. 2639: ## 2640: ## Notes: Insert any new social networks into the ordered list of networks. 2641: ## There should only be one, at the most, in principle, such new 2642: ## network after one new message posted to the newsgroup. 2643: ## 2644: sub insert_new_social_networks_for_interface 2645: { 2646: my(@social_network_indices,$sni,$esn,$i); 2647: 2648: ## Produce a list of continuing social networks that does not include 2649: ## any of the disappearing (i.e., old) social networks. The old 2650: ## social networks have been taken out of the %said2sn 2651: ## and sn2saids hashes, but are still keys in the 2652: ## %sn2theta hash. Take them out of this hash too. 2653: @social_network_indices = (); 2654: foreach $sni (keys %sn2saids) { 2655: if ( grep { $_ == $sni } (keys %old_social_network) ) { 2656: delete($sn2theta{$sni}); 2657: next; 2658: } 2659: if ( grep { $_ == $sni } (keys %new_social_network) ) { next; } 2660: else { push(@social_network_indices,$sni); } 2661: } 2662: ## Sort the existing social networks into counter-clockwise order. 2663: @social_network_indices = sort { $sn2theta{$a} <=> $sn2theta{$b} } @social_network_indices; 2664: ## Re-sort the indices so that the large social networks are equally distributed. 2665: @social_network_indices = &redistribute_large_social_networks(@social_network_indices); 2666: ## If there is currently only one social network, return it. 2667: if ( not(@social_network_indices) ) { return(keys %new_social_network); } 2668: ## Otherwise, insert any new social network indices into this ordered list using 2669: ## &find_adjacent_social_network_for_interface to find appropriate insertion points. 2670: foreach $new_social_network (keys %new_social_network) { 2671: @source_smids = (); 2672: foreach $said (keys %{$sn2saids{$new_social_network}}) { 2673: foreach $smid (@{$said2smids{$said}}) { 2674: if (defined($smid2sources{$smid}) ) { 2675: push(@source_smids,@{$smid2sources{$smid}}); 2676: } 2677: if (defined($smid2spid{$smid})) { push(@source_smids,$smid2spid{$smid}); } 2678: } 2679: } 2680: $esn = &find_adjacent_social_network_for_interface(@source_smids); 2681: ## No insertion point found because current message is neither a 2682: ## reply to nor does it include a citation from another message. 2683: if ( $esn == -1 ) { push(@social_network_indices,$new_social_network); } 2684: ## Otherwise, rotate to position the $esn at the start of the ordered list 2685: ## and insert the new social network next to it by pushing onto the 2686: ## end of the list (remember the order is circular). 2687: else { 2688: $i = 0; 2689: while ( $social_network_indices[0] != $esn ) { 2690: if ( $i > ) { 2691: print STDOUT "\nError: Problem with the social network indices!\n"; 2692: last; 2693: } 2694: $sni = shift @social_network_indices; 2695: push(@social_network_indices,$sni); 2696: $i++; 2697: } 2698: push(@social_network_indices,$new_social_network); 2699: } 2700: } 2701: return(@social_network_indices); 2702: } 2703: 2704: ## WRITE_HTML_FOR_LEADERS_LIST 2705: ## 2706: sub write_html_for_leaders_list 2707: { 2708: my($current_smid) = @_; 2709: 2710: my($html,$said,@saids,$messagesCGIScript,$addr); 2711: 2712: $messagesCGIScript = 'http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'&id='; 2713: $html .= "\n\t".'<TABLE align="right" style="width: 20%; text-align: right;">'."\n\t\t".'<TBODY>'; 2714: $html .= "\n\t\t".'<TR align="right">'; 2715: if ( $language_locale =~ /^EN/ ) { 2716: $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>c u r r e n t l e a d e r s</B></TD>'; 2717: } 2718: elsif ( $language_locale =~ /^FR/ ) { 2719: $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>c l a s s e m e n t</B></TD>'; 2720: } 2721: elsif ( $language_locale =~ /^DE/ ) { 2722: $html .= "\n\t\t\t".'<TD align="right" class=headerfieldtext><B>L i s t e d e r S p i e l e r</B></TD>'; 2723: } 2724: $html .= "\n\t\t".'</TR>'; 2725: @saids = sort { if ( $said2score{$a} == $said2score{$b} ) { lc($said2addr{$a}) cmp lc($said2addr{$b}); } else { $said2score{$b} <=> $said2score{$a} } } keys %said2score; 2726: # if ( $#saids >= 9 ) { @saids = @saids[0..9]; } 2727: foreach $said (@saids) { 2728: if ( defined($said2addr{$said}) and defined($said2score{$said}) ) { 2729: $addr = HTML::Entities::encode_entities($said2addr{$said}); 2730: $html .= "\n\t\t".'<TR align="right">'; 2731: $html .= "\n\t\t\t".'<TD align="right" id="'.$said.'" class=scorelisttext onMouseOver="turnLabelOnFromLeadersList(this,event);"><B><A href="'.$messagesCGIScript.$said.'" target="messages_posted_by_'.$said2addr{$said}.'">'.$addr.'</A> '.$said2score{$said}.'</B></TD>'; 2732: $html .= "\n\t\t".'</TR>'; 2733: } 2734: } 2735: $html .= "\n\t\t".'</TBODY>'; 2736: $html .= "\n\t".'</TABLE>'; 2737: return($html); 2738: } 2739: 2740: 2741: ## WRITE_HTML_FOR_WINNING_SENTENCES 2742: ## 2743: sub write_html_for_winning_sentences 2744: { 2745: my($current_smid) = @_; 2746: 2747: my($html,@stems,$stem,$theme,$ssid,@winning_ssids,$i,$j,$ws,$said,@saids); 2748: my($messagesCGIScript,$messagesURL,$addr,$sentence_text); 2749: 2750: $messagesCGIScript = 'http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'&id='; 2751: $messagesURL = 'http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/Messages/'; 2752: $html .= "\n\t".'<TABLE align="left" style="width: 20%; text-align: left;" >'."\n\t\t".'<TBODY>'; 2753: $html .= "\n\t\t".'<TR align="left">'; 2754: if ( $language_locale =~ /^FR/ ) { 2755: $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>t h è m e s f r é q u e n t s</B></TD>'; 2756: } 2757: elsif ( $language_locale =~ /^EN/ ) { 2758: $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>s t a t e m e n t s a b o u t<BR>f r e q u e n t t h e m e s</B></TD>'; 2759: } 2760: elsif ( $language_locale =~ /^DE/ ) { 2761: $html .= "\n\t\t\t".'<TD align="left" class=headerfieldtext><B>T h e m e n d e r<BR>D i s k u s s i o n</B></TD>'; 2762: } 2763: $html .= "\n\t\t".'</TR>'; 2764: @stems = sort { $#{@{$winning_sentences{$b}}} <=> $#{@{$winning_sentences{$a}}} } keys %winning_sentences; 2765: @saids = sort { $said2score{$b} <=> $said2score{$a} } keys %said2score; 2766: $i = 0; 2767: foreach $stem (@stems) { 2768: if ( $i > 2 ) { last; } 2769: @winning_ssids = (); 2770: $j = 0; 2771: foreach $said (@saids) { 2772: @{$winning_sentences{$stem}} = sort { $smid2sdt{$ssid2smid{$b}} <=> $smid2sdt{$ssid2smid{$a}} } @{$winning_sentences{$stem}}; 2773: foreach $ws (@{$winning_sentences{$stem}}) { 2774: if ($smid2said{$ssid2smid{$ws}} == $said) { 2775: push(@winning_ssids,$ws); 2776: $j++; 2777: } 2778: if ( $j >= 10 ) { last; } 2779: } 2780: if ( $j >= 10 ) { last; } 2781: } 2782: if ( not(@winning_ssids) ) { next; } 2783: $i++; 2784: if ( defined($stem2tokens{$stem}) ) { $theme = ${@{$stem2tokens{$stem}}}[0]; } 2785: else { $theme = $stem; } 2786: $theme = HTML::Entities::encode_entities($theme); 2787: $html .= "\n\t\t".'<TR align="left">'; 2788: $html .= "\n\t\t\t".'<TD valign="top" align="left" class=pastthemetext><B>'.$theme.'</B></TD>'; 2789: $html .= "\n\t\t".'</TR>'; 2790: foreach $ssid (@winning_ssids) { 2791: $html .= "\n\t\t".'<TR align="left">'; 2792: if ( defined($ssid2smid{$ssid}) 2793: and defined($smid2said{$ssid2smid{$ssid}}) 2794: and defined($said2addr{$smid2said{$ssid2smid{$ssid}}}) 2795: and defined($ssid2sentence{$ssid}) ) { 2796: $addr = HTML::Entities::encode_entities($said2addr{$smid2said{$ssid2smid{$ssid}}}); 2797: $sentence_text = HTML::Entities::encode_entities($ssid2sentence{$ssid}); 2798: $html .= "\n\t\t\t".'<TD align="left"><B class=pastspeakertext><A href="'.$messagesCGIScript.$smid2said{$ssid2smid{$ssid}}.'" target="'.$said2addr{$smid2said{$ssid2smid{$ssid}}}.'">'.$addr.'</A>:</B> <B class=pastsentencetext><A href="'.$messagesURL.$ssid2smid{$ssid}.'.html#'.$ssid2start{$ssid}.'" target="message '.$ssid2smid{$ssid}.'">'.$sentence_text.'</A></B></TD>'; 2799: } 2800: else { $html .= "\n\t\t\t".'<TD> </TD>'; } 2801: $html .= "\n\t\t".'</TR>'; 2802: } 2803: } 2804: $html .= "\n\t\t".'</TBODY>'; 2805: $html .= "\n\t".'</TABLE>'; 2806: return($html); 2807: } 2808: 2809: 2810: ## ASSIGN_THETAS_FOR_INTERFACE 2811: ## 2812: sub assign_thetas_for_interface 2813: { 2814: my (@social_network_indices) = @_; 2815: 2816: my($i,$sni,$last_angle,$index_of_largest_network_with_theta); 2817: my($size_of_largest_network_with_theta); 2818: 2819: ## If there is only one social network currently, then assign 2820: ## position it at 0. 2821: if ( == 0 ) { $sn2theta{$social_network_indices[0]} = 0; } 2822: ## Otherwise, first rotate the @social_network_indices until a social network 2823: ## with an existing theta angle is found. Find the largest social network with 2824: ## an assigned theta to be the anchor point. 2825: else { 2826: $index_of_largest_network_with_theta = -1; 2827: $size_of_largest_network_with_theta = -1; 2828: for ($i = 0; $i <= ; $i++) { 2829: $sni = $social_network_indices[$i]; 2830: if ( defined($social_network_indices[$i]) 2831: and defined($sn2theta{$sni}) 2832: and defined($sn2saids{$sni}) 2833: and (keys %{$sn2saids{$sni}}) 2834: and ( scalar(keys %{$sn2saids{$sni}}) > $size_of_largest_network_with_theta ) ) { 2835: $size_of_largest_network_with_theta = scalar(keys %{$sn2saids{$sni}}); 2836: $index_of_largest_network_with_theta = $sni; 2837: } 2838: } 2839: if ( $index_of_largest_network_with_theta != -1 ) { 2840: $sni = -1; 2841: while ( $sni != $index_of_largest_network_with_theta ) { 2842: $sni = shift @social_network_indices; 2843: push(@social_network_indices,$sni); 2844: } 2845: $last_angle = $sn2theta{$index_of_largest_network_with_theta}; 2846: } 2847: else { $last_angle = 0; } 2848: $angle_increment = ( $radius_of_circle / scalar(@social_network_indices) ); 2849: undef %sn2theta; 2850: foreach $sni (@social_network_indices) { 2851: $sn2theta{$sni} = $last_angle; 2852: $last_angle = &angle_add($last_angle,$angle_increment); 2853: } 2854: } 2855: return; 2856: } 2857: 2858: 2859: ## DISTRIBUTE_PLAYERS_FOR_INTERFACE 2860: ## 2861: sub distribute_players_for_interface 2862: { 2863: my(@social_network_indices) = @_; 2864: 2865: foreach $sni (@social_network_indices) { 2866: ## Update the theta and rho positions for each player. 2867: foreach $said (keys %{$sn2saids{$sni}}) { 2868: if ( not(defined($said2theta{$said})) ) { $said2previous_theta{$said} = $sn2theta{$said}; } 2869: else { $said2previous_theta{$said} = $said2theta{$said}; } 2870: $said2theta{$said} = $sn2theta{$sni}; 2871: $said2theta_movement{$said} = &angle_subtract($said2theta{$said},$said2previous_theta{$said}); 2872: if ( not(defined($said2rho{$said})) ) { $said2previous_rho{$said} = $said2percentage_score{$said}; } 2873: else { $said2previous_rho{$said} = $said2rho{$said}; } 2874: $said2rho{$said} = $said2percentage_score{$said}; 2875: if ( defined($said2rho{$said}) and defined($said2previous_rho{$said}) ) { 2876: $said2rho_movement{$said} = $said2rho{$said} - $said2previous_rho{$said}; 2877: } 2878: else { $said2rho_movement{$said} = 0; } 2879: } 2880: } 2881: return; 2882: } 2883: 2884: 2885: ## SELECT_FACES_AND_PROFILES_FOR_INTERFACE 2886: ## 2887: sub select_faces_and_profiles_for_interface 2888: { 2889: my($current_smid) = @_; 2890: 2891: my($pi,$speaker_said,%x_pos,$said); 2892: 2893: $speaker_said = $smid2said{$current_smid}; 2894: ## Determine the horizontal position of the author of the current_smid 2895: $pi = atan2(1,1) * 4; 2896: $x_pos{$speaker_said} = ( ( $radius_of_circle - $said2rho{$speaker_said} ) 2897: * cos(($said2theta{$speaker_said}/$radius_of_circle) * 2 * $pi) ); 2898: ## To pick an image and a profile for players, determine whether they are to the left or the 2899: ## right of the speaker. 2900: foreach $said (sort {$a <=> $b} keys %said2smids) { 2901: if ( defined($said2rho{$said}) and defined($said2theta{$said}) ) { 2902: $x_pos{$said} = ( ( $radius_of_circle - $said2rho{$said} ) 2903: * cos(($said2theta{$said}/$radius_of_circle) * 2 * $pi) ); 2904: } 2905: else { $x_pos{$said} = $x_pos{$speaker_said}; } 2906: if ( $x_pos{$said} == $x_pos{$speaker_said} ) { $said2profile{$said} = ''; } 2907: elsif ( $x_pos{$said} > $x_pos{$speaker_said} ) { $said2profile{$said} = 'l'; } 2908: else { $said2profile{$said} = 'r'; } 2909: ## Assign each new image id randomly. 2910: if (not defined($said2image_id{$said})) { 2911: $said2image_id{$said} = int(rand($number_of_faces)); 2912: if ( $said2image_id{$said} == 0 ) { $said2image_id{$said} = 1; } 2913: } 2914: } 2915: } 2916: 2917: 2918: ## PLAYER_IS_VISIBLE 2919: ## 2920: sub player_is_visible 2921: { 2922: my($said) = @_; 2923: 2924: my(@smids); 2925: 2926: @smids = @{$said2smids{$said}}; 2927: if ( $smids[] >= ( $current_simplified_message_id - $recency ) ) { return(1); } 2928: ## Even if the current winner has not posted within $recency messages, keep the winner visible. 2929: if ( $current_winner == $said ) { return(1); } 2930: else { return(0); } 2931: } 2932: 2933: 2934: ## WRITE_HTML_FOR_INTERFACE 2935: ## 2936: sub write_html_for_interface 2937: { 2938: my($current_smid,@source_smids) = @_; 2939: 2940: my(@social_network_indices,$html_front_divs,$html_profile_divs,$html_label_divs,$html); 2941: my(@player_defs,$player_def,$said,$speaker_said,@js_code,$js_code,$sni); 2942: my($html_theme_divs,$theme_defs,$i,$subject_line_div,$shared_stem,@shared_stems); 2943: my($sentence_div,$sentence_text,$subject_line_text,$start_date,$end_date,$newsgroup_html); 2944: my($newsgroup_div,$newsgroup_def,$start_date_html,$start_date_div,$start_date_def); 2945: my($end_date_html,$end_date_div,$end_date_def,$post_button_label); 2946: 2947: @social_network_indices = &insert_new_social_networks_for_interface(); 2948: undef %new_social_network; 2949: undef %old_social_network; 2950: &assign_thetas_for_interface(@social_network_indices); 2951: &distribute_players_for_interface(@social_network_indices); 2952: &select_faces_and_profiles_for_interface($current_smid,@social_network_indices); 2953: 2954: ## Now write the HTML 2955: $html = '<HTML> 2956: <HEAD> 2957: <META http-equiv="Refresh" content="'.$pause_between_frames.'; URL=agon'.($frame_number+1).'.html"> 2958: <TITLE>agonistics</TITLE> 2959: <LINK type="text/css" rel="stylesheet" href="interface_style.css"> 2960: </HEAD> 2961: <BODY>'; 2962: $html .= &write_html_for_leaders_list($current_smid); 2963: $html .= &write_html_for_winning_sentences($current_smid); 2964: $html_front_divs = ''; 2965: $html_profile_divs = ''; 2966: $html_label_divs = ''; 2967: $html_theme_divs = ''; 2968: @player_defs = (); 2969: $speaker_said = $smid2said{$current_smid}; 2970: $speaker_sni = $said2sn{$speaker_said}; 2971: foreach $said (sort {$said2theta{$a} <=> $said2theta{$b}} keys %said2smids) { 2972: if ( not(&player_is_visible($said)) ) { next; } 2973: if ( $said == $speaker_said ) { $said_is_speaker{$said} = 1; } 2974: else { $said_is_speaker{$said} = 0; } 2975: $html_front_divs .= '<DIV id="'.$said.'_front" style="position:absolute; visibility:hidden" onMouseDown="beginDrag(this, event);" onMouseOver="turnLabelOnFromProfile(this, event);"></DIV>'."\n"; 2976: $html_profile_divs .= '<DIV id="'.$said.'_profile" style="position:absolute; visibility:hidden"></DIV>'."\n"; 2977: $html_label_divs .= '<DIV id="'.$said.'_label" style="position:absolute; visibility:hidden"></DIV>'."\n"; 2978: if ( defined($said2addr{$said}) ) { 2979: $player_def = '{ addr:"'.$said2addr{$said}.'"'; 2980: $player_def .= ', initialRho:'.$said2previous_rho{$said}.', finalRho:'.$said2rho{$said}; 2981: $player_def .= ', rhoMovement:'.$said2rho_movement{$said}; 2982: $player_def .= ', initialTheta:'.$said2previous_theta{$said}.', finalTheta:'.$said2theta{$said}; 2983: $player_def .= ', thetaMovement:'.$said2theta_movement{$said}; 2984: $player_def .= ', profile:"'.$said2profile{$said}.'", imageID:'.$said2image_id{$said}; 2985: $player_def .= ', said:'.$said.', isSpeaker:'.$said_is_speaker{$said}; 2986: $player_def .= ', score:'.$said2score{$said}.', previousScore:'.$said2previous_score{$said}; 2987: if ( $said == $current_winner ) { $player_def .= ', isWinner: 1 }'; } 2988: else { $player_def .= ', isWinner: 0 }'; } 2989: push(@player_defs,$player_def); 2990: } 2991: } 2992: $subject_line_div = '<DIV id="subject_line_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 2993: $sentence_div = '<DIV id="sentence_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 2994: $newsgroup_html = '<B class=labeltext>n e w s g r o u p </B>'; 2995: $newsgroup_html .= '<B class=scorelisttext>'.$news_group.' </B>'; 2996: $newsgroup_div = '<DIV id="newsgroup_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 2997: $newsgroup_def = 'var newsgroupHTML = "'.$newsgroup_html.'";'."\n"; 2998: $end_date_html = '<B class=labeltext>t i m e </B>'; 2999: $end_date_html .= '<B class=scorelisttext>'.&code_the_date($smid2date{$current_smid}).'</B>'; 3000: $end_date_div = '<DIV id="end_date_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3001: $end_date_def = 'var endDateHTML = "'.$end_date_html.'";'."\n"; 3002: ## add forward and rewind buttons 3003: $rewind_button_html = '<A href="http://'.$cgi_url.'/Agonistics/rewind_to_first_frame.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'"><IMG src="rewind.gif"></A>'; 3004: $rewind_button_div = '<DIV id="rewind_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3005: $rewind_button_def = "var rewindButtonHTML = '".$rewind_button_html."';\n"; 3006: $forward_button_html = '<A href="http://'.$cgi_url.'/Agonistics/forward_to_latest_frame.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'"><IMG src="forward.gif"></A>'; 3007: $forward_button_div = '<DIV id="forward_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3008: $forward_button_def = "var forwardButtonHTML = '".$forward_button_html."';\n"; 3009: ## add help and post buttons 3010: if ( $language_locale =~ /^EN/ ) { 3011: $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-en.html" TARGET="agonistics / help">'; 3012: $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="help "></FORM>'; 3013: $post_button_label = "write"; 3014: } 3015: elsif ( $language_locale =~ /^FR/ ) { 3016: $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-fr.html" TARGET="agonistics / aide">'; 3017: $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="aide "></FORM>'; 3018: $post_button_label = "écrire"; 3019: } 3020: elsif ( $language_locale =~ /^DE/ ) { 3021: $help_button_html = '<FORM ACTION="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/help-de.html" TARGET="agonistics / hilfe">'; 3022: $help_button_html .= '<INPUT class=buttonstyle type="SUBMIT" value="hilfe "></FORM>'; 3023: $post_button_label = "schreiben"; 3024: } 3025: $help_button_div = '<DIV id="help_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3026: $help_button_def = "var helpButtonHTML = '".$help_button_html."';\n"; 3027: ## Post button for newsgroup discussions 3028: if ( $nntp_server ) { 3029: $post_button_html = '<FORM METHOD="POST" ACTION="http://'.$cgi_url.'/Agonistics/write_message.pl" ENCTYPE="multipart/form-data" TARGET="write_a_message">'; 3030: $post_button_html .= '<INPUT TYPE="hidden" NAME="url" VALUE="'.$documents_url.'">'; 3031: $post_button_html .= '<INPUT TYPE="hidden" NAME="cgi_url" VALUE="'.$cgi_url.'">'; 3032: $post_button_html .= '<INPUT TYPE="hidden" NAME="archive" VALUE="'.$archive_name.'">'; 3033: $post_button_html .= '<INPUT TYPE="hidden" NAME="news_group" VALUE="'.$news_group.'">'; 3034: $post_button_html .= '<INPUT TYPE="hidden" NAME="language_locale" VALUE="'.$language_locale.'">'; 3035: } 3036: ## Post button for email-based discussions 3037: else { $post_button_html = '<FORM ACTION="mailto:'.$mailing_list_address.'">'; } 3038: $post_button_html .= '<INPUT name="post_message" class=buttonstyle type="SUBMIT" value="'.$post_button_label.'">'; 3039: $post_button_div = '<DIV id="post_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3040: $post_button_def = "var postButtonHTML = '".$post_button_html."';\n"; 3041: $def_button_html = ''; 3042: $def_button_div = '<DIV id="def_button_div" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3043: $def_button_def = "var defButtonHTML = '".$def_button_html."';\n"; 3044: ## Player definitions 3045: $player_defs = join ','."\n",@player_defs; 3046: $player_defs = 'var players = [ '.$player_defs.'];'; 3047: $current_winning_theme = HTML::Entities::encode_entities($current_winning_theme); 3048: $theme_defs = '{ network:'.$speaker_sni.', text:"'.$current_winning_theme.'", theta:'.$sn2theta{$speaker_sni}.' }'; 3049: $theme_defs = 'var networkThemes = [ '.$theme_defs.' ];'; 3050: $html_theme_divs .= '<DIV id="theme_'.$speaker_sni.'" style="position:absolute; visibility:hidden"></DIV>'."\n"; 3051: $html .= $html_front_divs."\n\n".$html_profile_divs."\n\n".$html_label_divs."\n\n".$subject_line_div."\n\n".$sentence_div."\n\n".$html_theme_divs."\n\n".$newsgroup_div."\n\n".$end_date_div."\n\n".$rewind_button_div."\n\n".$forward_button_div."\n\n".$help_button_div."\n\n".$post_button_div."\n\n".$def_button_div."\n\n".'<SCRIPT>'."\n".$player_defs."\n\n".$theme_defs."\n\n".$newsgroup_def."\n\n".$end_date_def."\n\n".$rewind_button_def."\n\n".$forward_button_def."\n\n".$help_button_def."\n\n".$post_button_def."\n\n".$def_button_def."\n\n"; 3052: $html .= 'var dir = "Messages";'."\n"; 3053: $html .= 'var messagesCGIScript = "http://'.$cgi_url.'/Agonistics/index_for_player.pl?url='.$documents_url.'&path='.$web_server_directory.'&archive='.$archive_name.'&msg='.$current_smid.'";'."\n"; 3054: $subject_line_text = HTML::Entities::encode_entities($smid2nsl{$current_smid}); 3055: $html .= 'var subjectLineText = "'.$subject_line_text.'";'."\n"; 3056: if ( $current_winning_sentence ) { 3057: $current_winning_sentence =~ s/"//g; 3058: $current_winning_sentence = HTML::Entities::encode_entities($current_winning_sentence); 3059: $html .= 'var sentenceText = "'.$current_winning_sentence.'";'."\n"; 3060: } 3061: else { $html .= 'var sentenceText = "";'."\n"; } 3062: if ( defined($said2addr{$current_winner}) ) { $html .= 'var currentWinner = "'.$said2addr{$current_winner}.'";'."\n"; } 3063: else { $html .= 'var currentWinner = "";'."\n"; } 3064: if ( defined($said2addr{$speaker_said}) ) { $html .= 'var currentSpeaker = "'.$said2addr{$speaker_said}.'";'."\n"; } 3065: else { $html .= 'var currentSpeaker = "";'."\n"; } 3066: open(JS,'Resources'.$slash.'javascript_interface_code.txt'); 3067: @js_code = <JS>; 3068: close(JS); 3069: $js_code = join '',@js_code; 3070: $html .= $js_code; 3071: open(AGON,'>'.$interface_directory.$slash.'agon'.$frame_number.'.html'); 3072: print AGON $html; 3073: close(AGON); 3074: } 3075: 3076: 3077: ## CODE_THE_DATE 3078: ## 3079: sub code_the_date 3080: { 3081: my($date) = @_; 3082: 3083: return(HTML::Entities::encode_entities(lc(UnixDate($date,"%a %b %e %H:%M:%S %Y")))); 3084: } 3085: 3086: 3087: ## WRITE_MESSAGE_LIST_ID_ENTRY 3088: ## 3089: sub write_message_list_id_entry 3090: { 3091: my($smid,$is_hyperlinked) = @_; 3092: 3093: if ( $is_hyperlinked ) { $smid = '<A href="'.$smid.'.html">'.$smid.'</A>'; } 3094: return("\n\t\t\t".'<TD class=listtext>'.$smid.'</TD>'); 3095: } 3096: 3097: 3098: ## WRITE_MESSAGE_LIST_DATE_ENTRY 3099: ## 3100: sub write_message_list_date_entry 3101: { 3102: my($smid,$is_hyperlinked) = @_; 3103: 3104: my($date); 3105: 3106: if (defined $smid2date{$smid}) { 3107: $date = $smid2date{$smid}; 3108: $date = UnixDate($date,"%d %b %y at %H:%M"); 3109: $date = HTML::Entities::encode_entities($date); 3110: $date =~ s/\s/\ \;/g; 3111: } 3112: else { $date = ''; } 3113: if ( $is_hyperlinked ) { $date = '<A href="'.$smid.'.html">'.$date.'</A>'; } 3114: return("\n\t\t\t".'<TD class=listtext>'.$date.'</TD>'); 3115: } 3116: 3117: 3118: ## WRITE_MESSAGE_LIST_SUBJECT_ENTRY 3119: ## 3120: sub write_message_list_subject_entry 3121: { 3122: my($smid,$is_hyperlinked) = @_; 3123: 3124: my($subject); 3125: 3126: if (defined $smid2subject{$smid}) { 3127: $subject = $smid2subject{$smid}; 3128: $subject = HTML::Entities::encode_entities($subject); 3129: $subject =~ s/\s/\ \;/g; 3130: } 3131: else { $subject = ''; } 3132: if ( length($subject) > 60 ) { $subject = substr($subject,0,60); } 3133: if ( $is_hyperlinked ) { $subject = '<A href="http://'.$documents_url.'/Agonistics/'.$archive_name.'/Interface/Messages/'.$smid.'.html">'.$subject.'</A>'; } 3134: return("\n\t\t\t".'<TD class=listtext>'.$subject.'</TD>'); 3135: } 3136: 3137: 3138: ## WRITE_MESSAGE_LIST_SENDER_ENTRY 3139: ## 3140: sub write_message_list_sender_entry 3141: { 3142: my($smid,$is_hyperlinked) = @_; 3143: 3144: my($sender); 3145: 3146: if ( (defined $smid2said{$smid}) and (defined $said2ana{$smid2said{$smid}} ) ) { 3147: $sender = $said2ana{$smid2said{$smid}}; 3148: $sender = HTML::Entities::encode_entities($sender); 3149: $sender =~ s/\s/\ \;/g; 3150: } 3151: else { $sender = ''; } 3152: if ( $is_hyperlinked ) { $sender = '<A href="'.$smid.'.html">'.$sender.'</A>'; } 3153: return("\n\t\t\t".'<TD class=listtext>'.$sender.'</TD>'); 3154: } 3155: 3156: 3157: ## WRITE_ENTRY_IN_MESSAGE_INDEX 3158: ## 3159: sub write_entry_in_message_index 3160: { 3161: my($smid) = @_; 3162: 3163: my($html,$entry); 3164: 3165: if ( defined($smid2sdt{$smid}) 3166: ## Do not include references to messages that are not in the archive. 3167: and ( $smid2sdt{$smid} != -1 ) 3168: and defined($smid2said{$smid}) 3169: and defined($said2addr{$smid2said{$smid}}) 3170: and defined($smid2nsl{$smid}) ) { 3171: $entry = $smid."\t".$said2addr{$smid2said{$smid}}."\t".$smid2sdt{$smid}."\t".$smid2nsl{$smid}."\t"; 3172: if ( ( $smid2spid{$smid} ) 3173: and ( $smid2sdt{$smid2spid{$smid}} != -1 ) 3174: and defined($smid2said{$smid2spid{$smid}}) 3175: and defined($said2addr{$smid2said{$smid2spid{$smid}}}) 3176: and defined($smid2nsl{$smid2spid{$smid}}) ) { 3177: $entry .= $smid2spid{$smid}."\n"; 3178: } 3179: else { $entry .= "no_parent\n"; } 3180: $html .= "\t\t".'<TR align="left">'; 3181: $html .= &write_message_list_id_entry($smid,0); 3182: $html .= &write_message_list_date_entry($smid,0); 3183: $html .= &write_message_list_subject_entry($smid,1); 3184: $html .= &write_message_list_sender_entry($smid,0); 3185: $html .= "\n\t\t".'</TR>'."\n"; 3186: $entry .= $html; 3187: $message_index_for_said = $message_index_file_name.'_for_'.$smid2said{$smid}.'.txt'; 3188: ## Initialize the file for the player if this is the player's first message posted. 3189: if ( not(-e $message_index_for_said) ) { 3190: open(MSGINDEX,'>'.$message_index_for_said); 3191: close(MSGINDEX); 3192: } 3193: open(MSGINDEX,'>>'.$message_index_for_said); 3194: flock(MSGINDEX, LOCK_EX); ## exclusive lock, use LOCK_SH to read from the file 3195: print MSGINDEX $entry; 3196: close(MSGINDEX); 3197: } 3198: } 3199: 3200: ## CLEAN_RAW_MESSAGE 3201: ## 3202: sub clean_raw_message 3203: { 3204: my($next_line,$next_msg,$msg_header,$msg_body,%fields_and_values,$pos_next_msg,$boundary); 3205: my($temp_filename); 3206: 3207: ## Get rid of any extras carriage returns or line feeds. Windows, UNIX, and Apple 3208: ## formats differ according to how newlines are encoded. This normalizes those 3209: ## differences. Also, some newline characters get translated into =20 or =, 3210: ## so these extra characters are discarded here. 3211: $/ = "\n"; 3212: open(RAW,$raw_messages_file) || die "Can't open messages file\, $raw_messages_file\, $! \n"; 3213: my $random_number = int(rand(10000)); 3214: $temp_filename = 'temp_file_'.$random_number.'.txt'; 3215: open(TEMP,'>'.$temp_filename); 3216: while ($next_line = <RAW>) { 3217: if ( $next_line !~ /^From\s/ ) { ## Get rid of any leading "From " that is not an actual header. 3218: chomp($next_line); 3219: for ($next_line) { 3220: s/^(.*)\s+$/$1/; 3221: s/^(.*)\=20$/$1/; 3222: s/^(.*)\=$/$1/; 3223: } 3224: print TEMP $next_line."\n"; 3225: } 3226: } 3227: close(RAW); 3228: close(TEMP); 3229: $/ = $end_of_message_marker; 3230: open(TEMP,$temp_filename); 3231: ## Now strip out any HTML formatting in the messages and simplify the headers. 3232: open(ARCHIVEFILE,'>>'.$archive_file_name) || die "Can't open file for output, $archive_file_name: $!\n"; 3233: while ($next_msg = <TEMP>) { 3234: undef %fields_and_values; 3235: %fields_and_values = &extract_fields_and_values_from_message($next_msg); 3236: ## separate message header and body 3237: if ( $next_msg =~ m/\n\n/g ) { 3238: $pos_next_msg = pos($next_msg); 3239: $msg_header = substr($next_msg,0,$pos_next_msg); 3240: $msg_body = substr($next_msg,$pos_next_msg,length($next_msg)-$pos_next_msg); 3241: $next_msg = 'newsgroups: '.$news_group."\n"; 3242: foreach $field (sort keys %fields_and_values) { 3243: $next_msg .= $field.' '.$fields_and_values{$field}."\n"; 3244: } 3245: ## If the message has been sent in plain text and HTML, strip out the HTML. 3246: if ( (exists $fields_and_values{'content-type:'}) 3247: and ( $fields_and_values{'content-type:'} =~ /multipart/i ) ) { 3248: $boundary = &get_boundary_from_content_type($fields_and_values{'content-type:'}); 3249: $next_msg .= "\n".&remove_html_from_message_body($msg_body,$boundary); 3250: } 3251: else { $next_msg .= "\n".$msg_body; } 3252: } 3253: print ARCHIVEFILE $next_msg; 3254: } 3255: $/ = "\n"; 3256: close(ARCHIVEFILE); 3257: close(TEMP); 3258: unlink($temp_filename); 3259: } 3260: 3261: 3262: ## GET_BOUNDARY_FROM_CONTENT_TYPE 3263: ## 3264: sub get_boundary_from_content_type 3265: { 3266: my($content_type) = @_; 3267: 3268: if ( $content_type =~ /boundary\=\"([^\"]+)\"/ ) { return quotemeta($1); } 3269: } 3270: 3271: 3272: ## REMOVE_HTML_FROM_MESSAGE_BODY 3273: ## 3274: ## It might be better to do this with the MIME::Parser module, especially 3275: ## if the message attachments are to be saved and analyzed, but the 3276: ## function below simply looks for the part of the message that is encoded 3277: ## as plain/text and discards all of the other parts of the message. 3278: ## 3279: sub remove_html_from_message_body 3280: { 3281: my($msg_body,$top_level_boundary) = @_; 3282: 3283: my($inputrecordseparator,@msg_body,$boundary,$save_lines_p); 3284: my($msg_body_part,@lines,$line); 3285: 3286: $inputrecordseparator = $/; 3287: $/ = "\n"; 3288: @msg_body = &split_message_body_at_boundary($top_level_boundary,$msg_body); 3289: $boundary = '__this_is_a_highly_unlikely_mime_message_boundary__'; 3290: $save_lines_p = 0; 3291: foreach $msg_body_part (@msg_body) { 3292: @msg_body_part = split /\n/,$msg_body_part; 3293: undef @lines; 3294: foreach $line (@msg_body_part) { 3295: if ( not($save_lines_p) ) { 3296: if ( $line =~ /boundary\=\"([^\"]+)\"/ ) { $boundary = quotemeta($1); } 3297: elsif ( $line =~ /^content-type\:\s*text\/plain/mi ) { $save_lines_p = 1; } 3298: } 3299: else { 3300: if ( $line =~ /$boundary/ ) { last; } 3301: elsif ( ( $line !~ /charset\=/i ) 3302: and ( $line !~ /content\-transfer\-encoding/i ) ) { 3303: push(@lines,$line); 3304: } 3305: } 3306: } 3307: if ( @lines ) { 3308: $plain_text = join "\n",@lines; 3309: last; 3310: } 3311: } 3312: $/ = $inputrecordseparator; 3313: return($plain_text."\n".$end_of_message_marker); 3314: } 3315: 3316: 3317: ## SPLIT_MESSAGE_BODY_AT_BOUNDARY 3318: ## 3319: sub split_message_body_at_boundary 3320: { 3321: my($boundary,$msg_body) = @_; 3322: my(@msg_body,$line,@msg_parts,$msg_part); 3323: 3324: $msg_part = ''; 3325: @msg_body = split /\n/,$msg_body; 3326: foreach $line (@msg_body) { 3327: if ( $line =~ /$boundary/ ) { 3328: push(@msg_parts,$msg_part); 3329: $msg_part = ''; 3330: } 3331: else { $msg_part .= "\n".$line; } 3332: } 3333: if ( $msg_part ) { push(@msg_parts,$msg_part); } 3334: return @msg_parts; 3335: } 3336: 3337: 3338: ## PRINT_TO_LOG 3339: ## 3340: sub print_to_log 3341: { 3342: my($status_message_string) = @_; 3343: 3344: my($time); 3345: 3346: open(LOG,'>>'.$log_file); 3347: $time = localtime; 3348: print LOG $time.' >> '; 3349: print LOG $status_message_string."\n"; 3350: close(LOG); 3351: print STDOUT $time.' >> '; 3352: print STDOUT $status_message_string."\n"; 3353: } 3354: 3355: 3356: ## CHECK_YAHOO_MAIL 3357: ## 3358: sub check_yahoo_mail 3359: { 3360: my(@ids,$id,$msg,$msg_header); 3361: 3362: my($yahoo_client); 3363: 3364: $new_messages_downloaded = 0; 3365: &print_to_log("Logging in..."); 3366: eval { 3367: $yahoo_client = Mail::Client::Yahoo->login(username => $yahoo_uid, 3368: password => $yahoo_password, 3369: secure => 0 3370: ) 3371: }; 3372: if ( $@ ) { 3373: &print_to_log("ERROR: Failed login..."); 3374: sleep($pause); 3375: return; 3376: } 3377: &print_to_log("Checking Inbox..."); 3378: eval { $yahoo_client->select_folder('Inbox') }; 3379: if ( $@ ) { 3380: &print_to_log("ERROR: Couldn't find Inbox..."); 3381: eval { $yahoo_client->logout() }; 3382: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3383: sleep($pause); 3384: return; 3385: } 3386: eval { @ids = $yahoo_client->message_list() }; 3387: if ( $@ ) { 3388: &print_to_log("ERROR: Couldn't find message ids in Inbox..."); 3389: eval { $yahoo_client->logout() }; 3390: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3391: sleep($pause); 3392: return; 3393: } 3394: foreach $id (@ids) { 3395: if ( defined($yahooid{$id}) ) { next; } 3396: else { 3397: $yahooid{$id} = 1; 3398: eval { $msg = $yahoo_client->message($id) }; 3399: if ( $@ ) { 3400: &print_to_log("ERROR: Couldn't get message $id..."); 3401: eval { $yahoo_client->logout() }; 3402: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3403: sleep($pause); 3404: return; 3405: } 3406: $msg_header = $msg->head(); 3407: ## Messages from $news_group are selected by examining the 3408: ## subject lines. 3409: if ( $msg_header->get('Subject') =~ /$news_group/i ) { 3410: open(RAW,'>'.$raw_messages_file); 3411: $msg->print(\*RAW); 3412: print RAW "\n".$end_of_message_marker."\n"; 3413: close(RAW); 3414: &clean_raw_message(); 3415: $new_messages_downloaded++; 3416: &print_to_log("Downloaded $id"); 3417: eval { $yahoo_client->move_message($id,$yahoo_outbox) }; 3418: if ( $@ ) { 3419: &print_to_log("ERROR: Unable to move message out of Inbox into $yahoo_outbox..."); 3420: sleep($pause); 3421: return; 3422: } 3423: } 3424: } 3425: } 3426: eval { $yahoo_client->logout() }; 3427: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3428: if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); } 3429: sleep($pause); 3430: } 3431: 3432: 3433: 3434: ## CHECK_NEWSGROUP 3435: ## 3436: sub check_newsgroup 3437: { 3438: my($since) = @_; 3439: 3440: my($now,$msgid,$message_ids_ptr,$since_date,$article_lines_ptr,$article); 3441: 3442: $new_messages_downloaded = 0; 3443: $now = $since; # or, should this be, time() ? this seems to depend on the NNTP server and the machine 3444: ## Get a pointer to a list of message ids posted since server was last checked ($since). 3445: $since_date = &ParseDateString("epoch $since"); 3446: $since_date = &UnixDate($since_date,"%d %b %y at %H:%M"); 3447: &print_to_log("Accessing ".$nntp_server." to download ids of messages posted to ".$news_group." since ".$since_date." GMT"); 3448: eval { $message_ids_ptr = $nntp->newnews($since,$news_group) }; 3449: if ( $@ ) { 3450: &print_to_log("ERROR: Couldn't get message ids from NNTP server..."); 3451: sleep($pause); 3452: return($since); 3453: } 3454: foreach $msgid (@{$message_ids_ptr}) { 3455: chomp($msgid); 3456: if ( exists($nntpid{$msgid}) ) { next; } 3457: $nntpid{$msgid} = 1; 3458: eval { $article_lines_ptr = $nntp->article($msgid) }; 3459: if ( $@ ) { 3460: &print_to_log("ERROR: Couldn't get message $msgid from NNTP server..."); 3461: sleep($pause); 3462: return($since); 3463: } 3464: $article = join '',@{$article_lines_ptr}; 3465: open(RAW,'>'.$raw_messages_file); 3466: print RAW $article."\n".$end_of_message_marker."\n"; 3467: close(RAW); 3468: &clean_raw_message(); 3469: $new_messages_downloaded++; 3470: &print_to_log("Downloaded $msgid"); 3471: } 3472: if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); } 3473: sleep($pause); 3474: return($now); 3475: } 3476: 3477: 3478: ## CHECK_IMAP_MAIL 3479: ## 3480: ## This is not debugged. 3481: ## 3482: sub check_imap_mail 3483: { 3484: my(@ids,$id,$msg,$msg_header); 3485: 3486: my($imap_client,$imapid,$number_of_messages,$lines_ref,$lines,$line,$from_mailing_list_p); 3487: 3488: $new_messages_downloaded = 0; 3489: &print_to_log("Connecting to IMAP server $imap_server..."); 3490: eval { $imap_client = new Net::IMAP::Simple::SSL($imap_server) }; ## frequently hangs; cause: unknown 3491: if ( $@ ) { 3492: &print_to_log("ERROR: Failed to connect to server..."); 3493: sleep($pause); 3494: return; 3495: } 3496: &print_to_log("Logging in..."); 3497: eval { $imap_client->login( $imap_uid => $imap_password ) }; 3498: if ( $@ ) { 3499: &print_to_log("ERROR: Failed log in..."); 3500: sleep($pause); 3501: return; 3502: } 3503: &print_to_log("Checking Inbox..."); 3504: eval { $number_of_messages = $imap_client->select( 'Inbox' ) }; 3505: if ( $@ ) { 3506: &print_to_log("ERROR: Couldn't find Inbox..."); 3507: eval { $imap_client->quit() }; 3508: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3509: sleep($pause); 3510: return; 3511: } 3512: foreach $imapid ( 1..$number_of_messages ) { 3513: if ( not($imap_client->seen($imapid)) ) { 3514: # get the message, returned as a reference to an array of lines 3515: $lines_ref = $imap_client->get($imapid); 3516: $from_mailing_list_p = 0; 3517: foreach $line (@{$lines_ref}) { 3518: if ( $line =~ /^subject:\s+$news_group/i ) { 3519: $from_mailing_list_p = 1; 3520: last; 3521: } 3522: } 3523: if ( $from_mailing_list_p ) { 3524: $lines = join '',@{$lines_ref}; 3525: open(RAW,'>'.$raw_messages_file); 3526: print RAW $lines; 3527: print RAW "\n".$end_of_message_marker."\n"; 3528: close(RAW); 3529: &clean_raw_message(); 3530: $new_messages_downloaded++; 3531: &print_to_log("Downloaded $imapid"); 3532: eval { $imap_client->copy($imapid,$imap_outbox) }; 3533: if ( $@ ) { 3534: &print_to_log("ERROR: Unable to move message out of Inbox into $imap_outbox..."); 3535: eval { $imap_client->quit() }; 3536: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3537: sleep($pause); 3538: return; 3539: } 3540: eval { $imap_client->delete($imapid) }; 3541: if ( $@ ) { 3542: &print_to_log("ERROR: Unable to delete message from Inbox after moving it to $imap_outbox..."); 3543: eval { $imap_client->quit() }; 3544: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3545: sleep($pause); 3546: return; 3547: } 3548: } 3549: } 3550: } 3551: eval { $imap_client->quit() }; 3552: if ( $@ ) { &print_to_log("ERROR: Failed logout..."); } 3553: if ( $new_messages_downloaded == 0 ) { &print_to_log("No new messages found."); } 3554: sleep($pause); 3555: return; 3556: sleep($pause); 3557: } 3558: 3559: 3560: ## YESTERDAY_IN_SECONDS 3561: ## 3562: sub yesterday_in_seconds 3563: { 3564: my($yesterday,@yesterday_as_strings,@yesterday); 3565: my($elem,$sec,$min,$hour,$mday,$mon,$year,$time); 3566: 3567: ## Use ParseDate to find the date and time of yesterday. 3568: $yesterday = ParseDate('24 hours ago'); 3569: ## Simplify and format the date and time into a list of strings. 3570: @yesterday_as_strings = UnixDate($yesterday,"%S","%M","%k","%e","%m","%y"); 3571: ## Convert the strings into integers. 3572: @yesterday = (); 3573: foreach $elem (@yesterday_as_strings) { 3574: $elem += 0; 3575: push(@yesterday,$elem); 3576: } 3577: ($sec,$min,$hour,$mday,$mon,$year) = @yesterday; 3578: ## Decrement the month ($mon) to accord with the gmtime format 3579: ## (wherein the month runs from 0..11). 3580: $mon--; 3581: ## Calculate the number of seconds between yesterday and 1970. 3582: $time = timegm($sec,$min,$hour,$mday,$mon,$year); 3583: return($time); 3584: } 3585: 3586: 3587: ## FETCH_NEW_MESSAGES_FROM_SERVER 3588: ## 3589: ## This function simply calls a more specific function. 3590: ## It is a placeholder for a function that periodically 3591: ## fetches messages off of a server. The function called 3592: ## here must address the following issues: 3593: ## 3594: ## (1) Timing: Some form of wait function should be built into the 3595: ## function so that it is not constantly pinging the 3596: ## server for new messages but, rather, polls from the 3597: ## server only once in a while. The time period to 3598: ## wait will depend on the server. 3599: ## 3600: ## (2) Message format: The fetching function must write the 3601: ## fetched messages into the file named $archive_file_name. 3602: ## These messages should be appended into that file so that 3603: ## more messages can be downloaded even if the system is not 3604: ## done processing the current messages. The messages should 3605: ## be rendered in plain text (i.e., for example, HTML should 3606: ## be stripped out of the messages). The messages should be 3607: ## formatted according to RFC-1036 conventions. One field of 3608: ## the messages must be 'newsgroups:' and must be filled 3609: ## with the name of the newsgroup specified in the agonistics 3610: ## configuration file. Finally, after the end of the 3611: ## text of each message, on a separate line, the 3612: ## $end_of_message_marker must be printed. 3613: ## 3614: ## (3) Log: It is preferrable, but not necessary, for the fetching 3615: ## function to log its errors (and successes) in the $log_file 3616: ## using the &print_to_log function. 3617: ## 3618: ## See &check_yahoo_mail and the associated functions 3619: ## (e.g., &clean_raw_message) for an example implementation of 3620: ## these criteria. 3621: ## 3622: sub fetch_new_messages_from_server 3623: { 3624: ## Download messages from a Yahoo email account; or, 3625: if ( $yahoo_uid and $yahoo_password ) { &check_yahoo_mail(); } 3626: ## download messages from a Usenet newsgroup server; or, 3627: elsif ( $nntp_server ) { 3628: ## If no messages have been downloaded yet, download all message posted since yesterday. 3629: if ( $time_of_last_nntp_server_check == 0 ) { 3630: $time_of_last_nntp_server_check = &yesterday_in_seconds(); 3631: } 3632: $time_of_last_nntp_server_check = &check_newsgroup($time_of_last_nntp_server_check); 3633: } 3634: ## download messages from an IMAP email server. 3635: elsif ( $imap_server and $imap_uid and $imap_password ) { &check_imap_mail(); } 3636: else { die "No Yahoo, NNTP or IMAP server was specified for the interactive downloading of messages."; } 3637: } 3638: 3639: 3640: ## REASSEMBLE_MESSAGE 3641: ## 3642: sub reassemble_message 3643: { 3644: my($smid) = @_; 3645: 3646: $message_header = ''; 3647: if ( $smid2newsgroups{$smid} ) { $message_header = 'newsgroups: '.$smid2newsgroups{$smid}."\n"; } 3648: if ( $smid2date{$smid} ) { $message_header .= 'date: '.$smid2date{$smid}."\n"; } 3649: if ( $smid2mid{$smid} ) { 3650: $message_header .= 'message-id: '.$smid2mid{$smid}."\n"; 3651: if ( $mid2pid{$smid2mid{$smid}} ) { $message_header .= 'references: '.$mid2pid{$smid2mid{$smid}}."\n"; } 3652: } 3653: if ( $smid2said{$smid} and $said2ana{$smid2said{$smid}} ) { $message_header .= 'from: '.$said2ana{$smid2said{$smid}}."\n"; } 3654: if ( $smid2subject{$smid} ) { $message_header .= 'subject: '.$smid2subject{$smid}."\n"; } 3655: $message_body = ''; 3656: $ln = 0; 3657: while ( defined($smidln2line{"$smid $ln"}) ) { 3658: $message_body .= $smidln2line{"$smid $ln"}."\n"; 3659: $ln++; 3660: } 3661: return($message_header."\n".$message_body); 3662: } 3663: 3664: 3665: ## REORGANIZE_ARCHIVE 3666: ## 3667: sub reorganize_archive 3668: { 3669: my(@smids,$mid,$last_batch_archive_file_name); 3670: 3671: ## sort the messages by date 3672: @smids = (); 3673: foreach $mid (keys %analyzed_p) { push(@smids,$mid2smid{$mid}); } 3674: @smids = sort { $smid2sdt{$a} <=> $smid2sdt{$b} } @smids; 3675: ## store the last batch of messages 3676: $last_batch_archive_file_name = $archive_file_name; 3677: $last_batch_archive_file_name =~ s/\./\_$batch_of_messages\./; 3678: if ( $last_batch_archive_file_name eq $archive_file_name ) { die "archive file name not renamed!"; } 3679: ## Save the archive into a new file and delete the existing archive file 3680: copy($archive_file_name,$last_batch_archive_file_name); 3681: unlink($archive_file_name); 3682: ## print the last ( $max_frames / 2 ) into the (now empty) archive; and, 3683: ## update %analyzed_p so that it only contains message ids from messages not in the new archive 3684: open(ARCHIVEFILE,'>'.$archive_file_name); 3685: $first_smid_index = / 2; 3686: foreach $smid (@smids[$first_smid_index..]) { 3687: $message = &reassemble_message($smid); 3688: print ARCHIVEFILE $message.$end_of_message_marker."\n"; 3689: } 3690: close(ARCHIVEFILE); 3691: } 3692: 3693: 3694: ## CREATE_BLANK_PAGES 3695: ## 3696: sub create_blank_pages 3697: { 3698: my(@interface_files,$fn,$n,$html); 3699: 3700: opendir(ID,$interface_directory); 3701: @interface_files = readdir(ID); 3702: closedir(ID); 3703: $html = '<HTML> 3704: <HEAD> 3705: <META http-equiv="Refresh" content="1; URL=agon2.html"> 3706: <TITLE>agonistics</TITLE> 3707: <LINK type="text/css" rel="stylesheet" href="interface_style.css"> 3708: </HEAD> 3709: <BODY> 3710: </BODY> 3711: </HTML>'; 3712: open(AGON,'>'.$interface_directory.$slash.'agon1.html'); 3713: print AGON $html; 3714: close(AGON); 3715: $html =~ s/agon2/agon1/; 3716: foreach $fn (@interface_files) { 3717: if ( $fn =~ /^agon([0-9]+)\.html/ ) { 3718: $n = $1; 3719: if ( $n != 1 ) { 3720: open(AGON,'>'.$interface_directory.$slash.'agon'.$n.'.html'); 3721: print AGON $html; 3722: close(AGON); 3723: } 3724: } 3725: } 3726: return; 3727: } 3728: 3729: 3730: ## INTERACTIVE_ANALYSIS 3731: ## 3732: ## Messages are periodically pulled from a server and analyzed. 3733: ## 3734: sub interactive_analysis 3735: { 3736: my($l,$small_pause); 3737: 3738: while (1) { 3739: if ( $new_messages_downloaded <= 0 ) { &fetch_new_messages_from_server(); } 3740: if ( $new_messages_downloaded > 0 ) { 3741: $frame_number++; 3742: $file_pointer_in_messages_archive = &process_next_message($file_pointer_in_messages_archive); 3743: $new_messages_downloaded--; 3744: } 3745: # write another version of the last page and have it point back to the 3746: # current page. 3747: if (-e $interface_directory.$slash.'agon'.$frame_number.'.html') { 3748: open(PREVIOUSAGON,$interface_directory.$slash.'agon'.$frame_number.'.html'); 3749: @current_page_contents = <PREVIOUSAGON>; 3750: close(PREVIOUSAGON); 3751: open(NEXTAGON,'>'.$interface_directory.$slash.'agon'.($frame_number + 1).'.html'); 3752: foreach $l (@current_page_contents) { 3753: if ( $l =~ /content\=\"$pause_between_frames\;\s+URL\=agon([0-9]+)/ ) { 3754: $l =~ s/agon[0-9]+/agon$frame_number/i; 3755: $small_pause = int($pause/2); 3756: $l =~ s/content\=\"$pause_between_frames\;/content\=\"$small_pause\;/i; 3757: } 3758: print NEXTAGON $l; 3759: } 3760: close(NEXTAGON); 3761: } 3762: if ( $frame_number > $max_frames ) { last; } 3763: } 3764: ## If max frames reached, restart the analysis 3765: &analyze_messages(); 3766: } 3767: 3768: 3769: ## STATIC_ANALYSIS 3770: ## 3771: ## Messages are analyzed from a static archive of messages. 3772: ## 3773: sub static_analysis 3774: { 3775: my($l,$small_pause,@interface_files,$fn,$n); 3776: 3777: while ($file_pointer_in_messages_archive < (-s $archive_file_name) ) { 3778: $frame_number++; 3779: $file_pointer_in_messages_archive = &process_next_message($file_pointer_in_messages_archive); 3780: } 3781: # write another version of the last page and have it point back to the 3782: # current page. 3783: if (-e $interface_directory.$slash.'agon'.$frame_number.'.html') { 3784: open(PREVIOUSAGON,$interface_directory.$slash.'agon'.$frame_number.'.html'); 3785: @current_page_contents = <PREVIOUSAGON>; 3786: close(PREVIOUSAGON); 3787: open(NEXTAGON,'>'.$interface_directory.$slash.'agon'.($frame_number + 1).'.html'); 3788: foreach $l (@current_page_contents) { 3789: if ( $l =~ /content\=\"$pause_between_frames\;\s+URL\=agon([0-9]+)/ ) { 3790: $l =~ s/agon[0-9]+/agon$frame_number/i; 3791: $small_pause = int($pause/2); 3792: $l =~ s/content\=\"$pause_between_frames\;/content\=\"$small_pause\;/i; 3793: } 3794: print NEXTAGON $l; 3795: } 3796: close(NEXTAGON); 3797: } 3798: ## Erase any blank interface pages still remaining. 3799: opendir(ID,$interface_directory); 3800: @interface_files = readdir(ID); 3801: closedir(ID); 3802: foreach $fn (@interface_files) { 3803: if ( $fn =~ /^agon([0-9]+)\.html/ ) { 3804: $n = $1; 3805: if ( $n > ($frame_number + 1) ) { 3806: unlink($interface_directory.$slash.'agon'.$n.'.html'); 3807: } 3808: } 3809: } 3810: ## If max frames reached, restart the analysis 3811: if ( $frame_number > $max_frames ) { &analyze_messages(); } 3812: } 3813: 3814: 3815: ## ANALYZE_MESSAGES 3816: ## 3817: sub analyze_messages 3818: { 3819: &initialize_global_variables(); 3820: $batch_of_messages++; 3821: ## Make sure that clients viewing interface pages are moved to a blank page 3822: ## before any pages are deleted. 3823: sleep($pause_between_frames+2); 3824: ## If the archive does not yet exist, create it. 3825: if ( not(-e $archive_file_name) ) { 3826: open(ARCHIVEFILE,'>'.$archive_file_name); 3827: close(ARCHIVEFILE); 3828: } 3829: $file_pointer_in_messages_archive = 0; 3830: ## case 1: The messages to be analyzed are to be fetched periodically from 3831: ## a server. This is the interactive mode. 3832: ## case 2: Messages are analyzed from a static archive of messages. 3833: ## case 3: There is already an archive messages to be analyzed and then -- 3834: ## after the archive is analyzed -- messages will be pulled from the server and 3835: ## analyzed interactively. 3836: if ( (-e $archive_file_name) and ( $is_interactive_p eq 'YES' ) ) { 3837: &static_analysis(); 3838: &interactive_analysis(); 3839: } 3840: elsif ( $is_interactive_p eq 'YES' ) { &interactive_analysis(); } 3841: else { &static_analysis(); } 3842: } 3843: 3844: 3845: &analyze_messages(); |