|
agonistics: a language game w a r r e n s a c k <wsack@media.mit.edu> In the spirit of two exhibitions curated by Christiane Paul,
CODeDOC 2: 3: ######################################################################### 4: ## 5: ## RUN_AGONISTICS 6: ## 7: ## Warren Sack (wsack@media.mit.edu) 8: ## 9: ## February 2005 10: ## 11: ## Usage: perl run_agonistics.pl <configuration file> 12: ## 13: ## Preconditions: 13: ## (1) The configuration file needs to exist and contain 14: ## a correct set of values for all of the necessary 15: ## parameters. 16: ## 17: ## (2) The sub-directory "Resources" needs to exist within 18: ## the directory that contains this script. The Resources 19: ## sub-directory contains a number of images and CGI scripts. 20: ## 21: ## (3) If this script is going to be run by polling new messages 22: ## periodically from a server, a means to connect and 23: ## download those messages needs to be arranged before the 24: ## script is run. See the example configuration files 25: ## *.conf for details concerning, for example, the use of 26: ## a Yahoo email account as a means to poll and archive 27: ## new messages from a mailing list. 28: ## 29: ######################################################################### 30: ## 31: ## Copyright (c) 2005 by Warren Sack 32: ## 33: ## This work is copyrighted with a Creative Commons 34: ## (http://creativecommons.org) Attribution-NonCommercial-NoDerivs 2.0 35: ## License. 36: ## 37: ## Here is a short summary of the license: 38: ## 39: ## You are free to copy, distribute, display, and perform the work 40: ## under the following conditions: 41: ## 42: ## Attribution: You must give the original author (Warren Sack) credit. 43: ## 44: ## Noncommercial: You may not use this work for commercial purposes. 45: ## 46: ## No Derivative Works: You may not alter, transform, or build upon this work. 47: ## 48: ## * For any reuse or distribution, you must make clear to others the 49: ## license terms of this work. 50: ## * Any of these conditions can be waived if you get permission from 51: ## the copyright holder (Warren Sack). 
52: ## * Your fair use and other rights are in no way affected by the above. 53: ## * The details and full text of the license can be found at this URL: 54: ## http://creativecommons.org/licenses/by-nc-nd/2.0/legalcode 55: ## 56: ######################################################################### 57: 58: if ( $#ARGV < 0 ) { 59: die "Usage: run_agonistics.pl <configuration file>\n". 60: "This script requires one argument.\n"; 61: } 62: elsif ( not(-e $ARGV[0]) ) { 63: die "Usage: run_agonistics.pl <configuration file>\n". 64: "This script requires a configuration file. The given configuration file cannot be found.\n"; 65: } 66: 67: 68: use utf8; 69: use Unicode::Normalize; 70: use HTML::Entities(); 71: use File::Temp qw(tempfile); 72: use File::Path; 73: use Date::Manip qw(ParseDate ParseDateString UnixDate); 74: use Time::Local; 75: use File::Copy; 76: use MIME::WordDecoder; 77: use Lingua::Stem; 78: use Fcntl qw(:flock); 79: use Crypt::SSLeay; 80: use Mail::Client::Yahoo; 81: use Digest::MD5; 82: use Net::NNTP; 83: use Net::SSLeay; 84: use IO::Socket::SSL; 85: use Net::IMAP::Simple::SSL; 86: 87: $batch_of_messages = 0; 88: 89: ## READ_CONFIG_FILE 90: ## 91: sub read_config_file 92: { 93: my($key,$value); 94: 95: open(CONFIG,$ARGV[0]) || die "Can't find configuration file: $ARGV[0]\n"; 96: while(<CONFIG>) { 97: chomp; 98: if ( /^\#/ ) { next; } 99: if ( /^\s*$/ ) { next; } 100: ($key,$value) = $_ =~ /^(\S+)\s+(\S+)/; 101: $agonistics_config{$key} = $value; 102: } 103: close(CONFIG); 104: } 105: 106: 107: ## INITIALIZE_GLOBAL_VARIABLES 108: ## 109: ## Input: None 110: ## 111: ## Effects: Global variables associated with this package are initialized. 112: ## 113: ## Output: None 114: ## 115: sub initialize_global_variables 116: { 117: ## initializations done only before the first batch of messages is processed 118: if ( $batch_of_messages == 0 ) { 119: ## Open and read the config file into a hash. 
120: %agonistics_config = (); 121: &read_config_file(); 122: ## Before setting slash, the machine's OS is checked. 123: if ( $ENV{'OSTYPE'} =~ /^win/i ) { $slash = '\\'; } 124: else { $slash = '/'; } 125: $news_group = $agonistics_config{'Newsgroup'}; 126: $archive_file_name = $agonistics_config{'FileNameOfArchive'}; 127: $language_locale = $agonistics_config{'LanguageTag'}; 128: if ( defined($agonistics_config{'MaxFrames'}) ) { 129: $max_frames = $agonistics_config{'MaxFrames'}; 130: } 131: else { $max_frames = 1000; } 132: if ( defined($agonistics_config{'PauseBetweenFrames'}) ) { 133: $pause_between_frames = $agonistics_config{'PauseBetweenFrames'}; 134: } 135: else { $pause_between_frames = 7; } 136: ## Load the correct end-of-sentence tagger. English and French 137: ## texts both use the English tagger. German texts use a 138: ## different tagger. 139: if ($language_locale eq 'DE') { 140: require Lingua::DE::Sentence; 141: Lingua::DE::Sentence->import( qw(get_sentences) ); 142: } 143: else { 144: require Lingua::EN::Sentence; 145: Lingua::EN::Sentence->import( qw(get_sentences) ); 146: } 147: $archive_name = $agonistics_config{'DirectoryForOutput'}; 148: $recency = $agonistics_config{'Recency'}; 149: $documents_url = $agonistics_config{'DocumentsURL'}; 150: $cgi_url = $agonistics_config{'CGIURL'}; 151: $web_server_directory = $agonistics_config{'WebServerDirectory'}; 152: $web_server_cgi_directory = $agonistics_config{'WebServerCGIDirectory'}; 153: $is_interactive_p = $agonistics_config{'InteractiveMode'}; 154: if ( $is_interactive_p =~ /n/i ) { $is_interactive_p = 'NO'; } 155: else { $is_interactive_p = 'YES'; } 156: ## Record the address to be used for posting messages to the list analyzed. 157: $mailing_list_address = $agonistics_config{'MailingListAddress'}; 158: ## Note information about the Yahoo mail account, if it is to be used. 
159: $yahoo_uid = $agonistics_config{'YahooUID'}; 160: $yahoo_password = $agonistics_config{'YahooPassword'}; 161: $yahoo_outbox = $agonistics_config{'YahooOutbox'}; 162: ## Note information about the IMAP mail server and account, if it is to be used. 163: $imap_server = $agonistics_config{'IMAPServer'}; 164: $imap_uid = $agonistics_config{'IMAPUID'}; 165: $imap_password = $agonistics_config{'IMAPPassword'}; 166: $imap_outbox = $agonistics_config{'IMAPOutbox'}; 167: ## Note information about the NNTP account and server, if it is to be used. 168: $nntp_server = $agonistics_config{'NNTPServer'}; 169: $nntp_uid = $agonistics_config{'NNTPUID'}; 170: $nntp_password = $agonistics_config{'NNTPPassword'}; 171: if ( ( $nntp_server and ( $yahoo_uid or $imap_server ) ) 172: or ( $yahoo_uid and ( $nntp_server or $imap_server ) ) 173: or ( $imap_server and ( $nntp_server or $yahoo_uid ) ) ) { 174: die "Only one of the following may be defined: (a) NNTP server; (b) Yahoo UID; (c) IMAP server". 175: "\nTwo of the three need to be commented out in the configuration file $ARGV[0]\n"; 176: } 177: ## How many seconds should the script wait between tries to download 178: ## messages from the server? 
179: $pause = $agonistics_config{'PauseBetweenFetches'}; 180: $output_directory = $web_server_directory.$slash.'Agonistics'.$slash.$archive_name; 181: $cgi_directory = $web_server_cgi_directory.$slash.'Agonistics'; 182: ## create the directory to house the CGI scripts 183: mkdir($cgi_directory); 184: ## copy the CGI scripts into the CGI directory 185: my $cgi_script_file; 186: my $send_script_file_found_p = 0; 187: opendir CGISCRIPTS, 'Resources'.$slash.'CGIScripts' or die "Cannot open CGIScripts directory: $!"; 188: foreach $cgi_script_file (readdir CGISCRIPTS) { 189: if ( $cgi_script_file eq 'send_message.pl' ) { 190: &rewrite_send_script_file('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file); 191: $send_script_file_found_p = 1; 192: } 193: else { copy('Resources'.$slash.'CGIScripts'.$slash.$cgi_script_file,$cgi_directory.$slash.$cgi_script_file); } 194: chmod(0777,$cgi_directory.$slash.$cgi_script_file); 195: } 196: closedir CGISCRIPTS; 197: if ( $send_script_file_found_p == 0 ) { die "Can't find send_message.pl CGI script\n"; } 198: ## Alternatively load the German or English+French end-of-sentence 199: ## tagger depending upon the language specified on the command line. 
200: if ( $language_locale =~ /^DE/ ) { 201: require Lingua::DE::Sentence; 202: Lingua::DE::Sentence->import( qw(get_sentences) ); 203: } 204: else { 205: require Lingua::EN::Sentence; 206: Lingua::EN::Sentence->import( qw(get_sentences) ); 207: } 208: $stemmer = Lingua::Stem->new({-locale => $language_locale}); 209: $stemmer->stem_caching({ -level => 2 }); 210: $log_file = $agonistics_config{'FileNameOfLog'}; 211: open(LOG,'>'.$log_file); 212: close(LOG); 213: $radius_of_circle = 100000; 214: my $random_number = int(rand(10000)); 215: $raw_messages_file = 'raw_messages_file_'.$random_number.'.txt'; 216: $end_of_message_marker = '__end_of_message_marker__'; 217: ## English stop words 218: @english_stop_words = ("a", "about", "above", "according", "across", "actually", "adj", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren", "around", "as", "at", "b", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "behind", "being", "below", "beside", "besides", "between", "beyond", "billion", "both", "but", "by", "c", "can", "can", "cannot", "caption", "co", "could", "couldn", "d", "did", "didn", "do", "does", "doesn", "don", "down", "during", "e", "each", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "f", "few", "fifty", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "further", "g", "h", "had", "has", "hasn", "have", "haven", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "instead", "into", "is", "isn", "it", "its", "itself", "j", "k", "l", "last", 
"later", "latter", "latterly", "least", "less", "let", "like", "likely", "ll", "ltd", "m", "made", "make", "makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million", "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my", "myself", "n", "namely", "neither", "never", "nevertheless", "next", "nine", "ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "overall", "own", "p", "per", "perhaps", "q", "r", "rather", "re", "recent", "recently", "s", "same", "seem", "seemed", "seeming", "seems", "seven", "seventy", "several", "she", "should", "shouldn", "since", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "stop", "such", "t", "taking", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thirty", "this", "those", "though", "thousand", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "trillion", "twenty", "two", "u", "under", "unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using", "v", "ve", "very", "via", "w", "was", "wasn", "we", "well", "were", "weren", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "with", "within", "without", "would", "wouldn", "wrote", "x", "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "http", "www", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "com", "edu", "org", 
"just", "net", "ftp", "nntp", "http", "html"); 219: @english_present_tense_of_to_be = ( "am", "are", "is" ); 220: ## French stop words 221: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks). 222: ## This appears to be necessary since the diacritical marks are not always employed by the participants 223: ## and they are oftentimes lost between the client and server. 224: @french_stop_words = ( "a", "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "elles", "en", "et", "eux", "il", "ils", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "m\x{EA}me", "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "plus", "pour", "qu", "que", "qui", "bien", "bon", "bonne", "bonnes", "faire", "fais", "fait", "faisons", "font", "sa", "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "c", "d", "j", "l", "\x{E0}", "m", "n", "s", "t", "y", "\x{E9}t\x{E9}", "ete", "\x{E9}t\x{E9}e", "etee", "\x{E9}t\x{E9}es", "etees", "\x{E9}t\x{E9}s", "etes", "\x{E9}tant", "etant", "\x{E9}tante", "etante", "\x{E9}tants", "etants", "\x{E9}tantes", "etantes", "suis", "es", "est", "sommes", "\x{EA}tes", "etes", "sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient", "\x{E9}tais", "etais", "\x{E9}tait", "etait", "\x{E9}tions", "etions", "\x{E9}tiez", "etiez", "\x{E9}taient", "etaient", "fus", "fut", "f\x{FB}mes", "fumes", "f\x{FB}tes", "futes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "f\x{FB}t", "fut", "fussions", "fussiez", "fussent", "ayant", "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient", "eut", "e\x{FB}mes", "eumes", "e\x{FB}tes", "eutes", "eurent", "aie", 
"aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "e\x{FB}t", "eut", "eussions", "eussiez", "eussent", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "http", "message", "jamais", "fr", "sans", "non", "oui", "dire", "dis", "dit", "disez", "disons", "disent", "faut", "toujours", "que", "quel", "quelle", "quelles", "quelqu", "quelque", "quelques", "quels", "qui", "quiconque", "quoi", "quoiqu", "quoique", "\x{E7}a", "ca", "car", "ce", "ceci", "cela", "celle", "l\x{E0}", "la", "celles", "ci", "celles", "celui", "ces", "cet", "cette", "ceux", "comme", "vrai", "\x{EA}tre", "etre", "\x{E9}crit", "ecrit", "rien", "tout", "tous", "toute", "toutes", "si", "fr", "com", "edu", "org", "autres", "autre", "sauf", "vais", "vas", "va", "venons", "venez", "vont", "aller", "aimer", "aiment", "aimons", "aime", "aimes", "mal", "news", "nntp", "html", "ftp", "net", "part", "puis", "voir", "quand", "tant", "autant", "leur", "leurs", "tien", "tienne", "tiens", "tiennes", "mien", "mienne", "miens", "miennes", "soi", "aussi", "ailleurs", "moins", "alors", "passe", "pass\x{E9}", "avoir" ); 225: @french_present_tense_of_to_be = ( "suis", "es", "est", "\x{EA}tes", "etes", "sommes", "sont" ); 226: ## German stop words 227: ## Note that some words here are intentionally misspelled (i.e., spelled without diacritical marks). 228: ## This appears to be necessary since the diacritical marks are not always employed by the participants 229: ## and they are oftentimes lost between the client and server. 
230: @german_stop_words = ( "aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann", "der", "den", "des", "dem", "die", "das", "da\x{DF}", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dir", "du", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "er", "ihn", "ihm", "es", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures", "f\x{FC}r", "fur", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier", "hin", "hinter", "ich", "mich", "mir", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "im", "in", "indem", "ins", "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "k\x{F6}nnen", "konnen", "k\x{F6}nnte", "konntemachen", "man", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "sie", "ihnen", "sind", "so", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "\x{FC}ber", "uber", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "vom", "von", "vor", "w\x{E4}hrend", "wahrend", "war", "waren", "warst", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", 
"wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wollen", "wollte", "w\x{FC}rde", "wurde", "w\x{FC}rden", "wurden", "zu", "zum", "zur", "zwar", "zwischen", "dk", "com", "edu", "org", "news", "http", "ftp", "html", "com", "org", "net", "nntp"); 231: @german_present_tense_of_to_be = ( "bin", "bist", "ist", "sind", "seid" ); 232: @stop_words = (); 233: if ( $language_locale =~ /^EN/ ) { @stop_words = @english_stop_words; } 234: ## Many French and German discussions include English posts, so English stop words are used for them too. 235: elsif ( $language_locale =~ /^FR/ ) { push(@stop_words,@french_stop_words,@english_stop_words); } 236: elsif ( $language_locale =~ /^DE/ ) { push(@stop_words,@german_stop_words,@english_stop_words); } |