Loading tests/mail_to_db.pl +336 −171 Original line number Diff line number Diff line #!/usr/bin/perl #!/usr/bin/perl -w # Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB # This file is public domain and comes with NO WARRANTY of any kind # # This program is brought to you by Janne-Petteri Koilo with the # administration of Michael Widenius. # # Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith # 15.12.2000 # # This program takes your mails and puts them into your database. It ignores # messages with the same from, date and message text. # You can use mail-files that are compressed or gzipped and ends with Loading @@ -13,40 +16,92 @@ use DBI; use Getopt::Long; $VER = "1.6"; $opt_db = "mail"; $opt_table = "mails"; $| = 1; $VER = "2.0"; $opt_help = 0; $opt_version = 0; $opt_debug = 0; $opt_host = undef(); $opt_port = undef(); $opt_socket = undef(); $opt_db = undef(); $opt_table = undef(); $opt_user = undef(); $opt_password = undef(); $opt_max_mail_size = 65536; $opt_db_engine = "mysql"; $opt_host = "localhost"; $opt_user = $opt_password = ""; $opt_help = $opt_version = $opt_test=0; $opt_create = 0; $opt_test = 0; $opt_no_path = 0; $opt_stop_on_error = 0; GetOptions("help","version","user=s","password=s", "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage(); my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big, $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted); usage($VER) if ($opt_help || $opt_version || !$ARGV[0]); $mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded = $mail_duplicates = $mail_no_subject_f = $mail_inserted = 0; %months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Des' => 12); 'Nov' => 11, 'Dec' => 12); $progname = $0; $progname =~ s/.*[\/]//; main(); #### #### main sub routine #### sub main { my ($connect_arg, @args, $ignored, @defops, $i); if (defined(my_which("my_print_defaults"))) { @defops = `my_print_defaults mail_to_db`; chop @defops; splice @ARGV, 0, 0, @defops; } else { print "WARNING: No command 'my_print_defaults' found; unable to read\n"; print "the my.cnf file. This command is available from the latest MySQL\n"; print "distribution.\n"; } GetOptions("help","version","host=s","port=i","socket=s","db=s","table=s", "user=s","password=s","max_mail_size=i","create","test", "no_path","debug","stop_on_error") || die "Wrong option! See $progname --help\n"; $count_no_from = $count_no_txt = $count_too_big = 0; $count_forwarded_msgs = $count_duplicates = $no_subject = 0; $inserted_mails = 0; $dbh=0; usage($VER) if ($opt_help || $opt_version || (!$ARGV[0] && !$opt_create)); $dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user, $opt_password,{ PrintError => 0}) || die $DBI::errstr; if (!$opt_test) # Check that the given inbox files exist and are regular files for ($i = 0; defined($ARGV[$i]); $i++) { create_table_if_needed($dbh); die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]); } $connect_arg = "DBI:mysql:"; push @args, "database=$opt_db" if defined($opt_db); push @args, "host=$opt_host" if defined($opt_host); push @args, "port=$opt_port" if defined($opt_port); push @args, "mysql_socket=$opt_socket" if defined($opt_socket); push @args, "mysql_read_default_group=mail_to_db"; $connect_arg .= join ';', @args; $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password) || die "Couldn't connect: $DBI::errstr\n"; die "You must specify the database; use --db=" if (!defined($opt_db)); die "You must specify the table; use --table=" if (!defined($opt_table)); create_table($dbh) if ($opt_create); foreach (@ARGV) { if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped # Check if the file is compressed if (/^(.*)\.(gz|Z)$/) { open(FILE, "zcat $_ |"); process_mail_file($dbh, $1); Loading @@ -59,81 +114,66 @@ foreach (@ARGV) } $dbh->disconnect if (!$opt_test); $ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject; print "Mails inserted:\t\t\t$inserted_mails\n"; $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f + $mail_too_big + $mail_duplicates); print "Mails inserted:\t\t\t$mail_inserted\n"; print "Mails ignored:\t\t\t$ignored\n"; print "Mails without \"From:\" -field:\t$count_no_from\n"; print "Mails without message:\t\t$count_no_txt\n"; print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n"; print "Duplicate mails:\t\t$count_duplicates\n"; print "Forwarded mails:\t\t$count_forwarded_msgs\n"; print "No subject:\t\t\t$no_subject\n"; print "Mails altogether:\t\t"; print $inserted_mails+$ignored; print "Mails without \"From:\" -field:\t$mail_no_from_f\n"; print "Mails without message:\t\t$mail_no_txt_f\n"; print "Mails without subject:\t\t$mail_no_subject_f\n"; print "Too big mails (> $opt_max_mail_size):\t$mail_too_big\n"; print "Duplicate mails:\t\t$mail_duplicates\n"; print "Forwarded mails:\t\t$mail_forwarded\n"; print "Total number of mails:\t\t"; print $mail_inserted + $ignored; print "\n"; exit(0); sub usage { my($VER)=@_; $0 =~ s/.\/(.+)/$1/; if ($opt_version) { print "$0 version $VER\n"; } else { print <<EOF; $0 version $VER Usage: $0 [options] file1 [file2 file3 ...] Description: Inserts mails from file(s) into a database Options: --help show this help and exit --version shows the version of the program --db_engine=... database server (default: $opt_db_engine) --db=... database to be used (default: $opt_db) --host=... hostname to be used (default: $opt_host) --password=... user password for the db server --user=... username for the db server --max_mail_size=# max size of a mail to be inserted into the db. mail will be ignored if it exceeds this size (default $opt_max_mail_size) --test Don\'t connect to the database, just write the queries to stdout EOF } exit(0); } #### #### table creation #### sub create_table_if_needed sub create_table { my ($dbh) = @_; my ($sth,$create); $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr; if (!$sth->execute) { $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null "; $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) "; $create .= "NOT NULL, mail_from char(120) not null, reply char(120), "; $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT "; $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key "; $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))"; $sth = $dbh->prepare($create) or die $dbh->errstr; $sth->execute() or die $dbh->errstr; } my ($sth, $query); $query = <<EOF; CREATE TABLE $opt_table ( mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment, date DATETIME NOT NULL, time_zone VARCHAR(20), mail_from VARCHAR(120) NOT NULL, reply VARCHAR(120), mail_to TEXT, cc TEXT, sbj VARCHAR(200), txt MEDIUMTEXT NOT NULL, file VARCHAR(64) NOT NULL, hash INTEGER NOT NULL, KEY (mail_id), PRIMARY KEY (mail_from, date, hash)) TYPE=MyISAM COMMENT='' EOF $sth = $dbh->prepare($query) or die $DBI::errstr; $sth->execute() or die "Couldn't create table: $DBI::errstr\n"; } #### #### inbox processing #### sub process_mail_file { my ($dbh, $file_name) = @_; my (%values, $type, $check); %values=(); $type=""; $file_name =~ s/.*[\/]// if ($opt_no_path); %values = (); $type = ""; $check = 0; while (<FILE>) Loading @@ -141,7 +181,7 @@ sub process_mail_file chop; if ($type ne "message") { if (/^Reply-To: (.*)/i) # finding different fields from file if (/^Reply-To: (.*)/i) { $type = "reply"; $values{$type} = $1; Loading @@ -168,7 +208,7 @@ sub process_mail_file } elsif (/^Date: (.*)/i) { date_parser($1,\%values); date_parser($1, \%values, $file_name); $type = "rubbish"; } elsif (/^[\w\W-]+:\s/) Loading @@ -195,14 +235,15 @@ sub process_mail_file { $values{'hash'} = checksum("$values{'message'}"); update_table($dbh, $file_name, \%values); %values=(); $type=""; %values = (); $type = ""; $check = 0; } elsif (/-* forwarded message .*-*/i) # in case of forwarded messages { $values{$type} .= "\n" . $_; $check++; $count_forwarded_msgs++; $mail_forwarded++; } else { Loading @@ -213,87 +254,134 @@ sub process_mail_file update_table($dbh, $file_name, \%values); } ######## # converts date to the right form #### #### get date and timezone #### sub date_parser { my ($date_raw,$values)=@_; my ($date_raw, $values, $file_name, $tmp) = @_; $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/; # If you ever need to change this test, be especially careful with # the timezone; it may be just a number (-0600), or just a name (EET), or # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT. # You probably should use a 'greedy' regexp in the end $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/; $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4"; $values->{'time_zone'}=$5; if (!defined($1) || !defined($2) || !defined($3) || !defined($4) || !defined($months{$2})) { if ($opt_debug || $opt_stop_on_error) { print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n"; print "months{2}: $months{$2}\n"; print "date_raw: $date_raw\n"; print "Inbox filename: $file_name\n"; } exit(1) if ($opt_stop_on_error); } $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4"; $tmp.= defined($5) ? $5 : ":00"; $values->{'date'} = $tmp; print "INSERTING DATE: $tmp\n" if ($opt_debug); $values->{'time_zone'} = $6; } ######### # this is runned when the whole mail is gathered. # this actually puts the mail to the database. #### #### Insert to table #### sub update_table { my($dbh, $file_name, $values) = @_; my($query); my($q); if (!defined($values->{'subject'}) || !defined($values->{'to'})) { $no_subject++; $mail_no_subject_f++; return; # Ignore these } $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end $query = "insert into $opt_table values (NULL,'" . $values->{'date'}; $query .= "','" . $values->{'time_zone'} . "',"; $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ","; $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ","; $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; $query .= $dbh->quote($values->{'subject'}) . ","; $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name); $query .= ",'" . $values->{'hash'} . "')"; if (length($values->{'message'}) > $opt_max_mail_size) #disables big message { $count_too_big++; } elsif ($values->{'from'} eq "") #disables mails with no from field { $count_no_from++; $q = "INSERT INTO $opt_table ("; $q.= "mail_id,"; $q.= "date,"; $q.= "time_zone,"; $q.= "mail_from,"; $q.= "reply,"; $q.= "mail_to,"; $q.= "cc,"; $q.= "sbj,"; $q.= "txt,"; $q.= "file,"; $q.= "hash"; $q.= ") VALUES ("; $q.= "NULL,"; $q.= "'" . $values->{'date'} . "',"; $q.= (defined($values->{'time_zone'}) ? ("'" . $values->{'time_zone'} . "',") : "NULL,"); $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL"; $q.= ","; $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL"; $q.= ","; $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL"; $q.= ","; $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL"; $q.= ","; $q.= $dbh->quote($values->{'subject'}); $q.= ","; $q.= $dbh->quote($values->{'message'}); $q.= ","; $q.= $dbh->quote($file_name); $q.= ","; $q.= "'" . $values->{'hash'} . "'"; $q.= ")"; # Don't insert mails bigger than $opt_max_mail_size if (length($values->{'message'}) > $opt_max_mail_size) { $mail_too_big++; } # Don't insert mails without 'From' field elsif ($values->{'from'} eq "") { $mail_no_from_f++; } elsif ($opt_test) { print "$query\n"; $inserted_mails++; print "$q\n"; $mail_inserted++; } elsif ($values->{'message'} eq "") #disables mails with no message text # Don't insert mails without the 'message' elsif ($values->{'message'} eq "") { $count_no_msg_text++; $mail_no_txt_f++; } elsif ($dbh->do($query)) elsif ($dbh->do($q)) { $inserted_mails++; $mail_inserted++; } elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates # This should never happen. This means that the above q failed, # but it wasn't because of a duplicate mail entry elsif (!($DBI::errstr =~ /Duplicate entry /)) { die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n"; die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n"; } else { $count_duplicates++; $mail_duplicates++; print "Duplicate mail: query: $q\n" if ($opt_debug); } $query=""; $q = ""; } ########## # In case you have two identical messages we wanted to identify them # and remove additionals; We do this by calculating a hash number of the # message and ignoring messages with the same from, date and hash. # This function calculates a simple 32 bit hash value for the message. #### #### In case you have two identical messages we wanted to identify them #### and remove additionals; We do this by calculating a hash number of the #### message and ignoring messages with the same from, date and hash. #### This function calculates a simple 32 bit hash value for the message. #### sub checksum { Loading @@ -308,3 +396,80 @@ sub checksum } return $crc; } #### #### my_which is used, because we can't assume that every system has the #### which -command. my_which can take only one argument at a time. #### Return values: requested system command with the first found path, #### or undefined, if not found. #### sub my_which { my ($command) = @_; my (@paths, $path); return $command if (-f $command && -x $command); @paths = split(':', $ENV{'PATH'}); foreach $path (@paths) { $path = "." if ($path eq ""); $path .= "/$command"; return $path if (-f $path && -x $path); } return undef(); } #### #### usage and version #### sub usage { my ($VER)= @_; if ($opt_version) { print "$progname version $VER\n"; } else { print <<EOF; $progname version $VER Description: Insert mails from inbox file(s) into a table. This program can read group [mail_to_db] from the my.cnf file. You may want to have db and table set there at least. Usage: $progname [options] file1 [file2 file3 ...] [>& /path/to/log.txt] or: $progname [options] --create [file1 file2...] [>& /path/to/log.txt] Options: --help Show this help and exit. --version Show the version number and exit. --debug Print some extra information during the run. --host=... Hostname to be used. (Using: $opt_host) --port=# TCP/IP port to be used with connection. (Using: $opt_port) --socket=... MySQL UNIX socket to be used with connection. (Using: $opt_socket) --db=... Database to be used. (Using: $opt_db) --table=... Table name for mails. (Using: $opt_table) --user=... Username for connecting. (Using: $opt_user) --password=... Password for the user. --max_mail_size=# Maximum size of a mail. Beware of the downside letting this variable be too big; you may easily end up inserting a lot of attached binary files (like MS Word documents etc), which take space, make the database slower and are not really searchable anyway. (Default: $opt_max_mail_size) --create Create the mails table. This can be done with the first run. --test Dry run. Print the queries and the result as it would be. --no_path When inserting the file name, leave out any paths of the name. --stop_on_error Stop the run, if an unexpected, but not fatal error occurs during the run. Without this option some fields may get unwanted values. --debug will also report about these. EOF } exit(0); } Loading
tests/mail_to_db.pl +336 −171 Original line number Diff line number Diff line #!/usr/bin/perl #!/usr/bin/perl -w # Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB # This file is public domain and comes with NO WARRANTY of any kind # # This program is brought to you by Janne-Petteri Koilo with the # administration of Michael Widenius. # # Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith # 15.12.2000 # # This program takes your mails and puts them into your database. It ignores # messages with the same from, date and message text. # You can use mail-files that are compressed or gzipped and ends with Loading @@ -13,40 +16,92 @@ use DBI; use Getopt::Long; $VER = "1.6"; $opt_db = "mail"; $opt_table = "mails"; $| = 1; $VER = "2.0"; $opt_help = 0; $opt_version = 0; $opt_debug = 0; $opt_host = undef(); $opt_port = undef(); $opt_socket = undef(); $opt_db = undef(); $opt_table = undef(); $opt_user = undef(); $opt_password = undef(); $opt_max_mail_size = 65536; $opt_db_engine = "mysql"; $opt_host = "localhost"; $opt_user = $opt_password = ""; $opt_help = $opt_version = $opt_test=0; $opt_create = 0; $opt_test = 0; $opt_no_path = 0; $opt_stop_on_error = 0; GetOptions("help","version","user=s","password=s", "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage(); my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big, $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted); usage($VER) if ($opt_help || $opt_version || !$ARGV[0]); $mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded = $mail_duplicates = $mail_no_subject_f = $mail_inserted = 0; %months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Des' => 12); 'Nov' => 11, 'Dec' => 12); $progname = $0; $progname =~ s/.*[\/]//; main(); #### #### main sub routine #### sub main { my ($connect_arg, @args, $ignored, @defops, $i); if (defined(my_which("my_print_defaults"))) { @defops = `my_print_defaults mail_to_db`; chop @defops; splice @ARGV, 0, 0, @defops; } else { print "WARNING: No command 'my_print_defaults' found; unable to read\n"; print "the my.cnf file. This command is available from the latest MySQL\n"; print "distribution.\n"; } GetOptions("help","version","host=s","port=i","socket=s","db=s","table=s", "user=s","password=s","max_mail_size=i","create","test", "no_path","debug","stop_on_error") || die "Wrong option! See $progname --help\n"; $count_no_from = $count_no_txt = $count_too_big = 0; $count_forwarded_msgs = $count_duplicates = $no_subject = 0; $inserted_mails = 0; $dbh=0; usage($VER) if ($opt_help || $opt_version || (!$ARGV[0] && !$opt_create)); $dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user, $opt_password,{ PrintError => 0}) || die $DBI::errstr; if (!$opt_test) # Check that the given inbox files exist and are regular files for ($i = 0; defined($ARGV[$i]); $i++) { create_table_if_needed($dbh); die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]); } $connect_arg = "DBI:mysql:"; push @args, "database=$opt_db" if defined($opt_db); push @args, "host=$opt_host" if defined($opt_host); push @args, "port=$opt_port" if defined($opt_port); push @args, "mysql_socket=$opt_socket" if defined($opt_socket); push @args, "mysql_read_default_group=mail_to_db"; $connect_arg .= join ';', @args; $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password) || die "Couldn't connect: $DBI::errstr\n"; die "You must specify the database; use --db=" if (!defined($opt_db)); die "You must specify the table; use --table=" if (!defined($opt_table)); create_table($dbh) if ($opt_create); foreach (@ARGV) { if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped # Check if the file is compressed if (/^(.*)\.(gz|Z)$/) { open(FILE, "zcat $_ |"); process_mail_file($dbh, $1); Loading @@ -59,81 +114,66 @@ foreach (@ARGV) } $dbh->disconnect if (!$opt_test); $ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject; print "Mails inserted:\t\t\t$inserted_mails\n"; $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f + $mail_too_big + $mail_duplicates); print "Mails inserted:\t\t\t$mail_inserted\n"; print "Mails ignored:\t\t\t$ignored\n"; print "Mails without \"From:\" -field:\t$count_no_from\n"; print "Mails without message:\t\t$count_no_txt\n"; print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n"; print "Duplicate mails:\t\t$count_duplicates\n"; print "Forwarded mails:\t\t$count_forwarded_msgs\n"; print "No subject:\t\t\t$no_subject\n"; print "Mails altogether:\t\t"; print $inserted_mails+$ignored; print "Mails without \"From:\" -field:\t$mail_no_from_f\n"; print "Mails without message:\t\t$mail_no_txt_f\n"; print "Mails without subject:\t\t$mail_no_subject_f\n"; print "Too big mails (> $opt_max_mail_size):\t$mail_too_big\n"; print "Duplicate mails:\t\t$mail_duplicates\n"; print "Forwarded mails:\t\t$mail_forwarded\n"; print "Total number of mails:\t\t"; print $mail_inserted + $ignored; print "\n"; exit(0); sub usage { my($VER)=@_; $0 =~ s/.\/(.+)/$1/; if ($opt_version) { print "$0 version $VER\n"; } else { print <<EOF; $0 version $VER Usage: $0 [options] file1 [file2 file3 ...] Description: Inserts mails from file(s) into a database Options: --help show this help and exit --version shows the version of the program --db_engine=... database server (default: $opt_db_engine) --db=... database to be used (default: $opt_db) --host=... hostname to be used (default: $opt_host) --password=... user password for the db server --user=... username for the db server --max_mail_size=# max size of a mail to be inserted into the db. mail will be ignored if it exceeds this size (default $opt_max_mail_size) --test Don\'t connect to the database, just write the queries to stdout EOF } exit(0); } #### #### table creation #### sub create_table_if_needed sub create_table { my ($dbh) = @_; my ($sth,$create); $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr; if (!$sth->execute) { $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null "; $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) "; $create .= "NOT NULL, mail_from char(120) not null, reply char(120), "; $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT "; $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key "; $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))"; $sth = $dbh->prepare($create) or die $dbh->errstr; $sth->execute() or die $dbh->errstr; } my ($sth, $query); $query = <<EOF; CREATE TABLE $opt_table ( mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment, date DATETIME NOT NULL, time_zone VARCHAR(20), mail_from VARCHAR(120) NOT NULL, reply VARCHAR(120), mail_to TEXT, cc TEXT, sbj VARCHAR(200), txt MEDIUMTEXT NOT NULL, file VARCHAR(64) NOT NULL, hash INTEGER NOT NULL, KEY (mail_id), PRIMARY KEY (mail_from, date, hash)) TYPE=MyISAM COMMENT='' EOF $sth = $dbh->prepare($query) or die $DBI::errstr; $sth->execute() or die "Couldn't create table: $DBI::errstr\n"; } #### #### inbox processing #### sub process_mail_file { my ($dbh, $file_name) = @_; my (%values, $type, $check); %values=(); $type=""; $file_name =~ s/.*[\/]// if ($opt_no_path); %values = (); $type = ""; $check = 0; while (<FILE>) Loading @@ -141,7 +181,7 @@ sub process_mail_file chop; if ($type ne "message") { if (/^Reply-To: (.*)/i) # finding different fields from file if (/^Reply-To: (.*)/i) { $type = "reply"; $values{$type} = $1; Loading @@ -168,7 +208,7 @@ sub process_mail_file } elsif (/^Date: (.*)/i) { date_parser($1,\%values); date_parser($1, \%values, $file_name); $type = "rubbish"; } elsif (/^[\w\W-]+:\s/) Loading @@ -195,14 +235,15 @@ sub process_mail_file { $values{'hash'} = checksum("$values{'message'}"); update_table($dbh, $file_name, \%values); %values=(); $type=""; %values = (); $type = ""; $check = 0; } elsif (/-* forwarded message .*-*/i) # in case of forwarded messages { $values{$type} .= "\n" . $_; $check++; $count_forwarded_msgs++; $mail_forwarded++; } else { Loading @@ -213,87 +254,134 @@ sub process_mail_file update_table($dbh, $file_name, \%values); } ######## # converts date to the right form #### #### get date and timezone #### sub date_parser { my ($date_raw,$values)=@_; my ($date_raw, $values, $file_name, $tmp) = @_; $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/; # If you ever need to change this test, be especially careful with # the timezone; it may be just a number (-0600), or just a name (EET), or # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT. # You probably should use a 'greedy' regexp in the end $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/; $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4"; $values->{'time_zone'}=$5; if (!defined($1) || !defined($2) || !defined($3) || !defined($4) || !defined($months{$2})) { if ($opt_debug || $opt_stop_on_error) { print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n"; print "months{2}: $months{$2}\n"; print "date_raw: $date_raw\n"; print "Inbox filename: $file_name\n"; } exit(1) if ($opt_stop_on_error); } $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4"; $tmp.= defined($5) ? $5 : ":00"; $values->{'date'} = $tmp; print "INSERTING DATE: $tmp\n" if ($opt_debug); $values->{'time_zone'} = $6; } ######### # this is runned when the whole mail is gathered. # this actually puts the mail to the database. #### #### Insert to table #### sub update_table { my($dbh, $file_name, $values) = @_; my($query); my($q); if (!defined($values->{'subject'}) || !defined($values->{'to'})) { $no_subject++; $mail_no_subject_f++; return; # Ignore these } $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end $query = "insert into $opt_table values (NULL,'" . $values->{'date'}; $query .= "','" . $values->{'time_zone'} . "',"; $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ","; $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ","; $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; $query .= $dbh->quote($values->{'subject'}) . ","; $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name); $query .= ",'" . $values->{'hash'} . "')"; if (length($values->{'message'}) > $opt_max_mail_size) #disables big message { $count_too_big++; } elsif ($values->{'from'} eq "") #disables mails with no from field { $count_no_from++; $q = "INSERT INTO $opt_table ("; $q.= "mail_id,"; $q.= "date,"; $q.= "time_zone,"; $q.= "mail_from,"; $q.= "reply,"; $q.= "mail_to,"; $q.= "cc,"; $q.= "sbj,"; $q.= "txt,"; $q.= "file,"; $q.= "hash"; $q.= ") VALUES ("; $q.= "NULL,"; $q.= "'" . $values->{'date'} . "',"; $q.= (defined($values->{'time_zone'}) ? ("'" . $values->{'time_zone'} . "',") : "NULL,"); $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL"; $q.= ","; $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL"; $q.= ","; $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL"; $q.= ","; $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL"; $q.= ","; $q.= $dbh->quote($values->{'subject'}); $q.= ","; $q.= $dbh->quote($values->{'message'}); $q.= ","; $q.= $dbh->quote($file_name); $q.= ","; $q.= "'" . $values->{'hash'} . "'"; $q.= ")"; # Don't insert mails bigger than $opt_max_mail_size if (length($values->{'message'}) > $opt_max_mail_size) { $mail_too_big++; } # Don't insert mails without 'From' field elsif ($values->{'from'} eq "") { $mail_no_from_f++; } elsif ($opt_test) { print "$query\n"; $inserted_mails++; print "$q\n"; $mail_inserted++; } elsif ($values->{'message'} eq "") #disables mails with no message text # Don't insert mails without the 'message' elsif ($values->{'message'} eq "") { $count_no_msg_text++; $mail_no_txt_f++; } elsif ($dbh->do($query)) elsif ($dbh->do($q)) { $inserted_mails++; $mail_inserted++; } elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates # This should never happen. This means that the above q failed, # but it wasn't because of a duplicate mail entry elsif (!($DBI::errstr =~ /Duplicate entry /)) { die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n"; die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n"; } else { $count_duplicates++; $mail_duplicates++; print "Duplicate mail: query: $q\n" if ($opt_debug); } $query=""; $q = ""; } ########## # In case you have two identical messages we wanted to identify them # and remove additionals; We do this by calculating a hash number of the # message and ignoring messages with the same from, date and hash. # This function calculates a simple 32 bit hash value for the message. #### #### In case you have two identical messages we wanted to identify them #### and remove additionals; We do this by calculating a hash number of the #### message and ignoring messages with the same from, date and hash. #### This function calculates a simple 32 bit hash value for the message. #### sub checksum { Loading @@ -308,3 +396,80 @@ sub checksum } return $crc; } #### #### my_which is used, because we can't assume that every system has the #### which -command. my_which can take only one argument at a time. #### Return values: requested system command with the first found path, #### or undefined, if not found. #### sub my_which { my ($command) = @_; my (@paths, $path); return $command if (-f $command && -x $command); @paths = split(':', $ENV{'PATH'}); foreach $path (@paths) { $path = "." if ($path eq ""); $path .= "/$command"; return $path if (-f $path && -x $path); } return undef(); } #### #### usage and version #### sub usage { my ($VER)= @_; if ($opt_version) { print "$progname version $VER\n"; } else { print <<EOF; $progname version $VER Description: Insert mails from inbox file(s) into a table. This program can read group [mail_to_db] from the my.cnf file. You may want to have db and table set there at least. Usage: $progname [options] file1 [file2 file3 ...] [>& /path/to/log.txt] or: $progname [options] --create [file1 file2...] [>& /path/to/log.txt] Options: --help Show this help and exit. --version Show the version number and exit. --debug Print some extra information during the run. --host=... Hostname to be used. (Using: $opt_host) --port=# TCP/IP port to be used with connection. (Using: $opt_port) --socket=... MySQL UNIX socket to be used with connection. (Using: $opt_socket) --db=... Database to be used. (Using: $opt_db) --table=... Table name for mails. (Using: $opt_table) --user=... Username for connecting. (Using: $opt_user) --password=... Password for the user. --max_mail_size=# Maximum size of a mail. Beware of the downside letting this variable be too big; you may easily end up inserting a lot of attached binary files (like MS Word documents etc), which take space, make the database slower and are not really searchable anyway. (Default: $opt_max_mail_size) --create Create the mails table. This can be done with the first run. --test Dry run. Print the queries and the result as it would be. --no_path When inserting the file name, leave out any paths of the name. --stop_on_error Stop the run, if an unexpected, but not fatal error occurs during the run. Without this option some fields may get unwanted values. --debug will also report about these. EOF } exit(0); }