Commit e63aa638 authored by jani@prima.mysql.fi's avatar jani@prima.mysql.fi
Browse files

Merge work:/home/bk/mysql into prima.mysql.fi:/home/my/bk/mysql

parents f11d5c83 8594820f
Loading
Loading
Loading
Loading
+336 −171
Original line number Diff line number Diff line
#!/usr/bin/perl
#!/usr/bin/perl -w
# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
# This file is public domain and comes with NO WARRANTY of any kind
#
# This program is brought to you by Janne-Petteri Koilo with the 
# administration of Michael Widenius.

#
# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith
# 15.12.2000
#
# This program takes your mails and puts them into your database. It ignores
# messages with the same from, date and message text.
# You can use mail-files that are compressed or gzipped and ends with
@@ -13,40 +16,92 @@
use DBI;
use Getopt::Long;

$VER = "1.6";

$opt_db = "mail";
$opt_table = "mails";
$| = 1;
$VER = "2.0";

$opt_help          = 0;
$opt_version       = 0;
$opt_debug         = 0;
$opt_host          = undef();
$opt_port          = undef();
$opt_socket        = undef();
$opt_db            = undef();
$opt_table         = undef();
$opt_user          = undef();
$opt_password      = undef();
$opt_max_mail_size = 65536;
$opt_db_engine = "mysql";
$opt_host = "localhost";
$opt_user = $opt_password = "";
$opt_help = $opt_version = $opt_test=0;
$opt_create        = 0;
$opt_test          = 0;
$opt_no_path       = 0;
$opt_stop_on_error = 0;

GetOptions("help","version","user=s","password=s",
	   "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage();
my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big,
    $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted);

usage($VER) if ($opt_help || $opt_version || !$ARGV[0]);
$mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded =
$mail_duplicates = $mail_no_subject_f = $mail_inserted = 0;

%months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
	      'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
	  'Nov' => 11, 'Des' => 12);
	      'Nov' => 11, 'Dec' => 12);

$progname = $0;
$progname =~ s/.*[\/]//;

main();

####
#### main sub routine
####

sub main
{
  my ($connect_arg, @args, $ignored, @defops, $i);

  if (defined(my_which("my_print_defaults")))
  {
    @defops = `my_print_defaults mail_to_db`;
    chop @defops;
    splice @ARGV, 0, 0, @defops;
  }
  else
  {
    print "WARNING: No command 'my_print_defaults' found; unable to read\n";
    print "the my.cnf file. This command is available from the latest MySQL\n";
    print "distribution.\n";
  }
  GetOptions("help","version","host=s","port=i","socket=s","db=s","table=s",
	     "user=s","password=s","max_mail_size=i","create","test",
	     "no_path","debug","stop_on_error")
  || die "Wrong option! See $progname --help\n";

$count_no_from = $count_no_txt = $count_too_big = 0;
$count_forwarded_msgs = $count_duplicates = $no_subject = 0;
$inserted_mails = 0;
$dbh=0;
  usage($VER) if ($opt_help || $opt_version || (!$ARGV[0] && !$opt_create));

$dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user,
		    $opt_password,{ PrintError => 0}) || die $DBI::errstr;
if (!$opt_test)
  # Check that the given inbox files exist and are regular files
  for ($i = 0; defined($ARGV[$i]); $i++)
  {
  create_table_if_needed($dbh);
    die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]);
  }

  $connect_arg = "DBI:mysql:";
  push @args, "database=$opt_db" if defined($opt_db);
  push @args, "host=$opt_host" if defined($opt_host);
  push @args, "port=$opt_port" if defined($opt_port);
  push @args, "mysql_socket=$opt_socket" if defined($opt_socket);
  push @args, "mysql_read_default_group=mail_to_db";
  $connect_arg .= join ';', @args;
  $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password)
  || die "Couldn't connect: $DBI::errstr\n";

  die "You must specify the database; use --db=" if (!defined($opt_db));
  die "You must specify the table; use --table=" if (!defined($opt_table));

  create_table($dbh) if ($opt_create);

  foreach (@ARGV)
  {
  if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped
    # Check if the file is compressed
    if (/^(.*)\.(gz|Z)$/)
    {
      open(FILE, "zcat $_ |");
      process_mail_file($dbh, $1);
@@ -59,81 +114,66 @@ foreach (@ARGV)
  }
  $dbh->disconnect if (!$opt_test);

$ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject;
print "Mails inserted:\t\t\t$inserted_mails\n";
  $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f +
	      $mail_too_big + $mail_duplicates);
  print "Mails inserted:\t\t\t$mail_inserted\n";
  print "Mails ignored:\t\t\t$ignored\n";
print "Mails without \"From:\" -field:\t$count_no_from\n";
print "Mails without message:\t\t$count_no_txt\n";
print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n";
print "Duplicate mails:\t\t$count_duplicates\n";
print "Forwarded mails:\t\t$count_forwarded_msgs\n";
print "No subject:\t\t\t$no_subject\n";
print "Mails altogether:\t\t"; 
print $inserted_mails+$ignored;
  print "Mails without \"From:\" -field:\t$mail_no_from_f\n";
  print "Mails without message:\t\t$mail_no_txt_f\n";
  print "Mails without subject:\t\t$mail_no_subject_f\n";
  print "Too big mails (> $opt_max_mail_size):\t$mail_too_big\n";
  print "Duplicate mails:\t\t$mail_duplicates\n";
  print "Forwarded mails:\t\t$mail_forwarded\n";
  print "Total number of mails:\t\t"; 
  print $mail_inserted + $ignored;
  print "\n";
  exit(0);

sub usage
{  
  my($VER)=@_;
  
  $0 =~ s/.\/(.+)/$1/;
  if ($opt_version)
  {
    print "$0 version $VER\n";
}
  else
  {
    print <<EOF;
$0 version $VER

Usage: $0 [options] file1 [file2 file3 ...]

Description: Inserts mails from file(s) into a database

Options:
--help             show this help and exit
--version          shows the version of the program
--db_engine=...    database server (default: $opt_db_engine)
--db=...           database to be used (default: $opt_db)
--host=...         hostname to be used (default: $opt_host)
--password=...     user password for the db server
--user=...         username for the db server
--max_mail_size=#  max size of a mail to be inserted into the db.
                   mail will be ignored if it exceeds this size
                   (default $opt_max_mail_size)
--test		   Don\'t connect to the database, just write the
		   queries to stdout
EOF
  }
  exit(0);
}
####
#### table creation
####

sub create_table_if_needed
sub create_table
{
  my ($dbh) = @_;
  my ($sth,$create);
  
  $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr;
  if (!$sth->execute)
  {
    $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null ";
    $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) ";
    $create .= "NOT NULL, mail_from char(120) not null, reply char(120), ";
    $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT ";
    $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key ";
    $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))";
    $sth = $dbh->prepare($create) or die $dbh->errstr;
    $sth->execute() or die $dbh->errstr;
  }  
  my ($sth, $query);

  $query = <<EOF;
CREATE TABLE $opt_table
(
 mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment,
 date DATETIME NOT NULL,
 time_zone VARCHAR(20),
 mail_from VARCHAR(120) NOT NULL,
 reply VARCHAR(120),
 mail_to TEXT,
 cc TEXT,
 sbj VARCHAR(200),
 txt MEDIUMTEXT NOT NULL,
 file VARCHAR(64) NOT NULL,
 hash INTEGER NOT NULL,
 KEY (mail_id),
 PRIMARY KEY (mail_from, date, hash))
 TYPE=MyISAM COMMENT=''
EOF
  $sth = $dbh->prepare($query) or die $DBI::errstr;
  $sth->execute() or die "Couldn't create table: $DBI::errstr\n";
}

####
#### inbox processing
####

sub process_mail_file
{
  my ($dbh, $file_name) = @_;
  my (%values, $type, $check);

  %values=(); $type="";
  $file_name =~ s/.*[\/]// if ($opt_no_path);

  %values = ();
  $type = "";
  $check = 0;

  while (<FILE>)
@@ -141,7 +181,7 @@ sub process_mail_file
    chop;
    if ($type ne "message")
    { 
      if (/^Reply-To: (.*)/i)  # finding different fields from file
      if (/^Reply-To: (.*)/i)
      {
	$type = "reply";
	$values{$type} = $1;
@@ -168,7 +208,7 @@ sub process_mail_file
      }
      elsif (/^Date: (.*)/i)
      {
	date_parser($1,\%values);
	date_parser($1, \%values, $file_name);
	$type = "rubbish";
      }
      elsif (/^[\w\W-]+:\s/)
@@ -195,14 +235,15 @@ sub process_mail_file
    {
      $values{'hash'} = checksum("$values{'message'}");
      update_table($dbh, $file_name, \%values);
      %values=(); $type="";
      %values = ();
      $type = "";
      $check = 0;
    }
    elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
    {
      $values{$type} .= "\n" . $_;
      $check++;
      $count_forwarded_msgs++;
      $mail_forwarded++;
    }
    else
    {
@@ -213,87 +254,134 @@ sub process_mail_file
  update_table($dbh, $file_name, \%values);
}

########

# converts date to the right form
####
#### get date and timezone
####

sub date_parser
{
  my ($date_raw,$values)=@_;
  my ($date_raw, $values, $file_name, $tmp) = @_;

  $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/;
  # If you ever need to change this test, be especially careful with
  # the timezone; it may be just a number (-0600), or just a name (EET), or
  # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT.
  # You probably should use a 'greedy' regexp in the end
  $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/;

  $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4";
  $values->{'time_zone'}=$5;
  if (!defined($1) || !defined($2) || !defined($3) || !defined($4) ||
      !defined($months{$2}))
  {
    if ($opt_debug || $opt_stop_on_error)
    {
      print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n";
      print "months{2}: $months{$2}\n";
      print "date_raw: $date_raw\n";
      print "Inbox filename: $file_name\n";
    }
    exit(1) if ($opt_stop_on_error);
  }
  $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4";
  $tmp.= defined($5) ? $5 : ":00";
  $values->{'date'} = $tmp;
  print "INSERTING DATE: $tmp\n" if ($opt_debug);
  $values->{'time_zone'} = $6;
}

#########

# this is runned when the whole mail is gathered.
# this actually puts the mail to the database.
####
#### Insert to table
#### 

sub update_table
{
  my($dbh, $file_name, $values) = @_;
  my($query);
  my($q);

  if (!defined($values->{'subject'}) || !defined($values->{'to'}))
  {
    $no_subject++;
    $mail_no_subject_f++;
    return;			# Ignore these
  }
  $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning 
  $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end
  $query = "insert into $opt_table values (NULL,'" . $values->{'date'};
  $query .= "','" . $values->{'time_zone'} . "',";
  $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ",";
  $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ",";

  $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; 
  $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; 
  $query .= $dbh->quote($values->{'subject'}) . ",";
  $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name);
  $query .= ",'" . $values->{'hash'} . "')";

  if (length($values->{'message'}) > $opt_max_mail_size) #disables big message
  {
    $count_too_big++;
  }
  elsif ($values->{'from'} eq "") #disables mails with no from field
  {
    $count_no_from++;
  $q = "INSERT INTO $opt_table (";
  $q.= "mail_id,";
  $q.= "date,";
  $q.= "time_zone,";
  $q.= "mail_from,";
  $q.= "reply,";
  $q.= "mail_to,";
  $q.= "cc,";
  $q.= "sbj,";
  $q.= "txt,";
  $q.= "file,";
  $q.= "hash";
  $q.= ") VALUES (";
  $q.= "NULL,";
  $q.= "'" . $values->{'date'} . "',";
  $q.= (defined($values->{'time_zone'}) ?
	("'" . $values->{'time_zone'} . "',") : "NULL,");
  $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL";
  $q.= ",";
  $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL";
  $q.= ",";
  $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL";
  $q.= ",";
  $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL"; 
  $q.= ","; 
  $q.= $dbh->quote($values->{'subject'});
  $q.= ",";
  $q.= $dbh->quote($values->{'message'});
  $q.= ",";
  $q.= $dbh->quote($file_name);
  $q.= ",";
  $q.= "'" . $values->{'hash'} . "'";
  $q.= ")";

  # Don't insert mails bigger than $opt_max_mail_size
  if (length($values->{'message'}) > $opt_max_mail_size)
  {
    $mail_too_big++;
  }
  # Don't insert mails without 'From' field
  elsif ($values->{'from'} eq "") 
  {
    $mail_no_from_f++;
  }
  elsif ($opt_test)
  {
    print "$query\n";
    $inserted_mails++;
    print "$q\n";
    $mail_inserted++;
  }
  elsif ($values->{'message'} eq "") #disables mails with no message text
  # Don't insert mails without the 'message'
  elsif ($values->{'message'} eq "") 
  {
    $count_no_msg_text++;
    $mail_no_txt_f++;
  }
  elsif ($dbh->do($query))
  elsif ($dbh->do($q))
  {
    $inserted_mails++;
    $mail_inserted++;
  }
  elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates
  # This should never happen. This means that the above q failed,
  # but it wasn't because of a duplicate mail entry
  elsif (!($DBI::errstr =~ /Duplicate entry /))
  {
    die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n";
    die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n";
  }
  else
  {
    $count_duplicates++;    
    $mail_duplicates++;
    print "Duplicate mail: query: $q\n" if ($opt_debug);
  }
  $query="";
  $q = "";
}


##########

# In case you have two identical messages we wanted to identify them
# and remove additionals;  We do this by calculating a hash number of the
# message and ignoring messages with the same from, date and hash.
# This function calculates a simple 32 bit hash value for the message.
####
#### The same message may appear more than once in the inbox files; we want
#### to identify such copies and keep only one of them. We do this by
#### calculating a hash number of the message and ignoring messages with the
#### same from, date and hash.
#### This function calculates a simple 32 bit hash value for the message.
####

sub checksum
{
@@ -308,3 +396,80 @@ sub checksum
  }
  return $crc;
}

####
#### my_which is used, because we can't assume that every system has the
#### which -command. my_which can take only one argument at a time.
#### Return values: requested system command with the first found path,
#### or undefined, if not found.
####

sub my_which
{
  # Locate an executable the way which(1) would, for a single command name.
  #
  # Argument: $command - the command name (or a path to it).
  # Returns:  $command itself when it already names an executable regular
  #           file; otherwise "<dir>/$command" for the first directory in
  #           $ENV{'PATH'} that holds such a file; otherwise undef.
  my ($command) = @_;

  # The name as given is tested first, before PATH is consulted.
  if (-f $command && -x $command)
  {
    return $command;
  }

  for my $dir (split(':', $ENV{'PATH'}))
  {
    # An empty PATH component is treated as the current directory.
    my $candidate = ($dir eq "" ? "." : $dir) . "/$command";
    if (-f $candidate && -x $candidate)
    {
      return $candidate;
    }
  }
  return undef();
}

####
#### usage and version
####

sub usage
{
  # Print either the version line (when $opt_version is set) or the full
  # help text to STDOUT, then terminate the program with exit status 0.
  # This sub never returns to its caller.
  #
  # Argument: $VER - the version string to show.
  # Reads the global option variables ($opt_version, $opt_host, $opt_port,
  # $opt_socket, $opt_db, $opt_table, $opt_user, $opt_max_mail_size) and
  # $progname.
  my ($VER)= @_;

  if ($opt_version)
  {
    print "$progname version $VER\n";
  }
  else
  {
    # These option values may still be undefined at this point.
    # Interpolating undef into the help text prints nothing but raises
    # "Use of uninitialized value" warnings under -w, so empty strings are
    # substituted; the printed text stays exactly the same.
    my ($host, $port, $socket, $db, $table, $user) =
      map { defined($_) ? $_ : "" }
      ($opt_host, $opt_port, $opt_socket, $opt_db, $opt_table, $opt_user);

    print <<EOF;
$progname version $VER

Description: Insert mails from inbox file(s) into a table.
This program can read group [mail_to_db] from the my.cnf
file. You may want to have db and table set there at least.

Usage: $progname [options] file1 [file2 file3 ...] [>& /path/to/log.txt]
or:    $progname [options] --create [file1 file2...] [>& /path/to/log.txt]

Options:
--help             Show this help and exit.
--version          Show the version number and exit.
--debug            Print some extra information during the run.
--host=...         Hostname to be used. (Using: $host)
--port=#           TCP/IP port to be used with connection. (Using: $port)
--socket=...       MySQL UNIX socket to be used with connection.
                   (Using: $socket)
--db=...           Database to be used.     (Using: $db)
--table=...        Table name for mails.    (Using: $table)
--user=...         Username for connecting. (Using: $user)
--password=...     Password for the user.
--max_mail_size=#  Maximum size of a mail.
                   Beware of the downside letting this variable be too big;
                   you may easily end up inserting a lot of attached 
                   binary files (like MS Word documents etc), which take
                   space, make the database slower and are not really
                   searchable anyway. (Default: $opt_max_mail_size)
--create           Create the mails table. This can be done with the first run.
--test		   Dry run. Print the queries and the result as it would be.
--no_path          When inserting the file name, leave out any paths of
                   the name.
--stop_on_error    Stop the run, if an unexpected, but not fatal error occurs
                   during the run. Without this option some fields may get
                   unwanted values. --debug will also report about these.
EOF
  }
  exit(0);
}