Embryo of indexing works (and goes pretty fast!)
author Francois Fleuret <francois@fleuret.org>
Sun, 27 Jan 2013 15:30:28 +0000 (16:30 +0100)
committer Francois Fleuret <francois@fleuret.org>
Sun, 27 Jan 2013 15:30:28 +0000 (16:30 +0100)
mymail.c

index da9bed1..f657306 100644 (file)
--- a/mymail.c
+++ b/mymail.c
 #include <getopt.h>
 #include <limits.h>
 #include <dirent.h>
+#include <regex.h>
 
 #define VERSION "0.1"
 
 #define BUFFER_SIZE 16384
 
 struct parsable_field {
-  char *regexp;
-  char *db_filename;
+  char *name;
+  char *regexp_string;
+  regex_t regexp;
+  FILE *db_file;
 };
 
+char *db_filename_prefix;
+
 /********************************************************************/
 
 /* malloc with error checking.  */
@@ -72,11 +77,11 @@ void usage(FILE *out) {
 }
 
 void read_file(const char *input_filename,
-               int nb_fields_to_parse, struct parsable_field *fields_to_parse,
-               FILE **db_files) {
+               int nb_fields_to_parse, struct parsable_field *fields_to_parse) {
   char raw_line[BUFFER_SIZE];
   FILE *file;
   int in_header;
+  unsigned int position_in_file;
 
   file = fopen(input_filename, "r");
 
@@ -87,10 +92,15 @@ void read_file(const char *input_filename,
 
   in_header = 0;
 
+  position_in_file = 0;
+
   while(fgets(raw_line, BUFFER_SIZE, file)) {
     if(strncmp(raw_line, "From ", 5) == 0) {
       if(in_header) {
-        fprintf(stderr, "Got a 'From ' in the header.\n");
+        fprintf(stderr,
+                "Got a 'From ' in the header in %s:%u.\n",
+                input_filename, position_in_file);
+        fprintf(stderr, "%s", raw_line);
         exit(EXIT_FAILURE);
       }
       in_header = 1;
@@ -98,11 +108,25 @@ void read_file(const char *input_filename,
       if(in_header) { in_header = 0; }
     }
 
+    /* if(in_header) { */
+      /* printf("LINE.H %s", raw_line); */
+    /* } else { */
+      /* printf("LINE.B %s", raw_line); */
+    /* } */
+
     if(in_header) {
-      printf("LINE.H %s", raw_line);
-    } else {
-      printf("LINE.B %s", raw_line);
+      int f;
+      regmatch_t matches;
+      for(f = 0; f < nb_fields_to_parse; f++) {
+        if(regexec(&fields_to_parse[f].regexp, raw_line, 1, &matches, 0) == 0) {
+          fprintf(fields_to_parse[f].db_file, "%s:%d %s",
+                  input_filename, position_in_file,
+                  raw_line + matches.rm_eo);
+        }
+      }
     }
+
+    position_in_file += strlen(raw_line);
   }
 
   fclose(file);
@@ -116,8 +140,7 @@ int ignore_entry(const char *name) {
 }
 
 void process_entry(const char *dir_name,
-                   int nb_fields_to_parse, struct parsable_field *fields_to_parse,
-                   FILE **db_files) {
+                   int nb_fields_to_parse, struct parsable_field *fields_to_parse) {
   DIR *dir;
   struct dirent *dir_e;
   struct stat sb;
@@ -143,14 +166,14 @@ void process_entry(const char *dir_name,
     while((dir_e = readdir(dir))) {
       if(!ignore_entry(dir_e->d_name)) {
         snprintf(subname, PATH_MAX, "%s/%s", dir_name, dir_e->d_name);
-        process_entry(subname, nb_fields_to_parse, fields_to_parse, db_files);
+        process_entry(subname, nb_fields_to_parse, fields_to_parse);
       }
     }
     closedir(dir);
   } else {
     if(S_ISREG(sb.st_mode)) {
-      printf("Processing regular file '%s'.\n", dir_name);
-      read_file(dir_name, nb_fields_to_parse, fields_to_parse, db_files);
+      /* printf("Processing regular file '%s'.\n", dir_name); */
+      read_file(dir_name, nb_fields_to_parse, fields_to_parse);
     }
   }
 }
@@ -166,11 +189,22 @@ enum
 
 static struct option long_options[] = {
   { "help", no_argument, 0, 'h' },
+  { "db-prefix", 1, 0, 'p' },
   { 0, 0, 0, 0 }
 };
 
 static struct parsable_field fields_to_parse[] = {
-  { "^[Tt][Oo]:", "/tmp/mymail-to" }
+  {
+    "from",
+    "^[Ff][Rr][Oo][Mm]: *",
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 0
+  },
+
+  {
+    "dest",
+    "^\\([Tt][Oo]\\|[Cc][Cc]\\|[Bb][Cc][Cc]\\): *",
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 0
+  },
 };
 
 int main(int argc, char **argv) {
@@ -178,11 +212,10 @@ int main(int argc, char **argv) {
   const int nb_fields_to_parse = sizeof(fields_to_parse) / sizeof(struct parsable_field);
   char c;
   int f;
-  FILE **db_files;
 
   setlocale(LC_ALL, "");
 
-  while ((c = getopt_long(argc, argv, "h",
+  while ((c = getopt_long(argc, argv, "hp:",
                           long_options, NULL)) != -1) {
 
     switch(c) {
@@ -191,12 +224,20 @@ int main(int argc, char **argv) {
       show_help = 1;
       break;
 
+    case 'p':
+      db_filename_prefix = strdup(optarg);
+      break;
+
     default:
       error = 1;
       break;
     }
   }
 
+  if(!db_filename_prefix) {
+    db_filename_prefix = strdup("/tmp/mymail_");
+  }
+
   if(error) {
     usage(stderr);
     exit(EXIT_FAILURE);
@@ -207,30 +248,41 @@ int main(int argc, char **argv) {
     exit(EXIT_SUCCESS);
   }
 
-  db_files = safe_malloc(nb_fields_to_parse * sizeof(FILE *));
-
   for(f = 0; f < nb_fields_to_parse; f++) {
-    db_files[f] = fopen(fields_to_parse[f].db_filename, "w");
-    if(!db_files[f]) {
+    char db_filename[BUFFER_SIZE];
+    sprintf(db_filename, "%s%s", db_filename_prefix, fields_to_parse[f].name);
+    fields_to_parse[f].db_file = fopen(db_filename, "w");
+    if(!fields_to_parse[f].db_file) {
       fprintf(stderr,
               "mymail: Can not open \"%s\" for writing: %s\n",
-              fields_to_parse[f].db_filename,
+              db_filename,
               strerror(errno));
+      exit(EXIT_FAILURE);
+    }
+
+    printf("Initialized %s.\n", db_filename);
+
+    if(regcomp(&fields_to_parse[f].regexp,
+               fields_to_parse[f].regexp_string,
+               REG_ICASE)) {
+      fprintf(stderr,
+              "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
+              fields_to_parse[f].regexp_string,
+              fields_to_parse[f].name);
+      exit(EXIT_FAILURE);
     }
   }
 
   while(optind < argc) {
     process_entry(argv[optind],
-                  nb_fields_to_parse, fields_to_parse,
-                  db_files);
+                  nb_fields_to_parse, fields_to_parse);
     optind++;
   }
 
   for(f = 0; f < nb_fields_to_parse; f++) {
-    fclose(db_files[f]);
+    fclose(fields_to_parse[f].db_file);
+    regfree(&fields_to_parse[f].regexp);
   }
 
-  free(db_files);
-
   exit(EXIT_SUCCESS);
 }