/*
 * NOTE(review): this copy of checkwordfreq.c is damaged and cannot compile as
 * it stands.  The whole program sits on one physical line, and text appears to
 * be missing wherever the original had a '<' followed by a letter, up to the
 * next '>' (presumably a markup-stripping pass -- confirm against a pristine
 * copy).  Visible symptoms:
 *   - all eight #include directives have no header name;
 *   - get() breaks off inside its for-loop header ("for(i=0;i") and the text
 *     resumes in the middle of a different function at "1) pb=1;";
 *   - the test on deviations[NUM_WORDS-1] runs straight into "0;i--)";
 *   - the fabs() comparison runs straight into the argv check of what looks
 *     like main(), so the end of the scoring code, all of doit(), and the
 *     opening of main() are absent;
 *   - the usage string ends in "[-v] " with no operand names.
 * Restore the missing spans from the original source rather than guessing.
 *
 * What the surviving text does show:
 *   - get(dbf, word): builds a key from word (strlen(word) bytes, no NUL),
 *     calls gdbm_fetch(), and returns 0 when the key is absent.  How the
 *     fetched value is decoded into the returned unsigned long is lost.
 *   - Scoring fragment: pg = g/goodemails capped at 1 (pb is capped at 1 the
 *     same way; its computation is lost, presumably the bad-mail analogue).
 *     prob = pb/(pb+pg) is clamped to [.01, .99].  pg is then reused to hold
 *     |prob - .5|, and a loop walks i downward, stopping once
 *     deviations[i-1] exceeds that distance by more than EPSILON.  What the
 *     loop does with words[] / probs[] / deviations[] is lost.
 *   - Tail of main(): "-v" as argv[1] sets verbose; exactly two further
 *     arguments are required, otherwise the usage line goes to stderr and the
 *     program returns 1.  argv[1+verbose] is opened as `bad` and
 *     argv[2+verbose] as `good`, both with GDBM_READER; each open is retried
 *     after sleep(1) for as long as it fails with errno == EAGAIN, and any
 *     other failure is reported with perror() and returns 1.  The totals
 *     bademails/goodemails are read with get() under the key " "; if either is
 *     0 the program prints an error and returns 1 (without closing the
 *     databases).  Otherwise doit(stdin, bad, good) runs; a nonzero result is
 *     reported via perror("stdin") and becomes the exit status.  Both
 *     databases are closed before returning.
 */
/* checkwordfreq.c by Michael Thorpe 2006-03-18 */ #include #include #include #include #include #include #include #include #define MAX_TOKEN 60 #define NUM_WORDS 15 #define MIN_WORDLEN 3 #define EPSILON 0.00000001 /* If >0, checkwordfreq will only look at the first HOW_MUCH_TO_CHECK bytes */ #define HOW_MUCH_TO_CHECK 10000 static char token[MAX_TOKEN+1]; static unsigned long bademails,goodemails; static double probs[NUM_WORDS]; static double deviations[NUM_WORDS]; static char *(words[NUM_WORDS]); static int verbose=0; static unsigned long get(GDBM_FILE dbf,char *word) { datum key,value; int i; unsigned long val=0; key.dptr=word; key.dsize=strlen(word); value=gdbm_fetch(dbf,key); if(!value.dptr) return(0); for(i=0;i1) pb=1; pg=(double)g/(double)goodemails; if(pg>1) pg=1; prob=pb/(pb+pg); if(prob>.99) prob=.99; if(prob<.01) prob=.01; } pg=prob-.5; if(pg<0) pg=-pg; if(deviations[NUM_WORDS-1]0;i--) { if(deviations[i-1]>pg+EPSILON) break; if(words[i-1] && fabs(deviations[i-1]-pg)1 && argv[1][0]=='-' && argv[1][1]=='v' && !argv[1][2]) verbose=1; if(argc != 3+verbose) { fprintf(stderr,"usage: checkwordfreq [-v] \n"); return(1); } do { bad=gdbm_open(argv[1+verbose],0,GDBM_READER,0,0); } while(!bad && EAGAIN==errno && (sleep(1) || 1)); if(!bad) { perror("gdbm_open"); return(1); } do { good=gdbm_open(argv[2+verbose],0,GDBM_READER,0,0); } while(!good && EAGAIN==errno && (sleep(1) || 1)); if(!good) { perror("gdbm_open"); return(1); } bademails=get(bad," "); goodemails=get(good," "); if(!bademails || !goodemails) { fprintf(stderr,"Must have at least 1 each good and bad email!\n"); return(1); } if(doit(stdin,bad,good)) { perror("stdin"); goterror=1; } gdbm_close(good); gdbm_close(bad); return(goterror); }