/*
 * statistical analysis of file to see if it is binary or text data
 *
 * the file is sampled by repeatedly skipping SKIP characters and then
 * reading one character, which is checked to see if it is printable
 * (white space is considered printable)
 *
 * if at least THRESHHOLD% of the sampled characters are NOT printable,
 * it's binary; otherwise it's text
 * (IMPORTANT: THRESHHOLD is a percentage, not a fraction!)
 *
 * Matt Bishop, ECS 36A
 * -- May 22, 2024	Original program
 */
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>

int skip = 10;			/* characters skipped before each sampled one */
double threshhold = 25.0;	/* a percent, *not* a fraction! */

/*
 * forward declarations
 *
 * getopt is declared by hand (with its POSIX signature) rather than by
 * including <unistd.h>, so the file also builds in strict ISO C mode
 */
int getopt(int, char * const [], const char *);
extern char *optarg;		/* argument to option */
extern int optind;		/* option index in argv */

/*
 * test to see if this is a binary file
 *
 * fname	name of the file, used only in the messages
 * fp		stream to sample; it is read from its current position
 *
 * prints one line of statistics and the verdict on stdout; if no
 * character could be sampled (empty or too-short file, or a stream that
 * cannot seek) it says so instead of dividing by zero
 */
void bintest(char *fname, FILE *fp)
{
	int ch;			/* input character */
	long bin = 0;		/* number of non-printing characters */
	long nonbin = 0;	/* number of printing characters */
	long sample_size = 0;	/* number of characters sampled */
	double frac;		/* percentage of non-printing characters */

	/*
	 * skip ahead SKIP characters and analyze the next character
	 * fseek reports failure with any nonzero value
	 */
	while (fseek(fp, (long) skip, SEEK_CUR) == 0) {
		/* if done, break out of the loop */
		if ((ch = fgetc(fp)) == EOF)
			break;
		/* sampled another character */
		sample_size++;
		/*
		 * classify the character; isprint() accepts only the
		 * blank, so isspace() is needed to count tabs, newlines
		 * and the other white space as printable too
		 */
		if (isprint(ch) || isspace(ch))
			nonbin++;
		else
			bin++;
	}

	/* a read error means the statistics cannot be trusted */
	if (ferror(fp)) {
		fprintf(stderr, "%s: read error\n", fname);
		return;
	}

	/* nothing sampled: there is no percentage to compute */
	if (sample_size == 0) {
		printf("%s: 0 samples, cannot classify\n", fname);
		return;
	}

	/*
	 * now do the analysis
	 */
	/* what percent of the sampled characters are non-printable? */
	frac = (double) bin / (double) sample_size * 100.0;

	/* print out the results */
	printf("%s: %ld samples,", fname, sample_size);
	printf(" %ld non-binary, %ld (%0.2f%%) binary; a ", nonbin, bin, frac);
	printf("%s file\n", frac < threshhold ? "text" : "binary");
}

/*
 * process integer in str (the argument of -s)
 *
 * returns the value as an int; prints a message and exits with
 * EXIT_FAILURE if str is empty, has trailing junk, or does not fit
 * in an int
 */
int getint(char *str)
{
	char *endptr;	/* points to char just beyond end of integer */
	long retval;	/* the integer being read */

	/* strtol only ever sets errno, so clear any stale value first */
	errno = 0;
	/* read the integer in base 10, set endptr to char after end */
	retval = strtol(str, &endptr, 10);

	/* nothing was converted, or there's junk following the number */
	/* let the user know and quit */
	if (endptr == str || *endptr != '\0') {
		fprintf(stderr, "Invalid parameter for -s: '%s'\n", str);
		exit(EXIT_FAILURE);
	}

	/* out of range for a long, or for the int we return */
	if (errno == ERANGE || retval < INT_MIN || retval > INT_MAX) {
		errno = ERANGE;
		perror("strtol");
		exit(EXIT_FAILURE);
	}

	/* return the integer */
	return (int) retval;
}

/*
 * process floating point number in str (the argument of -t)
 *
 * returns the value as a double; prints a message and exits with
 * EXIT_FAILURE if str is empty, has trailing junk, or strtod reports
 * a range error
 */
double getdouble(char *str)
{
	char *endptr;	/* points to char just beyond end of number */
	double retval;	/* the number being read */

	/* strtod only ever sets errno, so clear any stale value first */
	errno = 0;
	/* read the double, set endptr to char after end */
	retval = strtod(str, &endptr);

	/* nothing was converted, or there's junk following the number */
	/* let the user know and quit */
	if (endptr == str || *endptr != '\0') {
		fprintf(stderr, "Invalid parameter for -t: '%s'\n", str);
		exit(EXIT_FAILURE);
	}

	/* either underflow or overflow happened */
	if (errno == ERANGE) {
		perror("strtod");
		exit(EXIT_FAILURE);
	}

	/* return the double */
	return retval;
}

/*
 * say how to use this
 *
 * pn	name the program was invoked as
 */
void usage(char *pn)
{
	fprintf(stderr, "Usage: %s [ -s skip ] [-t threshhold ] file [ ... ]\n", pn);
}

/*
 * In the beginning . . .
 *
 * exit status is the number of input files that could not be opened
 * (capped at 255), or EXIT_FAILURE for a bad command line
 */
int main(int argc, char *argv[])
{
	FILE *fp;	/* pointer to current input file */
	int opt;	/* the option being processed */
	int i;		/* counter in a for loop */
	int rv = 0;	/* number of unreadable input files */

	/*
	 * process options
	 */
	while ((opt = getopt(argc, argv, "s:t:")) != -1) {
		switch (opt) {
		case 's':	/* change skip distance */
			skip = getint(optarg);
			/*
			 * a negative distance would try to seek to before
			 * the start of the file, so nothing would ever be
			 * sampled
			 */
			if (skip < 0) {
				fprintf(stderr, "Invalid parameter for -s: '%s'\n", optarg);
				exit(EXIT_FAILURE);
			}
			break;
		case 't':	/* change threshhold */
			threshhold = getdouble(optarg);
			break;
		default:	/* no idea */
			usage(argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	/* check there is a file argument */
	if (optind == argc) {
		fprintf(stderr, "%s: need at least 1 file name\n", argv[0]);
		return EXIT_FAILURE;
	}

	/* walk the argument list */
	for (i = optind; i < argc; i++) {
		/*
		 * open the file in binary mode: the bytes must not be
		 * translated, and ISO C only allows relative seeks with
		 * a nonzero offset on binary streams
		 */
		if ((fp = fopen(argv[i], "rb")) == NULL) {
			perror(argv[i]);
			rv++;
			continue;
		}
		/* do the test */
		bintest(argv[i], fp);
		/* done with this file */
		(void) fclose(fp);
	}

	/*
	 * that's all folks!
	 * only the low 8 bits of the status reach the parent on POSIX
	 * systems, so cap it to keep 256 failures from looking like success
	 */
	return rv > 255 ? 255 : rv;
}
|
ECS 36A, Programming & Problem Solving Version of April 2, 2024 at 12:13PM
|
You can get the raw source code here. |