/*
 * statistical analysis of a file to see if it is binary or text data
 *
 * the file is sampled by repeatedly skipping SKIP characters and then
 * reading one; each sampled character is checked to see if it is
 * printable (white space is considered printable)
 *
 * if at least THRESHHOLD% of the sampled characters are NOT printable,
 * the file is reported as binary; otherwise it is reported as text
 * (IMPORTANT: THRESHHOLD is a percentage, not a fraction!)
 *
 * Matt Bishop, ECS 36A
 * -- May 22, 2024	Original program
 */

/* ask for the POSIX interfaces (getopt, lseek, ...) even in strict ISO C mode */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int skip = 10;			/* characters skipped before each sampled one */
double threshhold = 25.0;	/* a percent, *not* a fraction! */

/*
 * output functions
 */

/*
 * write all of s to descriptor fd
 * retries after a partial write or an interrupted call; any other
 * failure is ignored because there is nowhere left to report it
 */
static void dump(int fd, const char *s)
{
	size_t left = strlen(s);	/* bytes still to be written */

	while (left > 0) {
		ssize_t n = write(fd, s, left);

		if (n < 0 && errno == EINTR)
			continue;
		if (n <= 0)
			return;
		s += n;
		left -= (size_t) n;
	}
}

/* write the string s to standard output */
void outdump(const char *s)
{
	dump(1, s);
}

/* write the string s to standard error */
void errdump(const char *s)
{
	dump(2, s);
}

/*
 * write "s: <system error message>\n" to standard error
 * the message is the one for the value errno has on entry
 */
void perrdump(const char *s)
{
	int oops = errno;	/* save it; the writes below may change errno */

	errdump(s);
	errdump(": ");
	errdump(strerror(oops));
	errdump("\n");
}

/*
 * test to see if this is a binary file
 *
 * fname	name of the file, used only to label the output
 * fd		descriptor open for reading on that file
 *
 * samples the file and writes a one-line summary to standard output;
 * if lseek() or read() fails, the error is reported on standard error
 * and the summary covers the samples taken up to that point
 */
void bintest(const char *fname, int fd)
{
	unsigned char ch;	/* input character (unsigned for <ctype.h>) */
	long bin = 0;		/* number of non-printing characters */
	long nonbin = 0;	/* number of printing characters */
	long sample_size = 0;	/* number of characters read */
	double frac = 0.0;	/* percentage of non-printing characters */
	ssize_t x;		/* return value of read() */
	char outbuf[128];	/* holds the numeric parts of the summary */

	/*
	 * skip ahead SKIP characters and analyze the next character
	 */
	while (1) {
		/* go to the position, and check for errors */
		errno = 0;
		if (lseek(fd, (off_t) skip, SEEK_CUR) == (off_t) -1 && errno != 0) {
			perrdump("lseek");
			break;
		}

		/* read one character, retrying if a signal interrupted us */
		do {
			x = read(fd, &ch, 1);
		} while (x < 0 && errno == EINTR);

		/* if done, break out of the loop */
		if (x == 0)
			break;
		if (x != 1) {
			perrdump("read");
			break;
		}

		/* got another sample */
		sample_size++;

		/* classify the character; white space counts as printable */
		if (isprint(ch) || isspace(ch))
			nonbin++;
		else
			bin++;
	}

	/*
	 * now do the analysis
	 */
	/* what percent of the sampled characters are non-printable? */
	/* (with no samples, leave it at 0 rather than computing 0/0) */
	if (sample_size > 0)
		frac = ((double) bin) / ((double) sample_size) * 100.0;

	/* print out the results; the name is written directly so a */
	/* long file name cannot overflow or truncate the buffer */
	outdump(fname);
	snprintf(outbuf, sizeof outbuf, ": %ld samples,", sample_size);
	outdump(outbuf);
	snprintf(outbuf, sizeof outbuf,
		" %ld non-binary, %ld (%0.2f%%) binary; a ", nonbin, bin, frac);
	outdump(outbuf);
	outdump(frac < threshhold ? "text" : "binary");
	outdump(" file\n");
}

/*
 * process integer in str (the argument of -s); check for under/overflow
 *
 * returns the value of the base 10 integer in str
 * if str is empty, has trailing junk, or does not fit in an int, an
 * error message is written to standard error and the program exits
 */
int getint(const char *str)
{
	char *endptr;	/* points to char just beyond end of integer */
	long retval;	/* the integer being read */

	/* read the integer in base 10, set endptr to char after end */
	/* clear errno first so a stale ERANGE is not mistaken for ours */
	errno = 0;
	retval = strtol(str, &endptr, 10);

	/* if nothing was converted, or junk follows the number, */
	/* let the user know and quit */
	if (endptr == str || *endptr != '\0') {
		errdump("Invalid parameter for -s: '");
		errdump(str);
		errdump("'\n");
		exit(EXIT_FAILURE);
	}

	/* out of range for a long, or for the int we return */
	if (errno == ERANGE || retval < INT_MIN || retval > INT_MAX) {
		errno = ERANGE;
		perrdump("strtol");
		exit(EXIT_FAILURE);
	}

	/* return the integer */
	return (int) retval;
}

/*
 * process double in str (the argument of -t); check for under/overflow
 *
 * returns the value of the floating point number in str
 * if str is empty, has trailing junk, or is out of range, an error
 * message is written to standard error and the program exits
 */
double getdouble(const char *str)
{
	char *endptr;	/* points to char just beyond end of number */
	double retval;	/* the double being read */

	/* read the double, set endptr to char after end */
	/* clear errno first so a stale ERANGE is not mistaken for ours */
	errno = 0;
	retval = strtod(str, &endptr);

	/* if nothing was converted, or junk follows the number, */
	/* let the user know and quit */
	if (endptr == str || *endptr != '\0') {
		errdump("Invalid parameter for -t: '");
		errdump(str);
		errdump("'\n");
		exit(EXIT_FAILURE);
	}

	/* either underflow or overflow happened */
	if (errno == ERANGE) {
		perrdump("strtod");
		exit(EXIT_FAILURE);
	}

	/* return the double */
	return retval;
}

/*
 * say how to use this; pn is the program name to show
 */
void usage(const char *pn)
{
	/* give the usage error message */
	errdump("Usage: ");
	errdump(pn);
	errdump(" [ -s skip ] [-t threshhold ] file [ ... ]\n");
}

/*
 * In the beginning . . .
 *
 * exit status is the number of input files that could not be opened
 * (capped at 255), or EXIT_FAILURE for a command line error
 */
int main(int argc, char *argv[])
{
	int fd;		/* descriptor to current input file */
	int opt;	/* the option being processed */
	int i;		/* counter in a for loop */
	int rv = 0;	/* number of unreadable input files */

	/*
	 * process options
	 */
	while ((opt = getopt(argc, argv, "s:t:")) != -1) {
		switch (opt) {
		case 's':		/* change skip distance */
			skip = getint(optarg);
			/* a negative skip would seek before the start of the file */
			if (skip < 0) {
				errdump(argv[0]);
				errdump(": skip must not be negative\n");
				exit(EXIT_FAILURE);
			}
			break;
		case 't':		/* change threshhold */
			threshhold = getdouble(optarg);
			break;
		default:		/* no idea */
			usage(argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	/* check there is a file argument */
	if (optind == argc) {
		errdump(argv[0]);
		errdump(": need at least 1 file name\n");
		return EXIT_FAILURE;
	}

	/* walk the argument list */
	for (i = optind; i < argc; i++) {
		/* open the file */
		if ((fd = open(argv[i], O_RDONLY)) < 0) {
			perrdump(argv[i]);
			rv++;
			continue;
		}
		/* do the test */
		bintest(argv[i], fd);
		/* done with this file */
		(void) close(fd);
	}

	/* that's all folks! */
	/* only the low 8 bits of the status reach the parent, so cap */
	/* the count to keep many failures from wrapping around to 0 */
	return rv > 255 ? 255 : rv;
}