/*
* statistical analysis of file to see if it is binary or text data
*
* SKIP characters are skipped, then the next one is read and checked to see if it is printable
* (white space is considered printable)
*
* if at least THRESHHOLD% of the sampled characters are not printable, it's
* binary; otherwise it's text (IMPORTANT: THRESHHOLD is a percentage, not a
* fraction!)
*
* Matt Bishop, ECS 36A
* -- May 22, 2024 Original program
*/
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * tunable parameters; main() overwrites them from the -s and -t options
 */
int skip = 10;              /* characters skipped before each sampled one */
double threshhold = 25.0;   /* percent (*not* a fraction!) of binary samples */
                            /* at which a file is reported as binary */
/*
 * getopt(3) interface, declared by hand
 */
int getopt(int argc, char *argv[], char *optstring);
extern int optind;          /* index into argv of the next argument */
extern char *optarg;        /* argument of the option just parsed */
/*
 * bintest -- sample the stream and report whether it looks like text or binary
 *
 * parameters: fname   name printed in the report
 *             fp      open stream to sample; it should be seekable, since
 *                     sampling stops as soon as fseek() fails
 * globals:    skip        characters skipped before each sampled character
 *             threshhold  percent of binary samples at which the file is
 *                         reported as binary
 * output:     one line on stdout giving the counts and the verdict
 */
void bintest(char *fname, FILE *fp)
{
    int ch;                 /* input character */
    int bin = 0;            /* number of binary (non-printing) samples */
    int nonbin = 0;         /* number of printing samples */
    int sample_size = 0;    /* number of characters sampled */
    double frac;            /* percentage of binary samples */

    /*
     * skip ahead SKIP characters, then read and classify one character
     */
    while (fseek(fp, skip, SEEK_CUR) != -1){
        /* seeking past the end succeeds; the read is what detects EOF */
        if ((ch = fgetc(fp)) == EOF)
            break;
        sample_size++;
        /*
         * isprint() is true for ' ' but false for tab, newline and the
         * other white space, so test for those separately; otherwise
         * every line ending in a text file counts as binary
         */
        if (isprint(ch) || isspace(ch))
            nonbin++;
        else
            bin++;
    }

    /*
     * nothing sampled (empty file, file shorter than the skip distance,
     * or an unseekable stream): 0/0 is not a meaningful percentage, so
     * say so instead of printing NaN and a bogus verdict
     */
    if (sample_size == 0){
        printf("%s: 0 samples; too short to classify\n", fname);
        return;
    }

    /* what percent of the sampled characters are non-printable? */
    frac = ((double) bin) / ((double) sample_size) * 100;

    /* print out the results */
    printf("%s: %d samples,", fname, sample_size);
    printf(" %d non-binary, %d (%0.2f%%) binary; a %s file\n",
        nonbin, bin, frac, frac < threshhold ? "text" : "binary");
}
/*
 * getint -- convert the decimal integer in str to an int
 *
 * parameters: str   NUL-terminated text to convert
 * returns:    the value of the integer
 * errors:     prints a message on stderr and exits with EXIT_FAILURE if str
 *             holds no digits, has junk after the number, or the value
 *             does not fit in an int
 */
int getint(char *str)
{
    char *endptr;   /* points to char just beyond end of integer */
    long retval;    /* the integer being read */

    /* strtol only sets errno on failure, so clear any stale value first */
    errno = 0;
    /* read the integer in base 10, set endptr to char after end */
    retval = strtol(str, &endptr, 10);

    /* endptr == str means no digits were found at all; */
    /* a non-NUL at endptr means there's junk following the number */
    if (endptr == str || *endptr != '\0'){
        fprintf(stderr, "Invalid parameter for -s: '%s'\n", str);
        exit(EXIT_FAILURE);
    }
    /* out of range for a long, or in range for a long but not an int */
    if (errno == ERANGE || retval < INT_MIN || retval > INT_MAX){
        errno = ERANGE;
        perror("strtol");
        exit(EXIT_FAILURE);
    }
    /* return the integer */
    return((int) retval);
}
/*
 * getdouble -- convert the floating point number in str to a double
 *
 * parameters: str   NUL-terminated text to convert
 * returns:    the value of the number
 * errors:     prints a message on stderr and exits with EXIT_FAILURE if str
 *             holds no number, has junk after the number, or strtod()
 *             reports a range error
 */
double getdouble(char *str)
{
    char *endptr;   /* points to char just beyond end of number */
    double retval;  /* the number being read */

    /* strtod only sets errno on failure, so clear any stale value first */
    errno = 0;
    /* read the double, set endptr to char after end */
    retval = strtod(str, &endptr);

    /* endptr == str means nothing was converted; */
    /* a non-NUL at endptr means there's junk following the number */
    if (endptr == str || *endptr != '\0'){
        fprintf(stderr, "Invalid parameter for -t: '%s'\n", str);
        exit(EXIT_FAILURE);
    }
    /* either underflow or overflow happened */
    if (errno == ERANGE){
        perror("strtod");
        exit(EXIT_FAILURE);
    }
    /* return the double */
    return(retval);
}
/*
 * usage -- print a one-line synopsis of the command line on stderr
 *
 * parameters: pn   program name to show in the synopsis
 */
void usage(char *pn)
{
    (void) fprintf(stderr,
        "Usage: %s [ -s skip ] [-t threshhold ] file [ ... ]\n", pn);
}
/*
 * main -- parse the options, then classify each file named on the command line
 *
 * options:  -s skip        sets the global skip distance used when sampling
 *           -t threshhold  sets the global threshhold percentage
 * returns:  the number of files that could not be opened (0 if all were
 *           read); EXIT_FAILURE on a bad command line
 */
int main(int argc, char *argv[])
{
    FILE *fp;       /* pointer to current input file */
    int opt;        /* the option being processed */
    int i;          /* counter in a for loop */
    int rv = 0;     /* number of unreadable input files */

    /*
     * process options
     */
    while ((opt = getopt(argc, argv, "s:t:")) != -1){
        switch(opt){
        case 's':       /* change skip distance */
            skip = getint(optarg);
            /* a negative distance makes the very first seek fail, */
            /* so nothing would ever be sampled */
            if (skip < 0){
                fprintf(stderr, "%s: skip must not be negative\n",
                    argv[0]);
                exit(EXIT_FAILURE);
            }
            break;
        case 't':       /* change threshhold */
            threshhold = getdouble(optarg);
            break;
        default:        /* no idea */
            usage(argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    /* check there is a file argument */
    if (optind == argc){
        fprintf(stderr, "%s: need at least 1 file name\n", argv[0]);
        return(EXIT_FAILURE);
    }

    /* walk the argument list */
    for (i = optind; argv[i] != NULL; i++){
        /*
         * open in binary mode: the files may hold arbitrary bytes, and
         * the sampler seeks by a non-zero relative offset, which ISO C
         * only permits on binary streams
         */
        if ((fp = fopen(argv[i], "rb")) == NULL){
            perror(argv[i]);
            rv++;
            continue;
        }
        /* do the test */
        bintest(argv[i], fp);
        /* done with this file */
        (void) fclose(fp);
    }

    /* that's all folks! */
    return(rv);
}
/*
 * ECS 36A, Programming & Problem Solving
 * Version of April 2, 2024 at 12:13PM
 * You can get the raw source code here.
 */