stats2.c

/*
 * statistical analysis of file to see if it is binary or text data
 *
 * every SKIP'th character is read, and checked to see if it is printable
 * (white space is considered printable)
 *
 * if at least THRESHHOLD% of the sampled characters are not printable, it's
 * binary; otherwise it's text (IMPORTANT: THRESHHOLD is a percentage, not a
 * fraction!)
 *
 * Matt Bishop, ECS 36A
 * -- May 22, 2024              Original program
 */
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * tunable parameters; main() overwrites them from the -s and -t options
 *
 * skip: bintest() seeks forward this many characters and then reads one,
 * so one character is sampled after each run of "skip" skipped characters
 *
 * threshhold: a file is reported as binary when the percentage of sampled
 * characters that are not printable is at least this value
 */
int skip = 10;			/* read 1 out of every SKIP characters */
double threshhold = 25.0;	/* a percent, *not* a fraction! */

/*
 * forward declarations
 *
 * getopt and its variables are declared by hand here; no header that
 * declares them is included in this file (presumably the C library
 * supplies the definitions -- confirm on the target platform)
 */
int getopt(int, char *[], char *);
extern char *optarg;		/* argument to option */
extern int optind;		/* option index in argv */

/*
 * test to see if this is a binary file
 *
 * The stream is sampled by repeatedly seeking forward "skip" characters
 * and reading the one that follows.  Each sampled character is classed as
 * printable (white space counts as printable) or not.  The file is
 * reported as binary when the percentage of non-printable samples is at
 * least "threshhold", and as text otherwise.
 *
 * fname	name used to label the report
 * fp		stream open for reading; it must support fseek
 *
 * The report is one line on stdout; nothing is returned.  If no character
 * could be sampled the percentage is reported as 0.
 */
void bintest(char *fname, FILE *fp)
{
        int ch;                         /* input character */
        int bin = 0;                    /* number of non-printing characters */
        int nonbin = 0;                 /* number of printing characters */
        int sample_size = 0;            /* number of characters sampled */
        double frac = 0.0;              /* percentage of non-printing characters */

        /*
         * skip ahead SKIP characters and analyze the character
         * (fseek reports failure with any nonzero value, so test for 0)
         */
        while(fseek(fp, skip, SEEK_CUR) == 0){
                /* if done, break out of the loop */
                if ((ch = fgetc(fp)) == EOF)
                        break;
                /* read another character */
                sample_size += 1;
                /* classify the character; isprint alone rejects tabs */
                /* and newlines, so white space is accepted explicitly */
                if (isprint(ch) || isspace(ch))
                        nonbin++;
                else
                        bin++;
        }

        /*
         * now do the analysis
         */
        /* what percent of the sampled characters are non-printable? */
        /* with no samples, leave it at 0 rather than computing 0/0   */
        if (sample_size > 0)
                frac = ((double) bin) / ((double) sample_size) * 100;

        /* print out the results */
        printf("%s: %d samples,", fname, sample_size);
        printf(" %d non-binary, %d (%0.2f%%) binary; a ", nonbin, bin, frac);
        printf("%s", frac < threshhold ? "text" : "binary");
        printf(" file\n");
}

/*
 * process integer in str; check for under/overflow
 *
 * str	NUL-terminated argument of the -s option, in base 10
 *
 * Returns the value of str as an int.  If str holds no digits, has
 * trailing junk, or names a value that does not fit in an int, a message
 * is written to stderr and the program exits with EXIT_FAILURE.
 */
int getint(char *str)
{
	char *endptr;	/* points to char just beyond end of integer */
	long retval;	/* the integer being read */

	/* strtol sets errno only on failure, so clear any stale value */
	errno = 0;

	/* read the integer in base 10, set endptr to char after end */
	retval = strtol(str, &endptr, 10);

	/* if nothing was converted, or it's not NUL (junk follows), */
	/* let the user know and quit                                 */
	if (endptr == str || *endptr != '\0'){
		fprintf(stderr, "Invalid parameter for -s: '%s'\n", str);
		exit(EXIT_FAILURE);
	}
	/* either underflow or overflow happened in strtol, or the */
	/* value fits in a long but not in the int we return       */
	if (errno == ERANGE || retval < INT_MIN || retval > INT_MAX){
		errno = ERANGE;
		perror("strtol");
		exit(EXIT_FAILURE);
	}

	/* return the integer */
	return((int) retval);
}

/*
 * process floating point number in str; check for under/overflow
 *
 * str	NUL-terminated argument of the -t option
 *
 * Returns the value of str as a double.  If str holds no number, has
 * trailing junk, or strtod reports a range error, a message is written
 * to stderr and the program exits with EXIT_FAILURE.
 */
double getdouble(char *str)
{
	char *endptr;	/* points to char just beyond end of number */
	double retval;	/* the number being read */

	/* strtod sets errno only on failure, so clear any stale value */
	errno = 0;

	/* read the double, set endptr to char after end */
	retval = strtod(str, &endptr);

	/* if nothing was converted, or it's not NUL (junk follows), */
	/* let the user know and quit                                 */
	if (endptr == str || *endptr != '\0'){
		fprintf(stderr, "Invalid parameter for -t: '%s'\n", str);
		exit(EXIT_FAILURE);
	}
	/* either underflow or overflow happened */
	if (errno == ERANGE){
		perror("strtod");
		exit(EXIT_FAILURE);
	}

	/* return the double */
	return(retval);
}

/*
 * print a one-line synopsis of the command line on stderr
 *
 * pn	text shown as the program name in the synopsis
 */
void usage(char *pn)
{
	/* the format is split per field; the pieces join into one string */
	fprintf(stderr,
		"Usage: %s"
		" [ -s skip ] [-t threshhold ]"
		" file [ ... ]\n",
		pn);
}
	

/*
 * In the beginning . . .
 *
 * Reads the -s (skip distance) and -t (threshhold) options, then runs
 * bintest() on every file named on the command line.
 *
 * Returns EXIT_FAILURE for an unknown option or when no file is named;
 * otherwise the number of files that could not be opened, capped at 255
 * so the count cannot wrap around to a "success" status of 0.
 */
int main(int argc, char *argv[])
{
	FILE *fp;		/* pointer to current input file */
	int opt;		/* the option being processed */
	int i;			/* counter in a for loop */
	int rv = 0;		/* number of unreadable input files */

	/* 
	 * process options
	 */
	while((opt = getopt(argc, argv, "s:t:")) != -1){
		switch(opt){
		case 's':			/* change skip distance */
			skip = getint(optarg);
			break;
		case 't':			/* change threshhold */
			threshhold = getdouble(optarg);
			break;
		default:			/* no idea */
			usage(argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	/* check there is a file argument */
	if (optind == argc){
		fprintf(stderr, "%s: need at least 1 file name\n", argv[0]);
		return(EXIT_FAILURE);
	}

	/* walk the argument list */
	for (i = optind; argv[i] != NULL; i++){
		/* open the file in binary mode: bintest() seeks with a   */
		/* nonzero SEEK_CUR offset, which the C standard defines  */
		/* only for binary streams                                */
		if ((fp = fopen(argv[i], "rb")) == NULL){
			perror(argv[i]);
			rv++;
			continue;
		}
		/* do the test */
		bintest(argv[i], fp);
		/* done with this file */
		(void) fclose(fp);
	}

	/* that's all folks! */
	/* POSIX passes only the low 8 bits of the status to the parent, */
	/* so 256 failures would otherwise look like success             */
	return(rv > 255 ? 255 : rv);
}




UC Davis sigil
Matt Bishop
Office: 2209 Watershed Sciences
Phone: +1 (530) 752-8060
Email: mabishop@ucdavis.edu
ECS 36A, Programming & Problem Solving
Version of April 2, 2024 at 12:13PM

You can get the raw source code here.

Valid HTML 4.01 Transitional Built with BBEdit Built on a Macintosh