plan9port

[fork] Plan 9 from user space
git clone git://src.adamsgaard.dk/plan9port # fast
git clone https://src.adamsgaard.dk/plan9port.git # slow
Log | Files | Refs | README | LICENSE Back to index

file.c (23931B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <ctype.h>
      5 #include <mach.h>
      6 
      7 /*
      8  * file - determine type of file
      9  */
     10 #define	LENDIAN(p)	((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
     11 
     12 uchar	buf[6001];
     13 short	cfreq[140];
     14 short	wfreq[50];
     15 int	nbuf;
     16 Dir*	mbuf;
     17 int	fd;
     18 char 	*fname;
     19 char	*slash;
     20 
     21 enum
     22 {
     23 	Cword,
     24 	Fword,
     25 	Aword,
     26 	Alword,
     27 	Lword,
     28 	I1,
     29 	I2,
     30 	I3,
     31 	Clatin	= 128,
     32 	Cbinary,
     33 	Cnull,
     34 	Ceascii,
     35 	Cutf,
     36 };
     37 struct
     38 {
     39 	char*	word;
     40 	int	class;
     41 } dict[] =
     42 {
     43 	"PATH",		Lword,
     44 	"TEXT",		Aword,
     45 	"adt",		Alword,
     46 	"aggr",		Alword,
     47 	"alef",		Alword,
     48 	"array",	Lword,
     49 	"block",	Fword,
     50 	"chan",		Alword,
     51 	"char",		Cword,
     52 	"common",	Fword,
     53 	"con",		Lword,
     54 	"data",		Fword,
     55 	"dimension",	Fword,
     56 	"double",	Cword,
     57 	"extern",	Cword,
     58 	"bio",		I2,
     59 	"float",	Cword,
     60 	"fn",		Lword,
     61 	"function",	Fword,
     62 	"h",		I3,
     63 	"implement",	Lword,
     64 	"import",	Lword,
     65 	"include",	I1,
     66 	"int",		Cword,
     67 	"integer",	Fword,
     68 	"iota",		Lword,
     69 	"libc",		I2,
     70 	"long",		Cword,
     71 	"module",	Lword,
     72 	"real",		Fword,
     73 	"ref",		Lword,
     74 	"register",	Cword,
     75 	"self",		Lword,
     76 	"short",	Cword,
     77 	"static",	Cword,
     78 	"stdio",	I2,
     79 	"struct",	Cword,
     80 	"subroutine",	Fword,
     81 	"u",		I2,
     82 	"void",		Cword,
     83 };
     84 
     85 /* codes for 'mode' field in language structure */
     86 enum	{
     87 		Normal	= 0,
     88 		First,		/* first entry for language spanning several ranges */
     89 		Multi,		/* later entries "   "       "  ... */
     90 		Shared,		/* codes used in several languages */
     91 	};
     92 
     93 struct
     94 {
     95 	int	mode;		/* see enum above */
     96 	int 	count;
     97 	int	low;
     98 	int	high;
     99 	char	*name;
    100 
    101 } language[] =
    102 {
    103 	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
    104 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
    105 	Normal,	0,	0x0370,	0x03FF,	"Greek",
    106 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
    107 	Normal,	0,	0x0530,	0x058F,	"Armenian",
    108 	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
    109 	Normal,	0,	0x0600,	0x06FF,	"Arabic",
    110 	Normal,	0,	0x0900,	0x097F,	"Devanagari",
    111 	Normal,	0,	0x0980,	0x09FF,	"Bengali",
    112 	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
    113 	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
    114 	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
    115 	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
    116 	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
    117 	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
    118 	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
    119 	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
    120 	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
    121 	Normal,	0,	0x1000,	0x105F,	"Tibetan",
    122 	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
    123 	Normal,	0,	0x3040,	0x30FF,	"Japanese",
    124 	Normal,	0,	0x3100,	0x312F,	"Chinese",
    125 	First,	0,	0x3130,	0x318F,	"Korean",
    126 	Multi,	0,	0x3400,	0x3D2F,	"Korean",
    127 	Shared,	0,	0x4e00,	0x9fff,	"CJK",
    128 	Normal,	0,	0,	0,	0,		/* terminal entry */
    129 };
    130 
    131 
    132 enum
    133 {
    134 	Fascii,		/* printable ascii */
    135 	Flatin,		/* latin 1*/
    136 	Futf,		/* UTf character set */
    137 	Fbinary,	/* binary */
    138 	Feascii,	/* ASCII with control chars */
    139 	Fnull,		/* NULL in file */
    140 } guess;
    141 
    142 void	bump_utf_count(Rune);
    143 int	cistrncmp(char*, char*, int);
    144 void	filetype(int);
    145 int	getfontnum(uchar*, uchar**);
    146 int	isas(void);
    147 int	isc(void);
    148 int	isenglish(void);
    149 int	ishp(void);
    150 int	ishtml(void);
    151 int	isrfc822(void);
    152 int	ismbox(void);
    153 int	islimbo(void);
    154 int	ismung(void);
    155 int	isp9bit(void);
    156 int	isp9font(void);
    157 int	isrtf(void);
    158 int	ismsdos(void);
    159 int	iself(void);
    160 int	istring(void);
    161 int	iff(void);
    162 int	long0(void);
    163 int	istar(void);
    164 int	p9bitnum(uchar*);
    165 int	p9subfont(uchar*);
    166 void	print_utf(void);
    167 void	type(char*, int);
    168 int	utf_count(void);
    169 void	wordfreq(void);
    170 
    171 int	(*call[])(void) =
    172 {
    173 	long0,		/* recognizable by first 4 bytes */
    174 	istring,	/* recognizable by first string */
    175 	iff,		/* interchange file format (strings) */
    176 	isrfc822,	/* email file */
    177 	ismbox,		/* mail box */
    178 	istar,		/* recognizable by tar checksum */
    179 	ishtml,		/* html keywords */
    180 /*	iscint,		/* compiler/assembler intermediate */
    181 	islimbo,	/* limbo source */
    182 	isc,		/* c & alef compiler key words */
    183 	isas,		/* assembler key words */
    184 	ismung,		/* entropy compressed/encrypted */
    185 	isp9font,	/* plan 9 font */
    186 	isp9bit,	/* plan 9 image (as from /dev/window) */
    187 	isenglish,	/* char frequency English */
    188 	isrtf,		/* rich text format */
    189 	ismsdos,	/* msdos exe (virus file attachement) */
    190 	iself,		/* ELF (foreign) executable */
    191 	0
    192 };
    193 
    194 int mime;
    195 
    196 #define OCTET	"application/octet-stream\n"
    197 #define PLAIN	"text/plain\n"
    198 
    199 void
    200 main(int argc, char *argv[])
    201 {
    202 	int i, j, maxlen;
    203 	char *cp;
    204 	Rune r;
    205 
    206 	ARGBEGIN{
    207 	case 'm':
    208 		mime = 1;
    209 		break;
    210 	default:
    211 		fprint(2, "usage: file [-m] [file...]\n");
    212 		exits("usage");
    213 	}ARGEND;
    214 
    215 	maxlen = 0;
    216 	if(mime == 0 || argc > 1){
    217 		for(i = 0; i < argc; i++) {
    218 			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
    219 					;
    220 			if(j > maxlen)
    221 				maxlen = j;
    222 		}
    223 	}
    224 	if (argc <= 0) {
    225 		if(!mime)
    226 			print ("stdin: ");
    227 		filetype(0);
    228 	}
    229 	else {
    230 		for(i = 0; i < argc; i++)
    231 			type(argv[i], maxlen);
    232 	}
    233 	exits(0);
    234 }
    235 
    236 void
    237 type(char *file, int nlen)
    238 {
    239 	Rune r;
    240 	int i;
    241 	char *p;
    242 
    243 	if(nlen > 0){
    244 		slash = 0;
    245 		for (i = 0, p = file; *p; i++) {
    246 			if (*p == '/')			/* find rightmost slash */
    247 				slash = p;
    248 			p += chartorune(&r, p);		/* count runes */
    249 		}
    250 		print("%s:%*s",file, nlen-i+1, "");
    251 	}
    252 	fname = file;
    253 	if ((fd = open(file, OREAD)) < 0) {
    254 		print("cannot open\n");
    255 		return;
    256 	}
    257 	filetype(fd);
    258 	close(fd);
    259 }
    260 
    261 void
    262 filetype(int fd)
    263 {
    264 	Rune r;
    265 	int i, f, n;
    266 	char *p, *eob;
    267 
    268 	free(mbuf);
    269 	mbuf = dirfstat(fd);
    270 	if(mbuf == nil){
    271 		print("cannot stat: %r\n");
    272 		return;
    273 	}
    274 	if(mbuf->mode & DMDIR) {
    275 		print(mime ? "text/directory\n" : "directory\n");
    276 		return;
    277 	}
    278 	if(mbuf->type != 'M' && mbuf->type != '|') {
    279 		print(mime ? OCTET : "special file #%c/%s\n",
    280 			mbuf->type, mbuf->name);
    281 		return;
    282 	}
    283 	nbuf = read(fd, buf, sizeof(buf)-1);
    284 
    285 	if(nbuf < 0) {
    286 		print("cannot read\n");
    287 		return;
    288 	}
    289 	if(nbuf == 0) {
    290 		print(mime ? PLAIN : "empty file\n");
    291 		return;
    292 	}
    293 	buf[nbuf] = 0;
    294 
    295 	/*
    296 	 * build histogram table
    297 	 */
    298 	memset(cfreq, 0, sizeof(cfreq));
    299 	for (i = 0; language[i].name; i++)
    300 		language[i].count = 0;
    301 	eob = (char *)buf+nbuf;
    302 	for(n = 0, p = (char *)buf; p < eob; n++) {
    303 		if (!fullrune(p, eob-p) && eob-p < UTFmax)
    304 			break;
    305 		p += chartorune(&r, p);
    306 		if (r == 0)
    307 			f = Cnull;
    308 		else if (r <= 0x7f) {
    309 			if (!isprint(r) && !isspace(r))
    310 				f = Ceascii;	/* ASCII control char */
    311 			else f = r;
    312 		} else if (r == 0x080) {
    313 			bump_utf_count(r);
    314 			f = Cutf;
    315 		} else if (r < 0xA0)
    316 				f = Cbinary;	/* Invalid Runes */
    317 		else if (r <= 0xff)
    318 				f = Clatin;	/* Latin 1 */
    319 		else {
    320 			bump_utf_count(r);
    321 			f = Cutf;		/* UTF extension */
    322 		}
    323 		cfreq[f]++;			/* ASCII chars peg directly */
    324 	}
    325 	/*
    326 	 * gross classify
    327 	 */
    328 	if (cfreq[Cbinary])
    329 		guess = Fbinary;
    330 	else if (cfreq[Cutf])
    331 		guess = Futf;
    332 	else if (cfreq[Clatin])
    333 		guess = Flatin;
    334 	else if (cfreq[Ceascii])
    335 		guess = Feascii;
    336 	else if (cfreq[Cnull] == n) {
    337 		print(mime ? OCTET : "first block all null bytes\n");
    338 		return;
    339 	}
    340 	else guess = Fascii;
    341 	/*
    342 	 * lookup dictionary words
    343 	 */
    344 	memset(wfreq, 0, sizeof(wfreq));
    345 	if(guess == Fascii || guess == Flatin || guess == Futf)
    346 		wordfreq();
    347 	/*
    348 	 * call individual classify routines
    349 	 */
    350 	for(i=0; call[i]; i++)
    351 		if((*call[i])())
    352 			return;
    353 
    354 	/*
    355 	 * if all else fails,
    356 	 * print out gross classification
    357 	 */
    358 	if (nbuf < 100 && !mime)
    359 		print(mime ? PLAIN : "short ");
    360 	if (guess == Fascii)
    361 		print(mime ? PLAIN : "Ascii\n");
    362 	else if (guess == Feascii)
    363 		print(mime ? PLAIN : "extended ascii\n");
    364 	else if (guess == Flatin)
    365 		print(mime ? PLAIN : "latin ascii\n");
    366 	else if (guess == Futf && utf_count() < 4)
    367 		print_utf();
    368 	else print(mime ? OCTET : "binary\n");
    369 }
    370 
    371 void
    372 bump_utf_count(Rune r)
    373 {
    374 	int low, high, mid;
    375 
    376 	high = sizeof(language)/sizeof(language[0])-1;
    377 	for (low = 0; low < high;) {
    378 		mid = (low+high)/2;
    379 		if (r >=language[mid].low) {
    380 			if (r <= language[mid].high) {
    381 				language[mid].count++;
    382 				break;
    383 			} else low = mid+1;
    384 		} else high = mid;
    385 	}
    386 }
    387 
    388 int
    389 utf_count(void)
    390 {
    391 	int i, count;
    392 
    393 	count = 0;
    394 	for (i = 0; language[i].name; i++)
    395 		if (language[i].count > 0)
    396 			switch (language[i].mode) {
    397 			case Normal:
    398 			case First:
    399 				count++;
    400 				break;
    401 			default:
    402 				break;
    403 			}
    404 	return count;
    405 }
    406 
    407 int
    408 chkascii(void)
    409 {
    410 	int i;
    411 
    412 	for (i = 'a'; i < 'z'; i++)
    413 		if (cfreq[i])
    414 			return 1;
    415 	for (i = 'A'; i < 'Z'; i++)
    416 		if (cfreq[i])
    417 			return 1;
    418 	return 0;
    419 }
    420 
    421 int
    422 find_first(char *name)
    423 {
    424 	int i;
    425 
    426 	for (i = 0; language[i].name != 0; i++)
    427 		if (language[i].mode == First
    428 			&& strcmp(language[i].name, name) == 0)
    429 			return i;
    430 	return -1;
    431 }
    432 
    433 void
    434 print_utf(void)
    435 {
    436 	int i, printed, j;
    437 
    438 	if(mime){
    439 		print(PLAIN);
    440 		return;
    441 	}
    442 	if (chkascii()) {
    443 		printed = 1;
    444 		print("Ascii");
    445 	} else
    446 		printed = 0;
    447 	for (i = 0; language[i].name; i++)
    448 		if (language[i].count) {
    449 			switch(language[i].mode) {
    450 			case Multi:
    451 				j = find_first(language[i].name);
    452 				if (j < 0)
    453 					break;
    454 				if (language[j].count > 0)
    455 					break;
    456 				/* Fall through */
    457 			case Normal:
    458 			case First:
    459 				if (printed)
    460 					print(" & ");
    461 				else printed = 1;
    462 				print("%s", language[i].name);
    463 				break;
    464 			case Shared:
    465 			default:
    466 				break;
    467 			}
    468 		}
    469 	if(!printed)
    470 		print("UTF");
    471 	print(" text\n");
    472 }
    473 
    474 void
    475 wordfreq(void)
    476 {
    477 	int low, high, mid, r;
    478 	uchar *p, *p2, c;
    479 
    480 	p = buf;
    481 	for(;;) {
    482 		while (p < buf+nbuf && !isalpha(*p))
    483 			p++;
    484 		if (p >= buf+nbuf)
    485 			return;
    486 		p2 = p;
    487 		while(p < buf+nbuf && isalpha(*p))
    488 			p++;
    489 		c = *p;
    490 		*p = 0;
    491 		high = sizeof(dict)/sizeof(dict[0]);
    492 		for(low = 0;low < high;) {
    493 			mid = (low+high)/2;
    494 			r = strcmp(dict[mid].word, (char*)p2);
    495 			if(r == 0) {
    496 				wfreq[dict[mid].class]++;
    497 				break;
    498 			}
    499 			if(r < 0)
    500 				low = mid+1;
    501 			else
    502 				high = mid;
    503 		}
    504 		*p++ = c;
    505 	}
    506 }
    507 
    508 typedef struct Filemagic Filemagic;
    509 struct Filemagic {
    510 	ulong x;
    511 	ulong mask;
    512 	char *desc;
    513 	char *mime;
    514 };
    515 
    516 Filemagic long0tab[] = {
    517 	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file\n",	OCTET,
    518 	0x31636170,	0xFFFFFFFF,	"pac3 audio file\n",	OCTET,
    519 	0x32636170,	0xFFFF00FF,	"pac4 audio file\n",	OCTET,
    520 	0xBA010000,	0xFFFFFFFF,	"mpeg system stream\n",	OCTET,
    521 	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable\n", OCTET,
    522 	0x04034B50,	0xFFFFFFFF,	"zip archive\n", "application/zip\n",
    523 	070707,		0xFFFF,		"cpio archive\n", OCTET,
    524 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi\n",
    525 	0xfffa0000,	0xfffe0000,	"mp3 audio\n",	"audio/mpeg\n",
    526 	0xcafebabe,	0xFFFFFFFF,	"Mach-O fat executable\n",	"application/x-mach-binary\n",
    527 	0xfeedface,	0xFFFFFFFE,	"Mach-O executable\n",	"application/x-mach-binary\n",
    528 	0xbebafeca,	0xFFFFFFFF,	"Java class\n",	"application/x-java-applet\n",
    529 };
    530 
    531 int
    532 filemagic(Filemagic *tab, int ntab, ulong x)
    533 {
    534 	int i;
    535 
    536 	for(i=0; i<ntab; i++)
    537 		if((x&tab[i].mask) == tab[i].x){
    538 			print(mime ? tab[i].mime : tab[i].desc);
    539 			return 1;
    540 		}
    541 	return 0;
    542 }
    543 
    544 int
    545 long0(void)
    546 {
    547 /*	Fhdr *f; */
    548 	long x;
    549 
    550 	seek(fd, 0, 0);		/* reposition to start of file */
    551 /*
    552 	if(crackhdr(fd, &f)) {
    553 		print(mime ? OCTET : "%s\n", f.name);
    554 		return 1;
    555 	}
    556 */
    557 	x = LENDIAN(buf);
    558 	if(filemagic(long0tab, nelem(long0tab), x))
    559 		return 1;
    560 	return 0;
    561 }
    562 
    563 /* from tar.c */
    564 enum { NAMSIZ = 100, TBLOCK = 512 };
    565 
    566 union	hblock
    567 {
    568 	char	dummy[TBLOCK];
    569 	struct	header
    570 	{
    571 		char	name[NAMSIZ];
    572 		char	mode[8];
    573 		char	uid[8];
    574 		char	gid[8];
    575 		char	size[12];
    576 		char	mtime[12];
    577 		char	chksum[8];
    578 		char	linkflag;
    579 		char	linkname[NAMSIZ];
    580 		/* rest are defined by POSIX's ustar format; see p1003.2b */
    581 		char	magic[6];	/* "ustar" */
    582 		char	version[2];
    583 		char	uname[32];
    584 		char	gname[32];
    585 		char	devmajor[8];
    586 		char	devminor[8];
    587 		char	prefix[155];  /* if non-null, path = prefix "/" name */
    588 	} dbuf;
    589 };
    590 
    591 int
    592 checksum(union hblock *hp)
    593 {
    594 	int i;
    595 	char *cp;
    596 	struct header *hdr = &hp->dbuf;
    597 
    598 	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
    599 		*cp = ' ';
    600 	i = 0;
    601 	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
    602 		i += *cp & 0xff;
    603 	return i;
    604 }
    605 
    606 int
    607 istar(void)
    608 {
    609 	int chksum;
    610 	char tblock[TBLOCK];
    611 	union hblock *hp = (union hblock *)tblock;
    612 	struct header *hdr = &hp->dbuf;
    613 
    614 	seek(fd, 0, 0);		/* reposition to start of file */
    615 	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
    616 		return 0;
    617 	chksum = strtol(hdr->chksum, 0, 8);
    618 	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
    619 		if (strcmp(hdr->magic, "ustar") == 0)
    620 			print(mime? "application/x-ustar\n":
    621 				"posix tar archive\n");
    622 		else
    623 			print(mime? "application/x-tar\n": "tar archive\n");
    624 		return 1;
    625 	}
    626 	return 0;
    627 }
    628 
    629 /*
    630  * initial words to classify file
    631  */
    632 struct	FILE_STRING
    633 {
    634 	char 	*key;
    635 	char	*filetype;
    636 	int	length;
    637 	char	*mime;
    638 } file_string[] =
    639 {
    640 	"!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream",
    641 	"!<arch>\n",		"archive",			8,	"application/octet-stream",
    642 	"070707",		"cpio archive - ascii header",	6,	"application/octet-stream",
    643 	"%!",			"postscript",			2,	"application/postscript",
    644 	"\004%!",		"postscript",			3,	"application/postscript",
    645 	"x T post",		"troff output for post",	8,	"application/troff",
    646 	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
    647 	"x T utf",		"troff output for UTF",		7,	"application/troff",
    648 	"x T 202",		"troff output for 202",		7,	"application/troff",
    649 	"x T aps",		"troff output for aps",		7,	"application/troff",
    650 	"GIF",			"GIF image", 			3,	"image/gif",
    651 	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
    652 	"%PDF",			"PDF",				4,	"application/pdf",
    653 	"<html>\n",		"HTML file",			7,	"text/html",
    654 	"<HTML>\n",		"HTML file",			7,	"text/html",
    655 	"compressed\n",		"Compressed image or subfont",	11,	"application/octet-stream",
    656 	"\111\111\052\000",	"tiff",				4,	"image/tiff",
    657 	"\115\115\000\052",	"tiff",				4,	"image/tiff",
    658 	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
    659 	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
    660 	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
    661 	"\106\117\126\142",	"x3f",				4,	"image/x3f",
    662 	"BM",			"bmp",				2,	"image/bmp",
    663 	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream",
    664 	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
    665 	"\033%-12345X",	"HPJCL file",		9,	"application/hpjcl",
    666 	"ID3",			"mp3 audio with id3",	3,	"audio/mpeg",
    667 	0,0,0,0
    668 };
    669 
    670 int
    671 istring(void)
    672 {
    673 	int i, j;
    674 	struct FILE_STRING *p;
    675 
    676 	for(p = file_string; p->key; p++) {
    677 		if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
    678 			if(mime)
    679 				print("%s\n", p->mime);
    680 			else
    681 				print("%s\n", p->filetype);
    682 			return 1;
    683 		}
    684 	}
    685 	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
    686 		for(i = 5; i < nbuf; i++)
    687 			if(buf[i] == '\n')
    688 				break;
    689 		if(mime)
    690 			print(OCTET);
    691 		else
    692 			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
    693 		return 1;
    694 	}
    695 	if(buf[0]=='#' && buf[1]=='!'){
    696 		i=2;
    697 		for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
    698 			if(buf[j] == '/')
    699 				i = j+1;
    700 		if(mime)
    701 			print(PLAIN);
    702 		else
    703 			print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
    704 		return 1;
    705 	}
    706 	return 0;
    707 }
    708 
    709 int
    710 iff(void)
    711 {
    712 	if (strncmp((char*)buf, "FORM", 4) == 0 &&
    713 	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
    714 		print("%s\n", mime? "audio/x-aiff": "aiff audio");
    715 		return 1;
    716 	}
    717 	return 0;
    718 }
    719 
    720 char*	html_string[] =
    721 {
    722 	"title",
    723 	"body",
    724 	"head",
    725 	"strong",
    726 	"h1",
    727 	"h2",
    728 	"h3",
    729 	"h4",
    730 	"h5",
    731 	"h6",
    732 	"ul",
    733 	"li",
    734 	"dl",
    735 	"br",
    736 	"em",
    737 	0,
    738 };
    739 
    740 int
    741 ishtml(void)
    742 {
    743 	uchar *p, *q;
    744 	int i, count;
    745 
    746 		/* compare strings between '<' and '>' to html table */
    747 	count = 0;
    748 	p = buf;
    749 	for(;;) {
    750 		while (p < buf+nbuf && *p != '<')
    751 			p++;
    752 		p++;
    753 		if (p >= buf+nbuf)
    754 			break;
    755 		if(*p == '/')
    756 			p++;
    757 		q = p;
    758 		while(p < buf+nbuf && *p != '>')
    759 			p++;
    760 		if (p >= buf+nbuf)
    761 			break;
    762 		for(i = 0; html_string[i]; i++) {
    763 			if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
    764 				if(count++ > 4) {
    765 					print(mime ? "text/html\n" : "HTML file\n");
    766 					return 1;
    767 				}
    768 				break;
    769 			}
    770 		}
    771 		p++;
    772 	}
    773 	return 0;
    774 }
    775 
    776 char*	rfc822_string[] =
    777 {
    778 	"from:",
    779 	"date:",
    780 	"to:",
    781 	"subject:",
    782 	"received:",
    783 	"reply to:",
    784 	"sender:",
    785 	0,
    786 };
    787 
    788 int
    789 isrfc822(void)
    790 {
    791 
    792 	char *p, *q, *r;
    793 	int i, count;
    794 
    795 	count = 0;
    796 	p = (char*)buf;
    797 	for(;;) {
    798 		q = strchr(p, '\n');
    799 		if(q == nil)
    800 			break;
    801 		*q = 0;
    802 		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
    803 			count++;
    804 			*q = '\n';
    805 			p = q+1;
    806 			continue;
    807 		}
    808 		*q = '\n';
    809 		if(*p != '\t' && *p != ' '){
    810 			r = strchr(p, ':');
    811 			if(r == 0 || r > q)
    812 				break;
    813 			for(i = 0; rfc822_string[i]; i++) {
    814 				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
    815 					count++;
    816 					break;
    817 				}
    818 			}
    819 		}
    820 		p = q+1;
    821 	}
    822 	if(count >= 3){
    823 		print(mime ? "message/rfc822\n" : "email file\n");
    824 		return 1;
    825 	}
    826 	return 0;
    827 }
    828 
    829 int
    830 ismbox(void)
    831 {
    832 	char *p, *q;
    833 
    834 	p = (char*)buf;
    835 	q = strchr(p, '\n');
    836 	if(q == nil)
    837 		return 0;
    838 	*q = 0;
    839 	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
    840 		print(mime ? "text/plain\n" : "mail box\n");
    841 		return 1;
    842 	}
    843 	*q = '\n';
    844 	return 0;
    845 }
    846 
    847 int
    848 isc(void)
    849 {
    850 	int n;
    851 
    852 	n = wfreq[I1];
    853 	/*
    854 	 * includes
    855 	 */
    856 	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
    857 		goto yes;
    858 	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
    859 		goto yes;
    860 	/*
    861 	 * declarations
    862 	 */
    863 	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
    864 		goto yes;
    865 	/*
    866 	 * assignments
    867 	 */
    868 	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
    869 		goto yes;
    870 	return 0;
    871 
    872 yes:
    873 	if(mime){
    874 		print(PLAIN);
    875 		return 1;
    876 	}
    877 	if(wfreq[Alword] > 0)
    878 		print("alef program\n");
    879 	else
    880 		print("c program\n");
    881 	return 1;
    882 }
    883 
    884 int
    885 islimbo(void)
    886 {
    887 
    888 	/*
    889 	 * includes
    890 	 */
    891 	if(wfreq[Lword] < 4)
    892 		return 0;
    893 	print(mime ? PLAIN : "limbo program\n");
    894 	return 1;
    895 }
    896 
    897 int
    898 isas(void)
    899 {
    900 
    901 	/*
    902 	 * includes
    903 	 */
    904 	if(wfreq[Aword] < 2)
    905 		return 0;
    906 	print(mime ? PLAIN : "as program\n");
    907 	return 1;
    908 }
    909 
    910 /*
    911  * low entropy means encrypted
    912  */
    913 int
    914 ismung(void)
    915 {
    916 	int i, bucket[8];
    917 	float cs;
    918 
    919 	if(nbuf < 64)
    920 		return 0;
    921 	memset(bucket, 0, sizeof(bucket));
    922 	for(i=0; i<64; i++)
    923 		bucket[(buf[i]>>5)&07] += 1;
    924 
    925 	cs = 0.;
    926 	for(i=0; i<8; i++)
    927 		cs += (bucket[i]-8)*(bucket[i]-8);
    928 	cs /= 8.;
    929 	if(cs <= 24.322) {
    930 		if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
    931 			print(mime ? OCTET : "compressed\n");
    932 		else
    933 			print(mime ? OCTET : "encrypted\n");
    934 		return 1;
    935 	}
    936 	return 0;
    937 }
    938 
    939 /*
    940  * english by punctuation and frequencies
    941  */
    942 int
    943 isenglish(void)
    944 {
    945 	int vow, comm, rare, badpun, punct;
    946 	char *p;
    947 
    948 	if(guess != Fascii && guess != Feascii)
    949 		return 0;
    950 	badpun = 0;
    951 	punct = 0;
    952 	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
    953 		switch(*p) {
    954 		case '.':
    955 		case ',':
    956 		case ')':
    957 		case '%':
    958 		case ';':
    959 		case ':':
    960 		case '?':
    961 			punct++;
    962 			if(p[1] != ' ' && p[1] != '\n')
    963 				badpun++;
    964 		}
    965 	if(badpun*5 > punct)
    966 		return 0;
    967 	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
    968 		return 0;
    969 	if(2*cfreq[';'] > cfreq['e'])
    970 		return 0;
    971 
    972 	vow = 0;
    973 	for(p="AEIOU"; *p; p++) {
    974 		vow += cfreq[(uchar)*p];
    975 		vow += cfreq[tolower((uchar)*p)];
    976 	}
    977 	comm = 0;
    978 	for(p="ETAION"; *p; p++) {
    979 		comm += cfreq[(uchar)*p];
    980 		comm += cfreq[tolower((uchar)*p)];
    981 	}
    982 	rare = 0;
    983 	for(p="VJKQXZ"; *p; p++) {
    984 		rare += cfreq[(uchar)*p];
    985 		rare += cfreq[tolower((uchar)*p)];
    986 	}
    987 	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
    988 		print(mime ? PLAIN : "English text\n");
    989 		return 1;
    990 	}
    991 	return 0;
    992 }
    993 
    994 /*
    995  * pick up a number with
    996  * syntax _*[0-9]+_
    997  */
    998 #define	P9BITLEN	12
    999 int
   1000 p9bitnum(uchar *bp)
   1001 {
   1002 	int n, c, len;
   1003 
   1004 	len = P9BITLEN;
   1005 	while(*bp == ' ') {
   1006 		bp++;
   1007 		len--;
   1008 		if(len <= 0)
   1009 			return -1;
   1010 	}
   1011 	n = 0;
   1012 	while(len > 1) {
   1013 		c = *bp++;
   1014 		if(!isdigit(c))
   1015 			return -1;
   1016 		n = n*10 + c-'0';
   1017 		len--;
   1018 	}
   1019 	if(*bp != ' ')
   1020 		return -1;
   1021 	return n;
   1022 }
   1023 
   1024 int
   1025 depthof(char *s, int *newp)
   1026 {
   1027 	char *es;
   1028 	int d;
   1029 
   1030 	*newp = 0;
   1031 	es = s+12;
   1032 	while(s<es && *s==' ')
   1033 		s++;
   1034 	if(s == es)
   1035 		return -1;
   1036 	if('0'<=*s && *s<='9')
   1037 		return 1<<atoi(s);
   1038 
   1039 	*newp = 1;
   1040 	d = 0;
   1041 	while(s<es && *s!=' '){
   1042 		s++;	/* skip letter */
   1043 		d += strtoul(s, &s, 10);
   1044 	}
   1045 
   1046 	switch(d){
   1047 	case 32:
   1048 	case 24:
   1049 	case 16:
   1050 	case 8:
   1051 		return d;
   1052 	}
   1053 	return -1;
   1054 }
   1055 
   1056 int
   1057 isp9bit(void)
   1058 {
   1059 	int dep, lox, loy, hix, hiy, px, new;
   1060 	ulong t;
   1061 	long len;
   1062 	char *newlabel;
   1063 
   1064 	newlabel = "old ";
   1065 
   1066 	dep = depthof((char*)buf + 0*P9BITLEN, &new);
   1067 	if(new)
   1068 		newlabel = "";
   1069 	lox = p9bitnum(buf + 1*P9BITLEN);
   1070 	loy = p9bitnum(buf + 2*P9BITLEN);
   1071 	hix = p9bitnum(buf + 3*P9BITLEN);
   1072 	hiy = p9bitnum(buf + 4*P9BITLEN);
   1073 	if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
   1074 		return 0;
   1075 
   1076 	if(dep < 8){
   1077 		px = 8/dep;	/* pixels per byte */
   1078 		/* set l to number of bytes of data per scan line */
   1079 		if(lox >= 0)
   1080 			len = (hix+px-1)/px - lox/px;
   1081 		else{	/* make positive before divide */
   1082 			t = (-lox)+px-1;
   1083 			t = (t/px)*px;
   1084 			len = (t+hix+px-1)/px;
   1085 		}
   1086 	}else
   1087 		len = (hix-lox)*dep/8;
   1088 	len *= (hiy-loy);		/* col length */
   1089 	len += 5*P9BITLEN;		/* size of initial ascii */
   1090 
   1091 	/*
   1092 	 * for image file, length is non-zero and must match calculation above
   1093 	 * for /dev/window and /dev/screen the length is always zero
   1094 	 * for subfont, the subfont header should follow immediately.
   1095 	 */
   1096 	if (len != 0 && mbuf->length == 0) {
   1097 		print("%splan 9 image\n", newlabel);
   1098 		return 1;
   1099 	}
   1100 	if (mbuf->length == len) {
   1101 		print("%splan 9 image\n", newlabel);
   1102 		return 1;
   1103 	}
   1104 	/* Ghostscript sometimes produces a little extra on the end */
   1105 	if (mbuf->length < len+P9BITLEN) {
   1106 		print("%splan 9 image\n", newlabel);
   1107 		return 1;
   1108 	}
   1109 	if (p9subfont(buf+len)) {
   1110 		print("%ssubfont file\n", newlabel);
   1111 		return 1;
   1112 	}
   1113 	return 0;
   1114 }
   1115 
   1116 int
   1117 p9subfont(uchar *p)
   1118 {
   1119 	int n, h, a;
   1120 
   1121 		/* if image too big, assume it's a subfont */
   1122 	if (p+3*P9BITLEN > buf+sizeof(buf))
   1123 		return 1;
   1124 
   1125 	n = p9bitnum(p + 0*P9BITLEN);	/* char count */
   1126 	if (n < 0)
   1127 		return 0;
   1128 	h = p9bitnum(p + 1*P9BITLEN);	/* height */
   1129 	if (h < 0)
   1130 		return 0;
   1131 	a = p9bitnum(p + 2*P9BITLEN);	/* ascent */
   1132 	if (a < 0)
   1133 		return 0;
   1134 	return 1;
   1135 }
   1136 
   1137 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
   1138 
   1139 int
   1140 isp9font(void)
   1141 {
   1142 	uchar *cp, *p;
   1143 	int i, n;
   1144 	char pathname[1024];
   1145 
   1146 	cp = buf;
   1147 	if (!getfontnum(cp, &cp))	/* height */
   1148 		return 0;
   1149 	if (!getfontnum(cp, &cp))	/* ascent */
   1150 		return 0;
   1151 	for (i = 0; 1; i++) {
   1152 		if (!getfontnum(cp, &cp))	/* min */
   1153 			break;
   1154 		if (!getfontnum(cp, &cp))	/* max */
   1155 			return 0;
   1156 		while (WHITESPACE(*cp))
   1157 			cp++;
   1158 		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
   1159 				;
   1160 			/* construct a path name, if needed */
   1161 		n = 0;
   1162 		if (*p != '/' && slash) {
   1163 			n = slash-fname+1;
   1164 			if (n < sizeof(pathname))
   1165 				memcpy(pathname, fname, n);
   1166 			else n = 0;
   1167 		}
   1168 		if (n+cp-p < sizeof(pathname)) {
   1169 			memcpy(pathname+n, p, cp-p);
   1170 			n += cp-p;
   1171 			pathname[n] = 0;
   1172 			if (access(pathname, AEXIST) < 0)
   1173 				return 0;
   1174 		}
   1175 	}
   1176 	if (i) {
   1177 		print(mime ? "text/plain\n" : "font file\n");
   1178 		return 1;
   1179 	}
   1180 	return 0;
   1181 }
   1182 
   1183 int
   1184 getfontnum(uchar *cp, uchar **rp)
   1185 {
   1186 	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
   1187 		cp++;
   1188 	if (*cp < '0' || *cp > '9')
   1189 		return 0;
   1190 	strtoul((char *)cp, (char **)rp, 0);
   1191 	if (!WHITESPACE(**rp))
   1192 		return 0;
   1193 	return 1;
   1194 }
   1195 
   1196 int
   1197 isrtf(void)
   1198 {
   1199 	if(strstr((char *)buf, "\\rtf1")){
   1200 		print(mime ? "application/rtf\n" : "rich text format\n");
   1201 		return 1;
   1202 	}
   1203 	return 0;
   1204 }
   1205 
   1206 int
   1207 ismsdos(void)
   1208 {
   1209 	if (buf[0] == 0x4d && buf[1] == 0x5a){
   1210 		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
   1211 		return 1;
   1212 	}
   1213 	return 0;
   1214 }
   1215 
   1216 int
   1217 iself(void)
   1218 {
   1219 	static char *cpu[] = {		/* NB: incomplete and arbitary list */
   1220 		nil,
   1221 	/*1*/	"WE32100",
   1222 	/*2*/	"SPARC",
   1223 	/*3*/	"i386",
   1224 	/*4*/	"M68000",
   1225 	/*5*/	"M88000",
   1226 	/*6*/	"i486",
   1227 	/*7*/	"i860",
   1228 	/*8*/	"R3000",
   1229 	/*9*/	"S370",
   1230 	/*10*/	"R4000",
   1231 		nil, nil, nil, nil,
   1232 	/*15*/	"HP-PA",
   1233 		nil,
   1234 		nil,
   1235 	/*18*/	"sparc v8+",
   1236 	/*19*/	"i960",
   1237 	/*20*/	"PPC-32",
   1238 	/*21*/	"PPC-64",
   1239 		nil, nil, nil, nil,
   1240 		nil, nil, nil, nil, nil,
   1241 		nil, nil, nil, nil, nil,
   1242 		nil, nil, nil, nil,
   1243 	/*40*/	"ARM",
   1244 	/*41*/	"Alpha",
   1245 		nil,
   1246 	/*43*/	"sparc v9",
   1247 		nil, nil,
   1248 		nil, nil, nil, nil,
   1249 	/*50*/	"IA-64",
   1250 		nil, nil, nil, nil, nil,
   1251 		nil, nil, nil, nil, nil,
   1252 		nil,
   1253 	/*62*/	"AMD64",
   1254 		nil, nil, nil,
   1255 		nil, nil, nil, nil, nil,
   1256 		nil, nil, nil, nil,
   1257 	/*75*/	"VAX",
   1258 	};
   1259 
   1260 
   1261 	if (memcmp(buf, "\177ELF", 4) == 0){
   1262 		/* gcc misparses \x7FELF as \x7FE L F */
   1263 		if (!mime){
   1264 			int n = (buf[19] << 8) | buf[18];
   1265 			char *p = "unknown";
   1266 
   1267 			if (n > 0 && n < nelem(cpu) && cpu[n])
   1268 				p = cpu[n];
   1269 			else {
   1270 				/* try the other byte order */
   1271 				n = (buf[18] << 8) | buf[19];
   1272 				if (n > 0 && n < nelem(cpu) && cpu[n])
   1273 					p = cpu[n];
   1274 			}
   1275 			print("%s ELF executable\n", p);
   1276 		}
   1277 		else
   1278 			print("application/x-elf-executable");
   1279 		return 1;
   1280 	}
   1281 
   1282 	return 0;
   1283 }