file.c (23931B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include <mach.h> 6 7 /* 8 * file - determine type of file 9 */ 10 #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24)) 11 12 uchar buf[6001]; 13 short cfreq[140]; 14 short wfreq[50]; 15 int nbuf; 16 Dir* mbuf; 17 int fd; 18 char *fname; 19 char *slash; 20 21 enum 22 { 23 Cword, 24 Fword, 25 Aword, 26 Alword, 27 Lword, 28 I1, 29 I2, 30 I3, 31 Clatin = 128, 32 Cbinary, 33 Cnull, 34 Ceascii, 35 Cutf, 36 }; 37 struct 38 { 39 char* word; 40 int class; 41 } dict[] = 42 { 43 "PATH", Lword, 44 "TEXT", Aword, 45 "adt", Alword, 46 "aggr", Alword, 47 "alef", Alword, 48 "array", Lword, 49 "block", Fword, 50 "chan", Alword, 51 "char", Cword, 52 "common", Fword, 53 "con", Lword, 54 "data", Fword, 55 "dimension", Fword, 56 "double", Cword, 57 "extern", Cword, 58 "bio", I2, 59 "float", Cword, 60 "fn", Lword, 61 "function", Fword, 62 "h", I3, 63 "implement", Lword, 64 "import", Lword, 65 "include", I1, 66 "int", Cword, 67 "integer", Fword, 68 "iota", Lword, 69 "libc", I2, 70 "long", Cword, 71 "module", Lword, 72 "real", Fword, 73 "ref", Lword, 74 "register", Cword, 75 "self", Lword, 76 "short", Cword, 77 "static", Cword, 78 "stdio", I2, 79 "struct", Cword, 80 "subroutine", Fword, 81 "u", I2, 82 "void", Cword, 83 }; 84 85 /* codes for 'mode' field in language structure */ 86 enum { 87 Normal = 0, 88 First, /* first entry for language spanning several ranges */ 89 Multi, /* later entries " " " ... */ 90 Shared, /* codes used in several languages */ 91 }; 92 93 struct 94 { 95 int mode; /* see enum above */ 96 int count; 97 int low; 98 int high; 99 char *name; 100 101 } language[] = 102 { 103 Normal, 0, 0x0080, 0x0080, "Extended Latin", 104 Normal, 0, 0x0100, 0x01FF, "Extended Latin", 105 Normal, 0, 0x0370, 0x03FF, "Greek", 106 Normal, 0, 0x0400, 0x04FF, "Cyrillic", 107 Normal, 0, 0x0530, 0x058F, "Armenian", 108 Normal, 0, 0x0590, 0x05FF, "Hebrew", 109 Normal, 0, 0x0600, 0x06FF, "Arabic", 110 Normal, 0, 0x0900, 0x097F, "Devanagari", 111 Normal, 0, 0x0980, 0x09FF, "Bengali", 112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi", 113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati", 114 Normal, 0, 0x0B00, 0x0B7F, "Oriya", 115 Normal, 0, 0x0B80, 0x0BFF, "Tamil", 116 Normal, 0, 0x0C00, 0x0C7F, "Telugu", 117 Normal, 0, 0x0C80, 0x0CFF, "Kannada", 118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam", 119 Normal, 0, 0x0E00, 0x0E7F, "Thai", 120 Normal, 0, 0x0E80, 0x0EFF, "Lao", 121 Normal, 0, 0x1000, 0x105F, "Tibetan", 122 Normal, 0, 0x10A0, 0x10FF, "Georgian", 123 Normal, 0, 0x3040, 0x30FF, "Japanese", 124 Normal, 0, 0x3100, 0x312F, "Chinese", 125 First, 0, 0x3130, 0x318F, "Korean", 126 Multi, 0, 0x3400, 0x3D2F, "Korean", 127 Shared, 0, 0x4e00, 0x9fff, "CJK", 128 Normal, 0, 0, 0, 0, /* terminal entry */ 129 }; 130 131 132 enum 133 { 134 Fascii, /* printable ascii */ 135 Flatin, /* latin 1*/ 136 Futf, /* UTf character set */ 137 Fbinary, /* binary */ 138 Feascii, /* ASCII with control chars */ 139 Fnull, /* NULL in file */ 140 } guess; 141 142 void bump_utf_count(Rune); 143 int cistrncmp(char*, char*, int); 144 void filetype(int); 145 int getfontnum(uchar*, uchar**); 146 int isas(void); 147 int isc(void); 148 int isenglish(void); 149 int ishp(void); 150 int ishtml(void); 151 int isrfc822(void); 152 int ismbox(void); 153 int islimbo(void); 154 int ismung(void); 155 int isp9bit(void); 156 int isp9font(void); 157 int isrtf(void); 158 int ismsdos(void); 159 int iself(void); 160 int istring(void); 161 int iff(void); 162 int long0(void); 163 int istar(void); 164 int p9bitnum(uchar*); 165 int p9subfont(uchar*); 166 void print_utf(void); 167 void type(char*, int); 168 int utf_count(void); 169 void wordfreq(void); 170 171 int (*call[])(void) = 172 { 173 long0, /* recognizable by first 4 bytes */ 174 istring, /* recognizable by first string */ 175 iff, /* interchange file format (strings) */ 176 isrfc822, /* email file */ 177 ismbox, /* mail box */ 178 istar, /* recognizable by tar checksum */ 179 ishtml, /* html keywords */ 180 /* iscint, /* compiler/assembler intermediate */ 181 islimbo, /* limbo source */ 182 isc, /* c & alef compiler key words */ 183 isas, /* assembler key words */ 184 ismung, /* entropy compressed/encrypted */ 185 isp9font, /* plan 9 font */ 186 isp9bit, /* plan 9 image (as from /dev/window) */ 187 isenglish, /* char frequency English */ 188 isrtf, /* rich text format */ 189 ismsdos, /* msdos exe (virus file attachement) */ 190 iself, /* ELF (foreign) executable */ 191 0 192 }; 193 194 int mime; 195 196 #define OCTET "application/octet-stream\n" 197 #define PLAIN "text/plain\n" 198 199 void 200 main(int argc, char *argv[]) 201 { 202 int i, j, maxlen; 203 char *cp; 204 Rune r; 205 206 ARGBEGIN{ 207 case 'm': 208 mime = 1; 209 break; 210 default: 211 fprint(2, "usage: file [-m] [file...]\n"); 212 exits("usage"); 213 }ARGEND; 214 215 maxlen = 0; 216 if(mime == 0 || argc > 1){ 217 for(i = 0; i < argc; i++) { 218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp)) 219 ; 220 if(j > maxlen) 221 maxlen = j; 222 } 223 } 224 if (argc <= 0) { 225 if(!mime) 226 print ("stdin: "); 227 filetype(0); 228 } 229 else { 230 for(i = 0; i < argc; i++) 231 type(argv[i], maxlen); 232 } 233 exits(0); 234 } 235 236 void 237 type(char *file, int nlen) 238 { 239 Rune r; 240 int i; 241 char *p; 242 243 if(nlen > 0){ 244 slash = 0; 245 for (i = 0, p = file; *p; i++) { 246 if (*p == '/') /* find rightmost slash */ 247 slash = p; 248 p += chartorune(&r, p); /* count runes */ 249 } 250 print("%s:%*s",file, nlen-i+1, ""); 251 } 252 fname = file; 253 if ((fd = open(file, OREAD)) < 0) { 254 print("cannot open\n"); 255 return; 256 } 257 filetype(fd); 258 close(fd); 259 } 260 261 void 262 filetype(int fd) 263 { 264 Rune r; 265 int i, f, n; 266 char *p, *eob; 267 268 free(mbuf); 269 mbuf = dirfstat(fd); 270 if(mbuf == nil){ 271 print("cannot stat: %r\n"); 272 return; 273 } 274 if(mbuf->mode & DMDIR) { 275 print(mime ? "text/directory\n" : "directory\n"); 276 return; 277 } 278 if(mbuf->type != 'M' && mbuf->type != '|') { 279 print(mime ? OCTET : "special file #%c/%s\n", 280 mbuf->type, mbuf->name); 281 return; 282 } 283 nbuf = read(fd, buf, sizeof(buf)-1); 284 285 if(nbuf < 0) { 286 print("cannot read\n"); 287 return; 288 } 289 if(nbuf == 0) { 290 print(mime ? PLAIN : "empty file\n"); 291 return; 292 } 293 buf[nbuf] = 0; 294 295 /* 296 * build histogram table 297 */ 298 memset(cfreq, 0, sizeof(cfreq)); 299 for (i = 0; language[i].name; i++) 300 language[i].count = 0; 301 eob = (char *)buf+nbuf; 302 for(n = 0, p = (char *)buf; p < eob; n++) { 303 if (!fullrune(p, eob-p) && eob-p < UTFmax) 304 break; 305 p += chartorune(&r, p); 306 if (r == 0) 307 f = Cnull; 308 else if (r <= 0x7f) { 309 if (!isprint(r) && !isspace(r)) 310 f = Ceascii; /* ASCII control char */ 311 else f = r; 312 } else if (r == 0x080) { 313 bump_utf_count(r); 314 f = Cutf; 315 } else if (r < 0xA0) 316 f = Cbinary; /* Invalid Runes */ 317 else if (r <= 0xff) 318 f = Clatin; /* Latin 1 */ 319 else { 320 bump_utf_count(r); 321 f = Cutf; /* UTF extension */ 322 } 323 cfreq[f]++; /* ASCII chars peg directly */ 324 } 325 /* 326 * gross classify 327 */ 328 if (cfreq[Cbinary]) 329 guess = Fbinary; 330 else if (cfreq[Cutf]) 331 guess = Futf; 332 else if (cfreq[Clatin]) 333 guess = Flatin; 334 else if (cfreq[Ceascii]) 335 guess = Feascii; 336 else if (cfreq[Cnull] == n) { 337 print(mime ? OCTET : "first block all null bytes\n"); 338 return; 339 } 340 else guess = Fascii; 341 /* 342 * lookup dictionary words 343 */ 344 memset(wfreq, 0, sizeof(wfreq)); 345 if(guess == Fascii || guess == Flatin || guess == Futf) 346 wordfreq(); 347 /* 348 * call individual classify routines 349 */ 350 for(i=0; call[i]; i++) 351 if((*call[i])()) 352 return; 353 354 /* 355 * if all else fails, 356 * print out gross classification 357 */ 358 if (nbuf < 100 && !mime) 359 print(mime ? PLAIN : "short "); 360 if (guess == Fascii) 361 print(mime ? PLAIN : "Ascii\n"); 362 else if (guess == Feascii) 363 print(mime ? PLAIN : "extended ascii\n"); 364 else if (guess == Flatin) 365 print(mime ? PLAIN : "latin ascii\n"); 366 else if (guess == Futf && utf_count() < 4) 367 print_utf(); 368 else print(mime ? OCTET : "binary\n"); 369 } 370 371 void 372 bump_utf_count(Rune r) 373 { 374 int low, high, mid; 375 376 high = sizeof(language)/sizeof(language[0])-1; 377 for (low = 0; low < high;) { 378 mid = (low+high)/2; 379 if (r >=language[mid].low) { 380 if (r <= language[mid].high) { 381 language[mid].count++; 382 break; 383 } else low = mid+1; 384 } else high = mid; 385 } 386 } 387 388 int 389 utf_count(void) 390 { 391 int i, count; 392 393 count = 0; 394 for (i = 0; language[i].name; i++) 395 if (language[i].count > 0) 396 switch (language[i].mode) { 397 case Normal: 398 case First: 399 count++; 400 break; 401 default: 402 break; 403 } 404 return count; 405 } 406 407 int 408 chkascii(void) 409 { 410 int i; 411 412 for (i = 'a'; i < 'z'; i++) 413 if (cfreq[i]) 414 return 1; 415 for (i = 'A'; i < 'Z'; i++) 416 if (cfreq[i]) 417 return 1; 418 return 0; 419 } 420 421 int 422 find_first(char *name) 423 { 424 int i; 425 426 for (i = 0; language[i].name != 0; i++) 427 if (language[i].mode == First 428 && strcmp(language[i].name, name) == 0) 429 return i; 430 return -1; 431 } 432 433 void 434 print_utf(void) 435 { 436 int i, printed, j; 437 438 if(mime){ 439 print(PLAIN); 440 return; 441 } 442 if (chkascii()) { 443 printed = 1; 444 print("Ascii"); 445 } else 446 printed = 0; 447 for (i = 0; language[i].name; i++) 448 if (language[i].count) { 449 switch(language[i].mode) { 450 case Multi: 451 j = find_first(language[i].name); 452 if (j < 0) 453 break; 454 if (language[j].count > 0) 455 break; 456 /* Fall through */ 457 case Normal: 458 case First: 459 if (printed) 460 print(" & "); 461 else printed = 1; 462 print("%s", language[i].name); 463 break; 464 case Shared: 465 default: 466 break; 467 } 468 } 469 if(!printed) 470 print("UTF"); 471 print(" text\n"); 472 } 473 474 void 475 wordfreq(void) 476 { 477 int low, high, mid, r; 478 uchar *p, *p2, c; 479 480 p = buf; 481 for(;;) { 482 while (p < buf+nbuf && !isalpha(*p)) 483 p++; 484 if (p >= buf+nbuf) 485 return; 486 p2 = p; 487 while(p < buf+nbuf && isalpha(*p)) 488 p++; 489 c = *p; 490 *p = 0; 491 high = sizeof(dict)/sizeof(dict[0]); 492 for(low = 0;low < high;) { 493 mid = (low+high)/2; 494 r = strcmp(dict[mid].word, (char*)p2); 495 if(r == 0) { 496 wfreq[dict[mid].class]++; 497 break; 498 } 499 if(r < 0) 500 low = mid+1; 501 else 502 high = mid; 503 } 504 *p++ = c; 505 } 506 } 507 508 typedef struct Filemagic Filemagic; 509 struct Filemagic { 510 ulong x; 511 ulong mask; 512 char *desc; 513 char *mime; 514 }; 515 516 Filemagic long0tab[] = { 517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET, 518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET, 519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET, 520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET, 521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET, 522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip\n", 523 070707, 0xFFFF, "cpio archive\n", OCTET, 524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi\n", 525 0xfffa0000, 0xfffe0000, "mp3 audio\n", "audio/mpeg\n", 526 0xcafebabe, 0xFFFFFFFF, "Mach-O fat executable\n", "application/x-mach-binary\n", 527 0xfeedface, 0xFFFFFFFE, "Mach-O executable\n", "application/x-mach-binary\n", 528 0xbebafeca, 0xFFFFFFFF, "Java class\n", "application/x-java-applet\n", 529 }; 530 531 int 532 filemagic(Filemagic *tab, int ntab, ulong x) 533 { 534 int i; 535 536 for(i=0; i<ntab; i++) 537 if((x&tab[i].mask) == tab[i].x){ 538 print(mime ? tab[i].mime : tab[i].desc); 539 return 1; 540 } 541 return 0; 542 } 543 544 int 545 long0(void) 546 { 547 /* Fhdr *f; */ 548 long x; 549 550 seek(fd, 0, 0); /* reposition to start of file */ 551 /* 552 if(crackhdr(fd, &f)) { 553 print(mime ? OCTET : "%s\n", f.name); 554 return 1; 555 } 556 */ 557 x = LENDIAN(buf); 558 if(filemagic(long0tab, nelem(long0tab), x)) 559 return 1; 560 return 0; 561 } 562 563 /* from tar.c */ 564 enum { NAMSIZ = 100, TBLOCK = 512 }; 565 566 union hblock 567 { 568 char dummy[TBLOCK]; 569 struct header 570 { 571 char name[NAMSIZ]; 572 char mode[8]; 573 char uid[8]; 574 char gid[8]; 575 char size[12]; 576 char mtime[12]; 577 char chksum[8]; 578 char linkflag; 579 char linkname[NAMSIZ]; 580 /* rest are defined by POSIX's ustar format; see p1003.2b */ 581 char magic[6]; /* "ustar" */ 582 char version[2]; 583 char uname[32]; 584 char gname[32]; 585 char devmajor[8]; 586 char devminor[8]; 587 char prefix[155]; /* if non-null, path = prefix "/" name */ 588 } dbuf; 589 }; 590 591 int 592 checksum(union hblock *hp) 593 { 594 int i; 595 char *cp; 596 struct header *hdr = &hp->dbuf; 597 598 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++) 599 *cp = ' '; 600 i = 0; 601 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++) 602 i += *cp & 0xff; 603 return i; 604 } 605 606 int 607 istar(void) 608 { 609 int chksum; 610 char tblock[TBLOCK]; 611 union hblock *hp = (union hblock *)tblock; 612 struct header *hdr = &hp->dbuf; 613 614 seek(fd, 0, 0); /* reposition to start of file */ 615 if (readn(fd, tblock, sizeof tblock) != sizeof tblock) 616 return 0; 617 chksum = strtol(hdr->chksum, 0, 8); 618 if (hdr->name[0] != '\0' && checksum(hp) == chksum) { 619 if (strcmp(hdr->magic, "ustar") == 0) 620 print(mime? "application/x-ustar\n": 621 "posix tar archive\n"); 622 else 623 print(mime? "application/x-tar\n": "tar archive\n"); 624 return 1; 625 } 626 return 0; 627 } 628 629 /* 630 * initial words to classify file 631 */ 632 struct FILE_STRING 633 { 634 char *key; 635 char *filetype; 636 int length; 637 char *mime; 638 } file_string[] = 639 { 640 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream", 641 "!<arch>\n", "archive", 8, "application/octet-stream", 642 "070707", "cpio archive - ascii header", 6, "application/octet-stream", 643 "%!", "postscript", 2, "application/postscript", 644 "\004%!", "postscript", 3, "application/postscript", 645 "x T post", "troff output for post", 8, "application/troff", 646 "x T Latin1", "troff output for Latin1", 10, "application/troff", 647 "x T utf", "troff output for UTF", 7, "application/troff", 648 "x T 202", "troff output for 202", 7, "application/troff", 649 "x T aps", "troff output for aps", 7, "application/troff", 650 "GIF", "GIF image", 3, "image/gif", 651 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript", 652 "%PDF", "PDF", 4, "application/pdf", 653 "<html>\n", "HTML file", 7, "text/html", 654 "<HTML>\n", "HTML file", 7, "text/html", 655 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream", 656 "\111\111\052\000", "tiff", 4, "image/tiff", 657 "\115\115\000\052", "tiff", 4, "image/tiff", 658 "\377\330\377\340", "jpeg", 4, "image/jpeg", 659 "\377\330\377\341", "jpeg", 4, "image/jpeg", 660 "\377\330\377\333", "jpeg", 4, "image/jpeg", 661 "\106\117\126\142", "x3f", 4, "image/x3f", 662 "BM", "bmp", 2, "image/bmp", 663 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream", 664 "<MakerFile ", "FrameMaker file", 11, "application/framemaker", 665 "\033%-12345X", "HPJCL file", 9, "application/hpjcl", 666 "ID3", "mp3 audio with id3", 3, "audio/mpeg", 667 0,0,0,0 668 }; 669 670 int 671 istring(void) 672 { 673 int i, j; 674 struct FILE_STRING *p; 675 676 for(p = file_string; p->key; p++) { 677 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) { 678 if(mime) 679 print("%s\n", p->mime); 680 else 681 print("%s\n", p->filetype); 682 return 1; 683 } 684 } 685 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */ 686 for(i = 5; i < nbuf; i++) 687 if(buf[i] == '\n') 688 break; 689 if(mime) 690 print(OCTET); 691 else 692 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5); 693 return 1; 694 } 695 if(buf[0]=='#' && buf[1]=='!'){ 696 i=2; 697 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++) 698 if(buf[j] == '/') 699 i = j+1; 700 if(mime) 701 print(PLAIN); 702 else 703 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i); 704 return 1; 705 } 706 return 0; 707 } 708 709 int 710 iff(void) 711 { 712 if (strncmp((char*)buf, "FORM", 4) == 0 && 713 strncmp((char*)buf+8, "AIFF", 4) == 0) { 714 print("%s\n", mime? "audio/x-aiff": "aiff audio"); 715 return 1; 716 } 717 return 0; 718 } 719 720 char* html_string[] = 721 { 722 "title", 723 "body", 724 "head", 725 "strong", 726 "h1", 727 "h2", 728 "h3", 729 "h4", 730 "h5", 731 "h6", 732 "ul", 733 "li", 734 "dl", 735 "br", 736 "em", 737 0, 738 }; 739 740 int 741 ishtml(void) 742 { 743 uchar *p, *q; 744 int i, count; 745 746 /* compare strings between '<' and '>' to html table */ 747 count = 0; 748 p = buf; 749 for(;;) { 750 while (p < buf+nbuf && *p != '<') 751 p++; 752 p++; 753 if (p >= buf+nbuf) 754 break; 755 if(*p == '/') 756 p++; 757 q = p; 758 while(p < buf+nbuf && *p != '>') 759 p++; 760 if (p >= buf+nbuf) 761 break; 762 for(i = 0; html_string[i]; i++) { 763 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) { 764 if(count++ > 4) { 765 print(mime ? "text/html\n" : "HTML file\n"); 766 return 1; 767 } 768 break; 769 } 770 } 771 p++; 772 } 773 return 0; 774 } 775 776 char* rfc822_string[] = 777 { 778 "from:", 779 "date:", 780 "to:", 781 "subject:", 782 "received:", 783 "reply to:", 784 "sender:", 785 0, 786 }; 787 788 int 789 isrfc822(void) 790 { 791 792 char *p, *q, *r; 793 int i, count; 794 795 count = 0; 796 p = (char*)buf; 797 for(;;) { 798 q = strchr(p, '\n'); 799 if(q == nil) 800 break; 801 *q = 0; 802 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){ 803 count++; 804 *q = '\n'; 805 p = q+1; 806 continue; 807 } 808 *q = '\n'; 809 if(*p != '\t' && *p != ' '){ 810 r = strchr(p, ':'); 811 if(r == 0 || r > q) 812 break; 813 for(i = 0; rfc822_string[i]; i++) { 814 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){ 815 count++; 816 break; 817 } 818 } 819 } 820 p = q+1; 821 } 822 if(count >= 3){ 823 print(mime ? "message/rfc822\n" : "email file\n"); 824 return 1; 825 } 826 return 0; 827 } 828 829 int 830 ismbox(void) 831 { 832 char *p, *q; 833 834 p = (char*)buf; 835 q = strchr(p, '\n'); 836 if(q == nil) 837 return 0; 838 *q = 0; 839 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){ 840 print(mime ? "text/plain\n" : "mail box\n"); 841 return 1; 842 } 843 *q = '\n'; 844 return 0; 845 } 846 847 int 848 isc(void) 849 { 850 int n; 851 852 n = wfreq[I1]; 853 /* 854 * includes 855 */ 856 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 857 goto yes; 858 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n) 859 goto yes; 860 /* 861 * declarations 862 */ 863 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5) 864 goto yes; 865 /* 866 * assignments 867 */ 868 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1) 869 goto yes; 870 return 0; 871 872 yes: 873 if(mime){ 874 print(PLAIN); 875 return 1; 876 } 877 if(wfreq[Alword] > 0) 878 print("alef program\n"); 879 else 880 print("c program\n"); 881 return 1; 882 } 883 884 int 885 islimbo(void) 886 { 887 888 /* 889 * includes 890 */ 891 if(wfreq[Lword] < 4) 892 return 0; 893 print(mime ? PLAIN : "limbo program\n"); 894 return 1; 895 } 896 897 int 898 isas(void) 899 { 900 901 /* 902 * includes 903 */ 904 if(wfreq[Aword] < 2) 905 return 0; 906 print(mime ? PLAIN : "as program\n"); 907 return 1; 908 } 909 910 /* 911 * low entropy means encrypted 912 */ 913 int 914 ismung(void) 915 { 916 int i, bucket[8]; 917 float cs; 918 919 if(nbuf < 64) 920 return 0; 921 memset(bucket, 0, sizeof(bucket)); 922 for(i=0; i<64; i++) 923 bucket[(buf[i]>>5)&07] += 1; 924 925 cs = 0.; 926 for(i=0; i<8; i++) 927 cs += (bucket[i]-8)*(bucket[i]-8); 928 cs /= 8.; 929 if(cs <= 24.322) { 930 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d)) 931 print(mime ? OCTET : "compressed\n"); 932 else 933 print(mime ? OCTET : "encrypted\n"); 934 return 1; 935 } 936 return 0; 937 } 938 939 /* 940 * english by punctuation and frequencies 941 */ 942 int 943 isenglish(void) 944 { 945 int vow, comm, rare, badpun, punct; 946 char *p; 947 948 if(guess != Fascii && guess != Feascii) 949 return 0; 950 badpun = 0; 951 punct = 0; 952 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++) 953 switch(*p) { 954 case '.': 955 case ',': 956 case ')': 957 case '%': 958 case ';': 959 case ':': 960 case '?': 961 punct++; 962 if(p[1] != ' ' && p[1] != '\n') 963 badpun++; 964 } 965 if(badpun*5 > punct) 966 return 0; 967 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */ 968 return 0; 969 if(2*cfreq[';'] > cfreq['e']) 970 return 0; 971 972 vow = 0; 973 for(p="AEIOU"; *p; p++) { 974 vow += cfreq[(uchar)*p]; 975 vow += cfreq[tolower((uchar)*p)]; 976 } 977 comm = 0; 978 for(p="ETAION"; *p; p++) { 979 comm += cfreq[(uchar)*p]; 980 comm += cfreq[tolower((uchar)*p)]; 981 } 982 rare = 0; 983 for(p="VJKQXZ"; *p; p++) { 984 rare += cfreq[(uchar)*p]; 985 rare += cfreq[tolower((uchar)*p)]; 986 } 987 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) { 988 print(mime ? PLAIN : "English text\n"); 989 return 1; 990 } 991 return 0; 992 } 993 994 /* 995 * pick up a number with 996 * syntax _*[0-9]+_ 997 */ 998 #define P9BITLEN 12 999 int 1000 p9bitnum(uchar *bp) 1001 { 1002 int n, c, len; 1003 1004 len = P9BITLEN; 1005 while(*bp == ' ') { 1006 bp++; 1007 len--; 1008 if(len <= 0) 1009 return -1; 1010 } 1011 n = 0; 1012 while(len > 1) { 1013 c = *bp++; 1014 if(!isdigit(c)) 1015 return -1; 1016 n = n*10 + c-'0'; 1017 len--; 1018 } 1019 if(*bp != ' ') 1020 return -1; 1021 return n; 1022 } 1023 1024 int 1025 depthof(char *s, int *newp) 1026 { 1027 char *es; 1028 int d; 1029 1030 *newp = 0; 1031 es = s+12; 1032 while(s<es && *s==' ') 1033 s++; 1034 if(s == es) 1035 return -1; 1036 if('0'<=*s && *s<='9') 1037 return 1<<atoi(s); 1038 1039 *newp = 1; 1040 d = 0; 1041 while(s<es && *s!=' '){ 1042 s++; /* skip letter */ 1043 d += strtoul(s, &s, 10); 1044 } 1045 1046 switch(d){ 1047 case 32: 1048 case 24: 1049 case 16: 1050 case 8: 1051 return d; 1052 } 1053 return -1; 1054 } 1055 1056 int 1057 isp9bit(void) 1058 { 1059 int dep, lox, loy, hix, hiy, px, new; 1060 ulong t; 1061 long len; 1062 char *newlabel; 1063 1064 newlabel = "old "; 1065 1066 dep = depthof((char*)buf + 0*P9BITLEN, &new); 1067 if(new) 1068 newlabel = ""; 1069 lox = p9bitnum(buf + 1*P9BITLEN); 1070 loy = p9bitnum(buf + 2*P9BITLEN); 1071 hix = p9bitnum(buf + 3*P9BITLEN); 1072 hiy = p9bitnum(buf + 4*P9BITLEN); 1073 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0) 1074 return 0; 1075 1076 if(dep < 8){ 1077 px = 8/dep; /* pixels per byte */ 1078 /* set l to number of bytes of data per scan line */ 1079 if(lox >= 0) 1080 len = (hix+px-1)/px - lox/px; 1081 else{ /* make positive before divide */ 1082 t = (-lox)+px-1; 1083 t = (t/px)*px; 1084 len = (t+hix+px-1)/px; 1085 } 1086 }else 1087 len = (hix-lox)*dep/8; 1088 len *= (hiy-loy); /* col length */ 1089 len += 5*P9BITLEN; /* size of initial ascii */ 1090 1091 /* 1092 * for image file, length is non-zero and must match calculation above 1093 * for /dev/window and /dev/screen the length is always zero 1094 * for subfont, the subfont header should follow immediately. 1095 */ 1096 if (len != 0 && mbuf->length == 0) { 1097 print("%splan 9 image\n", newlabel); 1098 return 1; 1099 } 1100 if (mbuf->length == len) { 1101 print("%splan 9 image\n", newlabel); 1102 return 1; 1103 } 1104 /* Ghostscript sometimes produces a little extra on the end */ 1105 if (mbuf->length < len+P9BITLEN) { 1106 print("%splan 9 image\n", newlabel); 1107 return 1; 1108 } 1109 if (p9subfont(buf+len)) { 1110 print("%ssubfont file\n", newlabel); 1111 return 1; 1112 } 1113 return 0; 1114 } 1115 1116 int 1117 p9subfont(uchar *p) 1118 { 1119 int n, h, a; 1120 1121 /* if image too big, assume it's a subfont */ 1122 if (p+3*P9BITLEN > buf+sizeof(buf)) 1123 return 1; 1124 1125 n = p9bitnum(p + 0*P9BITLEN); /* char count */ 1126 if (n < 0) 1127 return 0; 1128 h = p9bitnum(p + 1*P9BITLEN); /* height */ 1129 if (h < 0) 1130 return 0; 1131 a = p9bitnum(p + 2*P9BITLEN); /* ascent */ 1132 if (a < 0) 1133 return 0; 1134 return 1; 1135 } 1136 1137 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 1138 1139 int 1140 isp9font(void) 1141 { 1142 uchar *cp, *p; 1143 int i, n; 1144 char pathname[1024]; 1145 1146 cp = buf; 1147 if (!getfontnum(cp, &cp)) /* height */ 1148 return 0; 1149 if (!getfontnum(cp, &cp)) /* ascent */ 1150 return 0; 1151 for (i = 0; 1; i++) { 1152 if (!getfontnum(cp, &cp)) /* min */ 1153 break; 1154 if (!getfontnum(cp, &cp)) /* max */ 1155 return 0; 1156 while (WHITESPACE(*cp)) 1157 cp++; 1158 for (p = cp; *cp && !WHITESPACE(*cp); cp++) 1159 ; 1160 /* construct a path name, if needed */ 1161 n = 0; 1162 if (*p != '/' && slash) { 1163 n = slash-fname+1; 1164 if (n < sizeof(pathname)) 1165 memcpy(pathname, fname, n); 1166 else n = 0; 1167 } 1168 if (n+cp-p < sizeof(pathname)) { 1169 memcpy(pathname+n, p, cp-p); 1170 n += cp-p; 1171 pathname[n] = 0; 1172 if (access(pathname, AEXIST) < 0) 1173 return 0; 1174 } 1175 } 1176 if (i) { 1177 print(mime ? "text/plain\n" : "font file\n"); 1178 return 1; 1179 } 1180 return 0; 1181 } 1182 1183 int 1184 getfontnum(uchar *cp, uchar **rp) 1185 { 1186 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ 1187 cp++; 1188 if (*cp < '0' || *cp > '9') 1189 return 0; 1190 strtoul((char *)cp, (char **)rp, 0); 1191 if (!WHITESPACE(**rp)) 1192 return 0; 1193 return 1; 1194 } 1195 1196 int 1197 isrtf(void) 1198 { 1199 if(strstr((char *)buf, "\\rtf1")){ 1200 print(mime ? "application/rtf\n" : "rich text format\n"); 1201 return 1; 1202 } 1203 return 0; 1204 } 1205 1206 int 1207 ismsdos(void) 1208 { 1209 if (buf[0] == 0x4d && buf[1] == 0x5a){ 1210 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n"); 1211 return 1; 1212 } 1213 return 0; 1214 } 1215 1216 int 1217 iself(void) 1218 { 1219 static char *cpu[] = { /* NB: incomplete and arbitary list */ 1220 nil, 1221 /*1*/ "WE32100", 1222 /*2*/ "SPARC", 1223 /*3*/ "i386", 1224 /*4*/ "M68000", 1225 /*5*/ "M88000", 1226 /*6*/ "i486", 1227 /*7*/ "i860", 1228 /*8*/ "R3000", 1229 /*9*/ "S370", 1230 /*10*/ "R4000", 1231 nil, nil, nil, nil, 1232 /*15*/ "HP-PA", 1233 nil, 1234 nil, 1235 /*18*/ "sparc v8+", 1236 /*19*/ "i960", 1237 /*20*/ "PPC-32", 1238 /*21*/ "PPC-64", 1239 nil, nil, nil, nil, 1240 nil, nil, nil, nil, nil, 1241 nil, nil, nil, nil, nil, 1242 nil, nil, nil, nil, 1243 /*40*/ "ARM", 1244 /*41*/ "Alpha", 1245 nil, 1246 /*43*/ "sparc v9", 1247 nil, nil, 1248 nil, nil, nil, nil, 1249 /*50*/ "IA-64", 1250 nil, nil, nil, nil, nil, 1251 nil, nil, nil, nil, nil, 1252 nil, 1253 /*62*/ "AMD64", 1254 nil, nil, nil, 1255 nil, nil, nil, nil, nil, 1256 nil, nil, nil, nil, 1257 /*75*/ "VAX", 1258 }; 1259 1260 1261 if (memcmp(buf, "\177ELF", 4) == 0){ 1262 /* gcc misparses \x7FELF as \x7FE L F */ 1263 if (!mime){ 1264 int n = (buf[19] << 8) | buf[18]; 1265 char *p = "unknown"; 1266 1267 if (n > 0 && n < nelem(cpu) && cpu[n]) 1268 p = cpu[n]; 1269 else { 1270 /* try the other byte order */ 1271 n = (buf[18] << 8) | buf[19]; 1272 if (n > 0 && n < nelem(cpu) && cpu[n]) 1273 p = cpu[n]; 1274 } 1275 print("%s ELF executable\n", p); 1276 } 1277 else 1278 print("application/x-elf-executable"); 1279 return 1; 1280 } 1281 1282 return 0; 1283 }