--- file-5.05/src/Makefile.am.vinejtext 2010-07-22 00:56:10.000000000 +0900 +++ file-5.05/src/Makefile.am 2011-02-11 16:53:06.000000000 +0900 @@ -4,11 +4,11 @@ bin_PROGRAMS = file -AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"' +AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"' -DDETECT_JAPANESE AM_CFLAGS = @WARNINGS@ libmagic_la_SOURCES = magic.c apprentice.c softmagic.c ascmagic.c \ - encoding.c compress.c is_tar.c readelf.c print.c fsmagic.c \ + encoding.c compress.c is_tar.c readelf.c print.c jcode.c fsmagic.c \ funcs.c file.h names.h patchlevel.h readelf.h tar.h apptype.c \ file_opts.h elfclass.h mygetopt.h cdf.c cdf_time.c readcdf.c cdf.h libmagic_la_LDFLAGS = -no-undefined -version-info 1:0:0 --- file-5.05/src/encoding.c.vinejtext 2010-07-22 01:47:17.000000000 +0900 +++ file-5.05/src/encoding.c 2011-02-11 17:26:00.000000000 +0900 @@ -42,7 +42,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.5 #include #include #include - +#include "jcode.h" private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, @@ -68,7 +68,7 @@ protected int file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) { size_t mlen; - int rv = 1, ucs_type; + int rv = 1, ucs_type, jcode; unsigned char *nbuf = NULL; mlen = (nbytes + 1) * sizeof(nbuf[0]); @@ -83,10 +83,27 @@ file_encoding(struct magic_set *ms, cons } *type = "text"; - if (looks_ascii(buf, nbytes, *ubuf, ulen)) { + jcode = detect_kcode(buf, nbytes, *ubuf, ulen); + if (jcode == ASCII) { DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); *code = "ASCII"; *code_mime = "us-ascii"; + } else if (jcode == JIS) { + DPRINTF(("jis %" SIZE_T_FORMAT "u\n", *ulen)); + *code = "7-bit JIS [ESC$B, ESC(B]"; + *code_mime = "jis"; + } else if (jcode == SJIS){ + DPRINTF(("sjis %" SIZE_T_FORMAT "u\n", *ulen)); + *code = "SJIS"; + *code_mime = "sjis"; + } else if (jcode == EUC){ + DPRINTF(("euc %" SIZE_T_FORMAT "u\n", *ulen)); + *code = "EUC"; + *code_mime = "euc-jp"; + } else if (jcode == EUCORSJIS){ + DPRINTF(("euc or sjis %" SIZE_T_FORMAT "u\n", *ulen)); + *code = "EUC or SJIS"; + *code_mime = "unknown"; } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); *code = "UTF-8 Unicode (with BOM)"; --- /dev/null 2011-02-06 21:11:58.373999997 +0900 +++ file-5.05/src/jcode.c 2011-02-11 17:14:29.000000000 +0900 @@ -0,0 +1,205 @@ +/* +jcode.c: Kanji-code detect routing by Jun Nishii + modified by Ryoichi INAGAKI + */ +#include +#include +#include +#include +#include +#include + +typedef unsigned long unichar; + +#define F 0 /* character never appears in text */ +#define T 1 /* character appears in plain ASCII text */ +#define I 2 /* character appears in ISO-8859 text */ +#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ +#define J 4 /* character appears in JIS or plain ASCII */ +#define S 5 /* character appears in SJIS */ +#define E 6 /* character appears in EUC */ +#define O 7 /* character appears in EUC or SJIS */ + +#define ESC 27 + +static char jp_chars1[256] = { + F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ + F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ + T, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x2X */ + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x3X */ + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x4X */ + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x5X */ + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x6X */ + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, F, /* 0x7X */ + /* NEL */ + X, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x8X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x9X */ + I, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xaX */ + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xbX */ + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xcX */ + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xdX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xeX */ + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, I /* 0xfX */ +}; + +static char jp_chars2[256] = { + /* BEL BS HT LF FF CR */ + F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ + /* ESC */ + F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x4X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x5X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x6X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, F, /* 0x7X */ + /* NEL */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x8X */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x9X */ + S, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xaX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xbX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xcX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xdX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xeX */ + O, O, O, O, O, O, O, O, O, O, O, O, O, E, E, I /* 0xfX */ +}; + + +int +check_asc_jis(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + size_t nbytes; + unichar *ubuf; + size_t *ulen; +{ + size_t i; + int jflag; + + *ulen = 0; jflag=0; + + for (i = 0; i < nbytes; i++) { + int t = jp_chars1[buf[i]]; + + if (t != T && t != J ) + return 0; + + if (buf[i] == ESC && i+2 + modified by Ryoichi INAGAKI + */ + +#define ASCII 1 +#define JIS 2 +#define EUC 3 +#define SJIS 4 +#define EUCORSJIS 5 + +extern int detect_kcode (const unsigned char *, size_t, unichar *, size_t *); +extern int looks_jis (const unsigned char *, size_t, unichar *, size_t *); +extern int looks_sjis (const unsigned char *, size_t, unichar *, size_t *); +extern int looks_euc (const unsigned char *, size_t, unichar *, size_t *); --- file-5.05/src/names.h.vinejtext 2010-10-09 06:58:44.000000000 +0900 +++ file-5.05/src/names.h 2011-02-11 17:28:18.000000000 +0900 @@ -135,8 +135,6 @@ {"/*", L_C, 2 }, /* must precede "The", "the", etc. */ {"#include", L_C, 2 }, {"char", L_C, 2 }, - {"The", L_ENG, 2 }, - {"the", L_ENG, 2 }, {"double", L_C, 1 }, {"extern", L_C, 2 }, {"float", L_C, 1 },