/*
 * htmlconv.c - plain-text to HTML document converter
 * (c) 1999 Hiroyuki Yamamoto
 *
 * Reads a text file (or stdin) line by line and writes an HTML document
 * (to a file or stdout).  Recognises empty lines, indented paragraph
 * heads, "character lines" (a run of 10 or more identical characters),
 * _italic_ / *bold* markers, URLs and e-mail addresses, and converts
 * EUC / Shift-JIS double-byte spaces to ASCII spaces.
 *
 * NOTE(review): the copy of this file that was reviewed had lost every
 * piece of HTML markup that lived inside string literals and comments
 * (tags and character entities), as well as the names of the included
 * headers.  They have been restored from context.  The following sites
 * could not be recovered with certainty and should be checked against
 * an upstream copy:
 *   - which DOCTYPE was the active one;
 *   - one additional static element that output_header() used to emit
 *     between the author <meta> and the <title>;
 *   - an extra bgcolor-dependent output emitted just before <body>;
 *   - the exact text emitted for a character line (it depended on
 *     whether the line was indented); a plain <hr> is emitted here.
 *
 * Changes:
 *   99/04/21 (Wed) modified stream pointer initialization so that it
 *                  can be compiled on glibc2.1 environment.
 *   99/03/13 (Sat) modified email address recognition routine to get
 *                  rid of unnecessary period on tail end.
 *   99/02/04 (Thu) modified character line to <hr> conversion.
 *                  modified multibyte space to ascii spaces conversion
 *                  to work properly.
 *   99/02/03 (Wed) added email address recognition and its option.
 *                  modified fgets2() again.
 *   99/02/02 (Tue) modified fgets2() to process multibyte character
 *                  properly.
 *   99/01/29 (Fri) added URL recognition and its option.
 *   99/01/28 (Thu) added character line to <hr> conversion.
 *   99/01/27 (Wed) changed program name from txt2html to htmlconv
 *                  (txt2html has already existed).
 *                  changed parse algorithm from looking character
 *                  one-by-one to reading whole line at once.
 *   99/01/18 (Mon) changed conversion of _underscores_ from boldface
 *                  to italic.
 *   98/12/10 (Thu) added meta tag of generator to header.
 *   98/12/09 (Wed) changed doctype header.
 *   98/12/08 (Tue) modified header output.
 *                  changed the default of paragraph head space
 *                  recognition to on.
 *                  added meta tag of author to header.
 *   98/12/06 (Sun) modified help message.
 *   98/12/05 (Sat) added asterisks to emphasis conversion.
 *   98/12/03 (Thu) added option for setting author.
 *                  added title and author indication.
 *   98/12/02 (Wed) added option for ignoring empty line.
 *                  made option processing routine be able to handle
 *                  packed letters.
 *                  added option for outputting '<br>' tag when head
 *                  space appear (default is off)
 *                  added css definition for bg/fg color.
 *   98/12/01 (Tue) added option for removing e-mail header.
 *   98/11/30 (Mon) supported two bytes space code (EUC/Shift-JIS).
 *                  added option for setting hr width.
 *                  added option for turning off kanji code processing.
 *   98/11/29 (Sun) added option for turning off bracket conversion
 *                  mode.
 *                  added option for setting background color.
 *   98/10/03 (Sat) supported CR+LF and CR return code.
 *   98/09/23 (Wed) added conversion of each '<' and '>' to '&lt;' and
 *                  '&gt;'.
 *   98/09/07 (Mon) added bgcolor in body tag.
 *   98/08/29 (Sat) added option of head space to '&nbsp;' conversion.
 *                  if there isn't an empty line above a line which has
 *                  head space, a paragraph-break tag is inserted
 *                  before the line.
 *   98/08/28 (Fri) added option of slashes to italic conversion.
 *                  added option of underscores to emphasis conversion.
 *                  now default value of horizontal line output is off.
 *                  added help message.
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define BUFLEN 1024

#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif /* TRUE */

enum {JIS, EUC, SJIS};

#define DOCTYPE "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"

#define VERSION "htmlconv 0.9.2"
#define HEADER_SIGN "<meta name=\"generator\" content=\"" VERSION "\">"

#define TAG_ITALIC_OP "<i>"
#define TAG_ITALIC_CL "</i>"
#define TAG_EMPHASIS_OP "<b>"
#define TAG_EMPHASIS_CL "</b>"

/*
 * True when byte c is the lead byte of a double-byte character in the
 * currently selected kanji_code.  The whole expansion is parenthesised
 * so the macro can be used inside larger expressions.
 */
#define iskanji(c) \
	(kanji_code == EUC ? \
	 (((c) & 0xff) >= 0xa0 && ((c) & 0xff) <= 0xff) : \
	 ((((c) & 0xff) >= 0x81 && ((c) & 0xff) <= 0x9f) || \
	  (((c) & 0xff) >= 0xe0 && ((c) & 0xff) <= 0xfc)))

/* Streams; main() points them at stdin/stdout before anything else runs. */
FILE *infp = NULL;
FILE *outfp = NULL;

/* Conversion switches, set from the command line by getopt(). */
int horizontal_line = FALSE;
int italic = FALSE;
int emphasis = FALSE;
int par_space = TRUE;
int conv_space = FALSE;
int conv_bracket = TRUE;
int kanji_code = EUC;
int process_kanji = TRUE;
int rm_email_header = FALSE;
int ignore_empty_line = FALSE;
int out_br = FALSE;
int recognize_url = TRUE;
int recognize_emailaddr = TRUE;

/* The two bytes of the double-byte space in the selected kanji code. */
char mb_space1;
char mb_space2;

char *title = "";
char *author = "";
char *input_filename = "";
char *output_filename = "";
char *bgcolor = "#e0e0e0";
char *hrwidth = "20%";

void parse_line(char *buf);
void output_header(void);
char *fgets2(char *s, int size, FILE *stream);
size_t getwordlen(const char *s);
size_t geturllen(const char *s);
size_t getemailaddrlen(const char *s);
void getopt(int argc, char *argv[]);
void usage(char *cmdname);

/*
 * Program entry: parse options, open the streams, write the document
 * prologue, convert every input line, write the epilogue.
 * Returns 0 on success; exits with status 1 when a file cannot be
 * opened and returns 1 when the output file cannot be closed cleanly.
 */
int main(int argc, char *argv[])
{
	char buf[BUFLEN];

	infp = stdin;
	outfp = stdout;

	getopt(argc, argv);

	if (*input_filename) {
		if ((infp = fopen(input_filename, "r")) == NULL) {
			fprintf(stderr, "%s: File not found\n", input_filename);
			exit(1);
		}
	}
	if (*output_filename) {
		if ((outfp = fopen(output_filename, "w")) == NULL) {
			/* report the file that actually failed to open */
			fprintf(stderr, "%s: Can't open file\n",
				output_filename);
			exit(1);
		}
	}

	/* set the codes of multibyte space letter */
	if (kanji_code == EUC) {
		mb_space1 = (char)0xa1;
		mb_space2 = (char)0xa1;
	} else {
		mb_space1 = (char)0x81;
		mb_space2 = (char)0x40;
	}

	fputs(DOCTYPE, outfp);
	fputs("\n<html>\n", outfp);

	output_header();

	if (*bgcolor)
		fprintf(outfp, "<body bgcolor=\"%s\">\n\n", bgcolor);
	else
		fputs("<body>\n\n", outfp);

	/* remove email header: discard everything up to the first blank line */
	if (rm_email_header) {
		char hdr[BUFSIZ];
		int at_line_start = TRUE;

		while (fgets(hdr, (int)sizeof hdr, infp) != NULL) {
			/*
			 * Only a chunk that begins a line can be the blank
			 * separator; a CR is accepted so that CR+LF mail
			 * headers are recognised too.
			 */
			if (at_line_start &&
			    (hdr[0] == '\n' || hdr[0] == '\r'))
				break;
			at_line_start = (strchr(hdr, '\n') != NULL);
		}
	}

	/* main loop */
	while (fgets2(buf, BUFLEN, infp) != NULL)
		parse_line(buf);

	fputs("\n</body>\n</html>\n", outfp);

	if (*input_filename)
		fclose(infp);
	if (*output_filename) {
		/* fclose() flushes; a failure here means lost output */
		if (fclose(outfp) != 0) {
			fprintf(stderr, "%s: Write error\n", output_filename);
			return 1;
		}
	}

	return 0;
}

/*
 * Handle one inline markup marker character (ch) for italic/emphasis.
 * prev_ch is the character preceding the marker, *f_on the current
 * open/closed state.  A marker opens the markup only after whitespace
 * or an opening quote; it closes it unless preceded by one of the
 * characters in no_close_after.  Otherwise the marker is copied as is.
 */
static void put_inline_marker(int ch, char prev_ch, int *f_on,
			      const char *no_close_after,
			      const char *tag_op, const char *tag_cl)
{
	if (!*f_on) {
		if (strchr(" \t\n\"`'", prev_ch) != NULL) {
			fputs(tag_op, outfp);
			*f_on = TRUE;
		} else
			fputc(ch, outfp);
	} else if (strchr(no_close_after, prev_ch) == NULL) {
		fputs(tag_cl, outfp);
		*f_on = FALSE;
	} else
		fputc(ch, outfp);
}

/*
 * Convert one input line (NUL-terminated, at most one trailing '\n',
 * as produced by fgets2()) and write the result to outfp.
 * buf must have room for at least two bytes; a line that contains only
 * blanks is rewritten in place to just its terminator.
 * State that spans lines (paragraph head, previous line end, open
 * italic/emphasis markup) is kept in function-local statics.
 */
void parse_line(char *buf)
{
	static int f_par_head = TRUE;
	static int f_empty_line = FALSE;
	static int f_italic_on = FALSE;
	static int f_emphasis_on = FALSE;
	static char prev_eol = '\0';
	char *bufp;
	size_t len;

	/* trim space-only line */
	for (bufp = buf; *bufp == ' ' || *bufp == '\t'; bufp++)
		;
	if (*bufp == '\0' || *bufp == '\n') {
		buf[0] = *bufp;
		buf[1] = '\0';
		bufp = buf;
	} else {
		/* character line: a run of 10 or more identical characters */
		char run_ch = *bufp++;
		int count = 1;

		while (*bufp != '\0' && *bufp == run_ch) {
			count++;
			bufp++;
		}
		if (count > 9) {
			/* the run is replaced; the rest of the line follows */
			fputs("<hr>", outfp);
			f_par_head = FALSE;
		} else
			bufp = buf;
	}

	while (*bufp != '\0') {
		switch (*bufp) {
		case '\n':
			/* empty line */
			if (buf == bufp) {
				if (ignore_empty_line) {
					bufp++;
					break;
				}
				if (TRUE == f_empty_line)
					fputs("<br>\n", outfp);
				else
					fputs("\n<p>\n", outfp);
				if (horizontal_line)
					fprintf(outfp,
						"<hr width=\"%s\">\n\n<p>\n",
						hrwidth);
				f_par_head = TRUE;
				f_empty_line = TRUE;
			} else {
				fputc('\n', outfp);
				f_empty_line = FALSE;
			}
			bufp++;
			break;
		case ' ':
		case '\t':
			/* head space directly below a non-empty line */
			if (buf == bufp && prev_eol == '\n') {
				if (!f_empty_line) {
					if (par_space)
						fputs("\n<p>\n", outfp);
					else if (out_br)
						fputs("<br>\n", outfp);
					f_par_head = TRUE;
				}
			}
			if (!f_par_head) {
				fputc(*bufp++, outfp);
				break;
			}
			/* at a paragraph head blanks are converted or dropped */
			if (conv_space) {
				if (*bufp == ' ')
					fputs("&nbsp;", outfp);
				else
					fputs("&nbsp;&nbsp;&nbsp;&nbsp;",
					      outfp);
			}
			bufp++;
			break;
		case '_':
		case '/':
			if (italic)
				put_inline_marker(*bufp,
					buf == bufp ? prev_eol : *(bufp - 1),
					&f_italic_on, " \t\n_/",
					TAG_ITALIC_OP, TAG_ITALIC_CL);
			else
				fputc(*bufp, outfp);
			f_par_head = FALSE;
			bufp++;
			break;
		case '*':
			if (emphasis)
				put_inline_marker(*bufp,
					buf == bufp ? prev_eol : *(bufp - 1),
					&f_emphasis_on, " \t\n*",
					TAG_EMPHASIS_OP, TAG_EMPHASIS_CL);
			else
				fputc(*bufp, outfp);
			f_par_head = FALSE;
			bufp++;
			break;
		case '<':
			if (conv_bracket)
				fputs("&lt;", outfp);
			else
				fputc(*bufp, outfp);
			f_par_head = FALSE;
			bufp++;
			break;
		case '>':
			if (conv_bracket)
				fputs("&gt;", outfp);
			else
				fputc(*bufp, outfp);
			f_par_head = FALSE;
			bufp++;
			break;
		default:
			/* parse URL */
			if (recognize_url && (len = geturllen(bufp)) != 0) {
				/* print straight from the line; no copy */
				fprintf(outfp, "<a href=\"%.*s\">%.*s</a>",
					(int)len, bufp, (int)len, bufp);
				bufp += len;
				f_par_head = FALSE;
				break;
			}
			/* parse email address */
			if (recognize_emailaddr &&
			    (len = getemailaddrlen(bufp)) != 0) {
				fprintf(outfp,
					"<a href=\"mailto:%.*s\">%.*s</a>",
					(int)len, bufp, (int)len, bufp);
				bufp += len;
				f_par_head = FALSE;
				break;
			}
			fputc(*bufp++, outfp);
			f_par_head = FALSE;
		}
	}

	/* remember how this line ended for the next call */
	if (buf != bufp)
		prev_eol = *(bufp - 1);
}

/*
 * Write the <head> section: generator signature, and the author and
 * title when they were given on the command line.
 */
void output_header(void)
{
	fputs("<head>\n", outfp);
	fputs("\t"HEADER_SIGN"\n", outfp);
	if (*author)
		fprintf(outfp, "\t<meta name=\"author\" content=\"%s\">\n",
			author);
	if (*title)
		fprintf(outfp, "\t<title>%s</title>\n", title);
	fputs("</head>\n", outfp);
}

/*
 * fgets() replacement that understands CR, LF and CR+LF line ends and,
 * unless process_kanji is off, double-byte characters.
 *
 * Reads at most size - 1 bytes into s and NUL-terminates it.  Every
 * line end is stored as a single '\n'.  A double-byte character is
 * never split across two calls: a lead byte that does not fit is held
 * back and delivered at the start of the next call.  A double-byte
 * space is stored as two ASCII spaces.
 *
 * Returns s, or NULL when end of file is reached before any byte was
 * read.  Exits with status 1 when the input ends in the middle of a
 * double-byte character.
 */
char *fgets2(char *s, int size, FILE *stream)
{
	int i = 1;
	int c;
	char *p = s;
	static int cr = FALSE;
	static char kanji1 = '\0';

	/* reserve room for a lead byte held back by the previous call */
	if (process_kanji && kanji1 != '\0')
		i++;

	for (; i < size; i++) {
		c = fgetc(stream);

		switch (c) {
		case EOF:
			if (kanji1 != '\0') {
				fputs("Unexpected EOF\n", stderr);
				exit(1);
			}
			cr = FALSE;
			if (i == 1)
				return NULL;
			*p = '\0';
			return s;
		case '\r':
			*p++ = '\n';
			*p = '\0';
			cr = TRUE;
			return s;
		case '\n':
			/* for CR+LF: swallow the LF that follows a CR */
			if (i == 1 && cr)
				break;
			*p++ = '\n';
			*p = '\0';
			cr = FALSE;
			return s;
		default:
			if (!process_kanji) {
				*p++ = (char)c;
				break;
			}
			if (kanji1 != '\0') {
				/*
				 * 2 bytes space -> 2 ascii space.
				 * Compare as unsigned char: plain char may
				 * be signed, and 0xa1 would then never equal
				 * the non-negative value read from the file.
				 */
				if ((unsigned char)kanji1 ==
					(unsigned char)mb_space1 &&
				    (unsigned char)c ==
					(unsigned char)mb_space2) {
					*p++ = ' ';
					*p++ = ' ';
				} else {
					*p++ = kanji1;
					*p++ = (char)c;
				}
				kanji1 = '\0';
				i++;	/* two bytes were stored */
			} else if (iskanji(c)) {
				/* hold the lead byte until its partner arrives */
				kanji1 = (char)c;
			} else
				*p++ = (char)c;
		}
	}

	*p = '\0';
	cr = FALSE;
	return s;
}

/*
 * Length of the leading run of printable, non-space characters in s.
 */
size_t getwordlen(const char *s)
{
	size_t i;

	/* <ctype.h> functions need a value representable as unsigned char */
	for (i = 0; isgraph((unsigned char)s[i]); i++)
		;
	return i;
}

/*
 * If s starts with "http://" or "ftp://" (case-insensitive), return the
 * length of the URL; otherwise return 0.  The URL ends at the first
 * character that is not printable, or at '<', '>' or '"', none of which
 * may appear unescaped in a URL; stopping there keeps a delimiter such
 * as the '>' of "<http://host/>" out of the generated link.
 */
size_t geturllen(const char *s)
{
	size_t i;

	if (strncasecmp(s, "http://", 7) != 0 &&
	    strncasecmp(s, "ftp://", 6) != 0)
		return 0;

	for (i = 0; ; i++) {
		unsigned char ch = (unsigned char)s[i];

		if (!isgraph(ch) || ch == '<' || ch == '>' || ch == '"')
			return i;
	}
}

/*
 * If s starts with something shaped like an e-mail address, return its
 * length; otherwise return 0.  An address is a run of alphanumerics,
 * '-', '.' and exactly one '@' that is neither the first nor the last
 * character of the run.  A trailing '.' (end of sentence) is not
 * counted.
 */
size_t getemailaddrlen(const char *s)
{
	size_t i;
	int isemailaddr = 0;	/* 0: no '@' yet, 1: just seen, 2: text after it */

	for (i = 0; ; i++) {
		unsigned char ch = (unsigned char)s[i];

		if (!isalnum(ch) && ch != '-' && ch != '.' && ch != '@')
			break;
		if (ch == '@') {
			/* leading '@' or a second '@': not an address */
			if (i == 0 || isemailaddr != 0)
				return 0;
			isemailaddr = 1;
		} else if (isemailaddr == 1)
			isemailaddr = 2;
	}

	if (isemailaddr != 2)
		return 0;
	/* i >= 3 here, so s[i - 1] is inside the scanned run */
	return s[i - 1] == '.' ? i - 1 : i;
}

/*
 * Store argv[i + 1] in *dest as the argument of an option found in
 * argv[i] and mark it to be skipped.  Does nothing when an earlier
 * option letter packed into the same word already claimed the argument.
 * Prints errmsg and exits with status 1 when there is no next argument.
 */
static void take_option_arg(int argc, char *argv[], int i,
			    int *skip_next_arg, char **dest,
			    const char *errmsg)
{
	if (*skip_next_arg)
		return;
	if (i + 1 >= argc) {
		fputs(errmsg, stderr);
		exit(1);
	}
	*dest = argv[i + 1];
	*skip_next_arg = TRUE;
}

/*
 * Parse the command line into the global option variables.  Option
 * letters may be packed into one word ("-ie"); an option that takes a
 * value uses the following word.  A word that does not start with '-'
 * is taken as the input file name.  Exits with status 1 on -h, on an
 * unknown option and on a missing option value.
 */
void getopt(int argc, char *argv[])
{
	int i, p, c, skip_next_arg = FALSE;

	for (i = 1; i < argc; i++) {
		if (skip_next_arg) {
			skip_next_arg = FALSE;
			continue;
		}
		if (*argv[i] != '-') {
			input_filename = argv[i];
			continue;
		}
		for (p = 1; (c = *(argv[i] + p)) != '\0'; p++) {
			switch (c) {
			case 'a':	/* set author */
				take_option_arg(argc, argv, i, &skip_next_arg,
					&author,
					"-a must be followed by author name.\n");
				break;
			case 'A':	/* turn off email address recognition */
				recognize_emailaddr = FALSE;
				break;
			case 'b':	/* turn off bracket conversion mode */
				conv_bracket = FALSE;
				break;
			case 'B':	/* set bgcolor */
				take_option_arg(argc, argv, i, &skip_next_arg,
					&bgcolor,
					"-B must be followed by color name.\n");
				break;
			case 'e':	/* convert asterisks to emphasis */
				emphasis = TRUE;
				break;
			case 'E':	/* ignore empty line */
				ignore_empty_line = TRUE;
				break;
			case 'H':	/* set hr width */
				take_option_arg(argc, argv, i, &skip_next_arg,
					&hrwidth,
					"-H must be followed by hr width.\n");
				break;
			case 'i':	/* convert underscore/slash to italic */
				italic = TRUE;
				break;
			case 'l':	/* output horizontal line when we come
					   to empty line */
				horizontal_line = TRUE;
				break;
			case 'm':	/* remove e-mail header */
				rm_email_header = TRUE;
				break;
			case 'n':	/* don't process kanji code */
				process_kanji = FALSE;
				break;
			case 'o':	/* set output filename */
				take_option_arg(argc, argv, i, &skip_next_arg,
					&output_filename,
					"-o must be followed by file name.\n");
				break;
			case 'p':	/* don't recognize space on head of line
					   as a change of paragraph */
				par_space = FALSE;
				break;
			case 'r':	/* output <br> when head space appear */
				out_br = TRUE;
				break;
			case 's':	/* convert head space to '&nbsp;' */
				conv_space = TRUE;
				break;
			case 'S':	/* process text as Shift-JIS */
				kanji_code = SJIS;
				break;
			case 't':	/* set title */
				take_option_arg(argc, argv, i, &skip_next_arg,
					&title,
					"-t must be followed by title string.\n");
				break;
			case 'u':	/* turn off URL recognition */
				recognize_url = FALSE;
				break;
			case 'h':	/* display help message */
				usage(argv[0]);
				exit(1);
			default:
				fprintf(stderr, "invalid option -- %c\n", c);
				usage(argv[0]);
				exit(1);
			}
		}
	}
}

/*
 * Print the option summary to stderr.  cmdname is the name the program
 * was invoked with.
 */
void usage(char *cmdname)
{
	fprintf(stderr,
		"Usage: %s [OPTION]... [FILE]\n"
		" -a AUTHOR set author\n"
		" -A turn off email address recognition\n"
		" -b turn off bracket conversion mode\n"
		" -B COLOR set bgcolor\n"
		" -e convert *asterisks* to boldface\n"
		" -E ignore empty line\n"
		" -H WIDTH set hr width\n"
		" -i convert _underscores_ and /slashes/ to italic\n"
		" -l output horizontal line when empty line appears\n"
		" -m remove e-mail header\n"
		" -n don't process kanji code\n"
		" -o FILE set output filename\n"
		" -p don't recognize blank on head of line as a change of paragraph\n"
		" -r output <br> tag when head space appear\n"
		" -s convert space on head to '&nbsp;'\n"
		" -S process text as Shift-JIS\n"
		" -t TITLE set title\n"
		" -u turn off URL recognition\n"
		" -h print this help message\n",
		cmdname);
}