/* Capacity, in bytes, of the line buffers declared with BUFLEN in this file. */
#define BUFLEN 1024
/* Boolean constants; skipped when TRUE is already defined. */
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif /* TRUE */
/*
 * Character-encoding identifiers stored in kanji_code.  Only EUC and SJIS
 * are ever assigned in the code visible in this file.
 */
enum {JIS, EUC, SJIS};
/*
 * Text that main() writes to the output before anything else.
 * NOTE(review): the active definition and both commented-out alternatives
 * below are empty and identical, which suggests the original markup was
 * lost -- confirm against the upstream source.
 */
#define DOCTYPE ""
/*
#define DOCTYPE ""
#define DOCTYPE ""
*/
/* Program name and version string (not referenced by the code visible here). */
#define VERSION "htmlconv 0.9.2"
/* Text placed on its own tab-indented line by output_header(). */
#define HEADER_SIGN ""
/* Opening/closing text emitted by parse_line() around italic spans. */
#define TAG_ITALIC_OP ""
#define TAG_ITALIC_CL ""
/*
#define TAG_EMPHASIS_OP ""
#define TAG_EMPHASIS_CL ""
*/
/* Opening/closing text emitted by parse_line() around emphasis spans. */
#define TAG_EMPHASIS_OP ""
#define TAG_EMPHASIS_CL ""
/*
 * Non-zero when byte `c` is the first byte of a two-byte character in the
 * encoding selected by the global kanji_code:
 *   EUC        0xa0..0xff
 *   otherwise  0x81..0x9f or 0xe0..0xfc   (Shift-JIS lead bytes)
 *
 * `c` is masked to 8 bits, so both plain char and int values are accepted.
 * The argument is evaluated several times; do not pass an expression with
 * side effects.  The whole expansion is parenthesized so the macro can be
 * combined safely with other operators (e.g. `!iskanji(c)`).
 */
#define iskanji(c) \
	(kanji_code == EUC \
	 ? (((c) & 0xff) >= 0xa0 && ((c) & 0xff) <= 0xff) \
	 : ((((c) & 0xff) >= 0x81 && ((c) & 0xff) <= 0x9f) || \
	    (((c) & 0xff) >= 0xe0 && ((c) & 0xff) <= 0xfc)))
/*
 * Input and output streams.  They start out as NULL because stdin/stdout
 * are not guaranteed to be constant expressions usable in a static
 * initializer; main() assigns stdin/stdout to them before any other use.
 */
FILE *infp = NULL;
FILE *outfp = NULL;

/* Behaviour switches; the option letters refer to the ones handled by getopt(). */
int horizontal_line = FALSE;	/* -l: extra output (with hrwidth) on empty lines */
int italic = FALSE;		/* -i: treat '_' and '/' as italic markers */
int emphasis = FALSE;		/* -e: treat '*' as an emphasis marker */
int par_space = TRUE;		/* cleared by -p: leading blank starts a paragraph */
int conv_space = FALSE;		/* -s: convert blanks at the head of a paragraph */
int conv_bracket = TRUE;	/* cleared by -b: convert '<' and '>' */
int kanji_code = EUC;		/* input encoding; -S selects SJIS */
int process_kanji = TRUE;	/* cleared by -n: two-byte handling in fgets2() */
int rm_email_header = FALSE;	/* -m: skip input up to the first empty line */
int ignore_empty_line = FALSE;	/* -E: drop empty lines */
int out_br = FALSE;		/* -r: extra output when a line starts with a blank */
int recognize_url = TRUE;	/* cleared by -u */
int recognize_emailaddr = TRUE;	/* cleared by -A */

/* The two bytes of the double-byte space; set by main() from kanji_code. */
char mb_space1;
char mb_space2;

/* String settings.  An empty string means "not given". */
char *title = "";		/* -t */
char *author = "";		/* -a */
char *input_filename = "";	/* first non-option argument; empty = stdin */
char *output_filename = "";	/* -o; empty = stdout */
char *bgcolor = "#e0e0e0";	/* -B */
char *hrwidth = "20%";		/* -H */
/* Forward declarations for the functions defined after main(). */
/* Convert one input line and write the result to outfp. */
void parse_line(char *buf);
/* Write the document header section to outfp. */
void output_header(void);
/* Line reader that normalizes CR, LF and CR+LF and keeps two-byte characters together. */
char *fgets2(char *s, int size, FILE *stream);
/* Length of the leading run of isgraph() characters in s. */
size_t getwordlen(const char *s);
/* Length of the URL starting at s, or 0 if s does not start with http:// or ftp://. */
size_t geturllen(const char *s);
/* Length of the e-mail address starting at s, or 0 if there is none. */
size_t getemailaddrlen(const char *s);
/*
 * Parse the command line into the global settings.
 * NOTE(review): shares its name with the POSIX getopt() but has a different
 * signature -- presumably this would clash if <unistd.h> were ever included.
 */
void getopt(int argc, char *argv[]);
/* Print the option summary to stderr. */
void usage(char *cmdname);
/*
 * Program entry point.
 *
 * Parses the command line, opens the input and output streams (stdin and
 * stdout unless file names were given), writes the document preamble,
 * optionally skips an e-mail header, then feeds the input line by line to
 * parse_line() and finally writes the document trailer.
 *
 * Returns 0 on success; exits with status 1 if a file cannot be opened.
 */
int main(int argc, char *argv[])
{
	char buf[BUFLEN];

	infp = stdin;
	outfp = stdout;
	getopt(argc, argv);

	if (*input_filename) {
		infp = fopen(input_filename, "r");
		if (infp == NULL) {
			fprintf(stderr, "%s: File not found\n",
				input_filename);
			exit(1);
		}
	}
	if (*output_filename) {
		outfp = fopen(output_filename, "w");
		if (outfp == NULL) {
			/* Report the file that actually failed to open. */
			fprintf(stderr, "%s: Can't open file\n",
				output_filename);
			exit(1);
		}
	}

	/* set the codes of multibyte space letter */
	if (kanji_code == EUC) {
		mb_space1 = 0xa1;
		mb_space2 = 0xa1;
	} else {
		mb_space1 = 0x81;
		mb_space2 = 0x40;
	}

	fputs(DOCTYPE, outfp);
	fputs("\n\n", outfp);
	output_header();

	/*
	 * NOTE(review): the format strings below contain no conversion for
	 * bgcolor, so the argument is ignored; the markup they once held
	 * appears to be missing -- confirm against the upstream source.
	 */
	if (*bgcolor) {
		fprintf(outfp, "", bgcolor);
		fprintf(outfp, "\n\n\n", bgcolor);
	} else {
		fputs("\n\n\n", outfp);
	}

	/* remove email header: discard input up to the first empty line */
	if (rm_email_header) {
		char header_line[BUFSIZ];

		while (fgets(header_line, BUFSIZ, infp) != NULL) {
			if (header_line[0] == '\n')
				break;
		}
	}
/*
if (*title)
fprintf(outfp, "%s
\n", title);
if (*author)
fprintf(outfp, "by %s\n\n", author);
*/
	/* main loop */
	while (fgets2(buf, BUFLEN, infp) != NULL)
		parse_line(buf);

	fputs("\n\n\n", outfp);

	/* Only close streams this function opened itself. */
	if (*input_filename)
		fclose(infp);
	if (*output_filename)
		fclose(outfp);
	return 0;
}
/*
 * Convert one input line (NUL-terminated, normally ending in '\n') and write
 * the result to outfp.
 *
 * The function keeps static state between calls, so lines must be passed in
 * input order.  `buf` may be modified: a line holding only blanks is
 * collapsed to its terminator.
 *
 * NOTE(review): several string literals in this function are empty, contain
 * only a line break, or (L145-146 of the original) have mismatched quotes.
 * The markup they once held appears to have been lost; the code is left
 * untouched here -- confirm against the upstream source.
 */
void parse_line(char *buf)
{
/* TRUE until the first non-blank output after a paragraph break. */
static int f_par_head = TRUE;
/* TRUE when the previously processed line end belonged to an empty line. */
static int f_empty_line = FALSE;
/* TRUE while an italic / emphasis opening tag is waiting for its close. */
static int f_italic_on = FALSE;
static int f_emphasis_on = FALSE;
/* Last character consumed by the previous call; '\0' before the first one. */
static char prev_eol = '\0';
/* Scratch copy of a recognized URL or e-mail address. */
char tmp[BUFLEN];
char *bufp = buf;
size_t len;
/* trim space-only line */
for (bufp = buf;
*bufp != '\0' && (*bufp == ' ' || *bufp == '\t');
bufp++);
if (*bufp == '\0' || *bufp == '\n') {
/* Only blanks: keep just the terminator ('\n' or nothing). */
*buf = *bufp;
*(buf + 1) = '\0';
bufp = buf;
} else {
/*
 * Count how often the first non-blank character repeats.  A run of ten or
 * more is replaced by a single fputs() and processing continues after the
 * run; otherwise the line is processed from its very beginning.
 */
char prev_ch;
int count = 1;
/* Set but never read in this function. */
int f_trimmed_space = FALSE;
if (bufp != buf) f_trimmed_space = TRUE;
prev_ch = *bufp++;
for (; *bufp != '\0'; bufp++) {
if (prev_ch == *bufp)
count++;
else
break;
}
if (count > 9) {
fputs("
', outfp);
f_par_head = FALSE;
} else
bufp = buf;
}
/* Walk the rest of the line one character (or one URL/address) at a time. */
while (*bufp != '\0') {
switch (*bufp) {
case '\n':
/* empty line */
if (buf == bufp) {
if (ignore_empty_line) {
bufp++;
break;
}
/* Consecutive empty lines produce different output from the first one. */
if (TRUE == f_empty_line)
fputs("
\n", outfp);
else fputs("\n\n", outfp);
if (horizontal_line)
fprintf(outfp,
"
\n\n\n",
hrwidth);
f_par_head = TRUE;
f_empty_line = TRUE;
} else {
/* Ordinary end of a non-empty line. */
fputc('\n', outfp);
f_empty_line = FALSE;
}
bufp++;
break;
case ' ':
case '\t':
/*
 * A blank in the first column, directly after a line end and not after an
 * empty line, starts a new paragraph (par_space) or a forced break (out_br).
 */
if (buf == bufp && prev_eol == '\n') {
if (!f_empty_line) {
if (par_space)
fputs("\n
\n", outfp);
else if (out_br)
fputs("
\n", outfp);
f_par_head = TRUE;
}
}
/* Blanks inside running text are copied through unchanged. */
if (!f_par_head) {
fputc(*bufp++, outfp);
break;
}
/* Blanks at a paragraph head are dropped unless conv_space is set. */
if (conv_space) {
if (*bufp == ' ')
fputs(" ", outfp);
else fputs(" ", outfp);
}
bufp++;
break;
case '_':
case '/':
if (italic) {
/* Character before the marker; at line start, the end of the previous line. */
char prev_ch;
if (buf == bufp)
prev_ch = prev_eol;
else
prev_ch = *(bufp - 1);
if (!f_italic_on) {
/*
 * Open only after a blank, line end or quote character.  strchr() also
 * matches the terminating NUL, so an initial prev_eol of '\0' counts too.
 */
if (strchr(" \t\n\"`'", prev_ch)
== NULL) {
fputc(*bufp, outfp);
} else {
fputs(TAG_ITALIC_OP, outfp);
f_italic_on = TRUE;
}
/* Close unless the marker directly follows a blank or another marker. */
} else if (strchr(" \t\n_/", prev_ch)
== NULL) {
fputs(TAG_ITALIC_CL, outfp);
f_italic_on = FALSE;
} else fputc(*bufp, outfp);
} else
fputc(*bufp, outfp);
f_par_head = FALSE;
bufp++;
break;
case '*':
/* Same open/close rules as the italic markers, applied to '*'. */
if (emphasis) {
char prev_ch;
if (buf == bufp)
prev_ch = prev_eol;
else
prev_ch = *(bufp - 1);
if (!f_emphasis_on) {
if (strchr(" \t\n\"`'", prev_ch)
== NULL) {
fputc(*bufp, outfp);
} else {
fputs(TAG_EMPHASIS_OP, outfp);
f_emphasis_on = TRUE;
}
} else if (strchr(" \t\n*", prev_ch)
== NULL) {
fputs(TAG_EMPHASIS_CL, outfp);
f_emphasis_on = FALSE;
} else fputc(*bufp, outfp);
} else
fputc(*bufp, outfp);
f_par_head = FALSE;
bufp++;
break;
case '<':
/*
 * NOTE(review): as written both branches emit the same character;
 * presumably the conv_bracket branch once wrote an HTML entity -- confirm.
 */
if (conv_bracket)
fputs("<", outfp);
else
fputc(*bufp, outfp);
f_par_head = FALSE;
bufp++;
break;
case '>':
/* Same remark as for '<'. */
if (conv_bracket)
fputs(">", outfp);
else
fputc(*bufp, outfp);
f_par_head = FALSE;
bufp++;
break;
default:
/* parse URL */
if (recognize_url) {
if ((len = geturllen(bufp)) != 0) {
/* Copy the URL so it can be passed as a NUL-terminated string. */
strncpy(tmp, bufp, len);
tmp[len] = '\0';
/* Two arguments are passed although the format has one conversion. */
fprintf(outfp, "%s",
tmp, tmp);
bufp += len;
f_par_head = FALSE;
break;
}
}
/* parse email address */
if (recognize_emailaddr) {
if ((len = getemailaddrlen(bufp)) != 0) {
strncpy(tmp, bufp, len);
tmp[len] = '\0';
fprintf(outfp, "%s", tmp, tmp);
bufp += len;
f_par_head = FALSE;
break;
}
}
/* Any other character is copied through unchanged. */
fputc(*bufp++, outfp);
f_par_head = FALSE;
}
}
/* Remember the last consumed character for the next call. */
if (buf != bufp)
prev_eol = *(bufp - 1);
}
/*
 * Write the document header section to outfp: a fixed opening, the
 * HEADER_SIGN line, one line that is only written when an author was given,
 * another fixed line, the title (if any) and a closing line.
 *
 * NOTE(review): most literals here contain only whitespace, and the author
 * format string has no conversion for its argument; the markup appears to
 * have been lost -- confirm against the upstream source.
 */
void output_header(void)
{
fputs("
\n", outfp);
fputs("\t"HEADER_SIGN"\n", outfp);
/* author is passed but not referenced by the format string as written. */
if (*author)
fprintf(outfp, "\t\n",
author);
fputs("\t\n", outfp);
if (*title)
fprintf(outfp, "\t%s\n", title);
fputs("\n", outfp);
}
/*
 * Read one line from `stream` into `s`, storing at most size - 1 bytes
 * followed by a terminating NUL.
 *
 * Differences from fgets():
 *  - CR, LF and CR+LF each end a line; the line end is always stored as a
 *    single '\n'.
 *  - With process_kanji set, the first byte of a two-byte character is held
 *    back until its second byte has been read, so both bytes are stored
 *    together.  If the buffer fills up in between, the held byte is carried
 *    over (in static state) to the next call.
 *  - With process_kanji set, the two-byte space (mb_space1, mb_space2) is
 *    replaced by two ASCII spaces.
 *
 * Returns `s`, or NULL when end of input is reached before any byte has
 * been stored.  Calls exit(1) if the input ends right after a first byte.
 *
 * Because of the static state this function serves one stream at a time.
 */
char *fgets2(char *s, int size, FILE *stream)
{
	int i = 1;		/* space accounting; bytes stored never exceed i */
	int c;
	char *p = s;
	static int cr = FALSE;		/* previous call ended on a CR */
	static char kanji1 = '\0';	/* pending first byte, or '\0' */

	/* Reserve room for a first byte carried over from the last call. */
	if (process_kanji && kanji1 != '\0')
		i++;

	for (; i < size; i++) {
		c = fgetc(stream);
		switch (c) {
		case EOF:
			if (kanji1 != '\0') {
				fputs("Unexpected EOF\n", stderr);
				exit(1);
			}
			cr = FALSE;
			/*
			 * Test for "nothing stored" rather than i == 1: after a
			 * skipped LF of a trailing CR+LF, i is already 2 and an
			 * empty extra line would be reported otherwise.
			 */
			if (p == s)
				return NULL;
			*p = '\0';
			return s;
		case '\r':
			*p++ = '\n';
			*p = '\0';
			cr = TRUE;
			return s;
		case '\n':
			/* for CR+LF: the LF right after a CR is swallowed */
			if (i == 1 && cr)
				break;
			*p++ = '\n';
			*p = '\0';
			cr = FALSE;
			return s;
		default:
			if (!process_kanji) {
				*p++ = (char)c;
				break;
			}
			if (kanji1 != '\0') {
				/*
				 * 2 bytes space -> 2 ascii space.
				 * Compare as unsigned bytes: where plain char is
				 * signed, a byte >= 0x80 kept in a char is negative
				 * and never equals (c & 0xff).
				 */
				if ((unsigned char)kanji1 == (unsigned char)mb_space1 &&
				    (c & 0xff) == (unsigned char)mb_space2) {
					*p++ = ' ';
					*p++ = ' ';
				} else {
					*p++ = kanji1;
					*p++ = (char)c;
				}
				kanji1 = '\0';
				i++;	/* two bytes were stored in this round */
			} else if (iskanji(c)) {
				/*
				 * Hold the first byte back.  If this was the last
				 * free slot the loop ends now and the byte is
				 * delivered by the next call.
				 */
				kanji1 = (char)c;
			} else {
				*p++ = (char)c;
			}
			break;
		}
	}
	*p = '\0';
	cr = FALSE;
	return s;
}
/*
 * Return the length of the leading run of printable, non-space characters
 * (as classified by isgraph()) in the NUL-terminated string `s`.
 * Returns 0 for an empty string or one that starts with a blank.
 */
size_t getwordlen(const char *s)
{
	size_t len = 0;

	/*
	 * <ctype.h> functions require a value representable as unsigned char;
	 * a negative plain char (any byte >= 0x80 where char is signed) would
	 * be undefined behaviour.
	 */
	while (isgraph((unsigned char)s[len]))
		len++;
	return len;
}
/*
 * If `s` starts with "http://" or "ftp://" (compared case-insensitively),
 * return the number of characters up to the first one that is not
 * printable-and-non-space according to isgraph(); otherwise return 0.
 */
size_t geturllen(const char *s)
{
	size_t len = 0;

	if (strncasecmp(s, "http://", 7) != 0 &&
	    strncasecmp(s, "ftp://", 6) != 0)
		return 0;

	/*
	 * Cast before calling isgraph(): passing a negative plain char (a byte
	 * >= 0x80 where char is signed) is undefined behaviour.
	 */
	while (isgraph((unsigned char)s[len]))
		len++;
	return len;
}
/*
 * Return the length of the e-mail address that starts at `s`, or 0 if there
 * is none.
 *
 * The candidate is the leading run of alphanumerics, '-', '.' and '@'.  It
 * counts as an address when it contains exactly one '@' that is neither its
 * first nor its last character.  Trailing dots are not part of the address
 * (they are usually sentence punctuation), and a candidate whose domain
 * consists of dots only, such as "a@.", is rejected.
 */
size_t getemailaddrlen(const char *s)
{
	enum { LOCAL_PART, SEEN_AT, IN_DOMAIN } state = LOCAL_PART;
	size_t len;

	for (len = 0; ; len++) {
		/*
		 * Work on the unsigned byte value: <ctype.h> functions have
		 * undefined behaviour for negative arguments other than EOF.
		 */
		unsigned char ch = (unsigned char)s[len];

		if (!isalnum(ch) && ch != '-' && ch != '.' && ch != '@')
			break;
		if (ch == '@') {
			/* No leading '@' and no second '@'. */
			if (len == 0 || state != LOCAL_PART)
				return 0;
			state = SEEN_AT;
		} else if (state == SEEN_AT) {
			state = IN_DOMAIN;
		}
	}
	if (state != IN_DOMAIN)
		return 0;

	/*
	 * Drop trailing dots.  The loop cannot run past the start of the
	 * string: an '@' is known to exist at an index >= 1.
	 */
	while (s[len - 1] == '.')
		len--;
	/* Nothing but dots followed the '@': the domain is empty. */
	if (s[len - 1] == '@')
		return 0;
	return len;
}
/*
 * Helper for options that take a value: store argv[i + 1] in *dest and mark
 * the next argument as consumed.
 *
 * Does nothing when an earlier option letter of the same argument has
 * already claimed the next argument.  Prints `missing_msg` to stderr and
 * exits with status 1 when there is no next argument.
 */
static void take_option_value(int argc, char *argv[], int i,
			      int *value_taken, char **dest,
			      const char *missing_msg)
{
	if (*value_taken)
		return;
	if (i + 1 >= argc) {
		fputs(missing_msg, stderr);
		exit(1);
	}
	*dest = argv[i + 1];
	*value_taken = TRUE;
}

/*
 * Parse the command line and store the result in the global settings.
 *
 * An argument that does not start with '-' is taken as the input file name
 * (the last one wins).  An argument that starts with '-' is read as a
 * cluster of single-letter options.  Options with a value (-a, -B, -H, -o,
 * -t) take the following argument; only the first such letter in a cluster
 * gets it.  -h and unknown letters print the usage text and exit with
 * status 1.
 */
void getopt(int argc, char *argv[])
{
	int consume_next = FALSE;
	int idx;

	for (idx = 1; idx < argc; idx++) {
		const char *flagp;

		/* This argument was the value of a preceding option. */
		if (consume_next) {
			consume_next = FALSE;
			continue;
		}
		if (argv[idx][0] != '-') {
			input_filename = argv[idx];
			continue;
		}

		for (flagp = argv[idx] + 1; *flagp != '\0'; flagp++) {
			int opt = *flagp;

			switch (opt) {
			/* --- options that take a value --- */
			case 'a':	/* author */
				take_option_value(argc, argv, idx, &consume_next,
					&author,
					"-a must be followed by author name.\n");
				break;
			case 'B':	/* background colour */
				take_option_value(argc, argv, idx, &consume_next,
					&bgcolor,
					"-B must be followed by color name.\n");
				break;
			case 'H':	/* horizontal-rule width */
				take_option_value(argc, argv, idx, &consume_next,
					&hrwidth,
					"-H must be followed by hr width.\n");
				break;
			case 'o':	/* output file name */
				take_option_value(argc, argv, idx, &consume_next,
					&output_filename,
					"-o must be followed by file name.\n");
				break;
			case 't':	/* title */
				take_option_value(argc, argv, idx, &consume_next,
					&title,
					"-t must be followed by title string.\n");
				break;

			/* --- switches that turn a feature on --- */
			case 'e':	/* '*' marks emphasis */
				emphasis = TRUE;
				break;
			case 'E':	/* drop empty lines */
				ignore_empty_line = TRUE;
				break;
			case 'i':	/* '_' and '/' mark italic */
				italic = TRUE;
				break;
			case 'l':	/* extra output on empty lines */
				horizontal_line = TRUE;
				break;
			case 'm':	/* skip the e-mail header */
				rm_email_header = TRUE;
				break;
			case 'r':	/* extra output for a leading blank */
				out_br = TRUE;
				break;
			case 's':	/* convert blanks at a paragraph head */
				conv_space = TRUE;
				break;
			case 'S':	/* input is Shift-JIS */
				kanji_code = SJIS;
				break;

			/* --- switches that turn a feature off --- */
			case 'A':	/* no e-mail address recognition */
				recognize_emailaddr = FALSE;
				break;
			case 'b':	/* no bracket conversion */
				conv_bracket = FALSE;
				break;
			case 'n':	/* no two-byte character handling */
				process_kanji = FALSE;
				break;
			case 'p':	/* leading blank does not start a paragraph */
				par_space = FALSE;
				break;
			case 'u':	/* no URL recognition */
				recognize_url = FALSE;
				break;

			case 'h':	/* help */
				usage(argv[0]);
				exit(1);
			default:
				fprintf(stderr, "invalid option -- %c\n", opt);
				usage(argv[0]);
				exit(1);
			}
		}
	}
}
/*
 * Print the option summary to stderr.  `cmdname` is inserted as the program
 * name in the first line.
 *
 * NOTE(review): the "-r" line of the literal is broken across two source
 * lines; part of its text (presumably a tag name) appears to have been
 * lost -- confirm against the upstream source.
 */
void usage(char *cmdname)
{
fprintf(stderr,
"Usage: %s [OPTION]... [FILE]\n"
" -a AUTHOR set author\n"
" -A turn off email address recognition\n"
" -b turn off bracket conversion mode\n"
" -B COLOR set bgcolor\n"
" -e convert *asterisks* to boldface\n"
" -E ignore empty line\n"
" -H WIDTH set hr width\n"
" -i convert _underscores_ and /slashes/ to italic\n"
" -l output horizontal line when empty line appears\n"
" -m remove e-mail header\n"
" -n don't process kanji code\n"
" -o FILE set output filename\n"
" -p don't recognize blank on head of line as a change of paragraph\n"
" -r output
tag when head space appear\n"
" -s convert space on head to ' '\n"
" -S process text as Shift-JIS\n"
" -t TITLE set title\n"
" -u turn off URL recognition\n"
" -h print this help message\n",
cmdname);
}