/*
 * Small utility to remove (most of) the HTML formatting codes from a
 * document, essentially turning it into a simple TEXT document. The
 * conversion is **VERY** simple, and manual editing will probably be
 * required afterwards, however this still saves a lot of work.
 *
 * ?COPY.TXT 1997-2003 Dave Dunfield
 *  -- see COPY.TXT --.
 *
 * Permission granted for personal (non-commercial) use only.
 *
 * Compile command: CC unhtml -fop
 */
#include <stdio.h>

#define	TAG_SIZE	200			// Size of the buffers that hold a candidate HTML tag
#define	HRULE_SIZE	72			// Number of '-' characters output for a horizontal rule (<HR>)

/*
 * Last character recorded as written to stdout. It is updated by main()
 * and process_html() as they output text, and is examined when an <LI>
 * tag is processed so that a newline is only added if the output is not
 * already at the start of a line.
 */
char lastc = 0;					/* Keep track of last character output */

/*
 * Some HTML tags that we want/need to recognize.
 *
 * process_html() looks a tag up in this list with strbeg() and uses the
 * index of the first matching entry to select the formatting performed,
 * so the order of entries 0-11 must stay in step with the case numbers
 * of the switch() in that function. The list is terminated by a 0 entry.
 */
char *html_tags[] = {
	"P>", "BR>", "LI>", "HR>",								/* Index 0-3 */
	"H6", "H5", "H4", "H3", "H2", "H1",						/* Index 4-9 */
	"TD>", "TH>",											/* Index 10-11 */
	// These HTML tags may have parameters (and therefore contain
	// non-alphabetic characters which would otherwise cause them to
	// be rejected), so we include them
    // in the table to ensure a match (no processing is done)
	"BASE", "LINK", "PRE WIDTH", "NEXTID", "IMG", "FORM", "INPUT",
	"TEXT", "SELECT", "OPTION", "TABLE",
	"A ",
	0 };

/*
 * Make a "best guess" if a string within <> is a valid HTML tag,
 * and perform minimal processing/formatting in some cases.
 *
 * raw_html = NUL terminated candidate tag. The first character (the
 *            opening '<') is skipped, as is a '/' which follows it, so
 *            closing tags are looked up exactly like opening tags.
 *
 * Returns 0 if the text is accepted as an HTML tag: either it matched an
 * entry in html_tags[] (any replacement formatting has then already been
 * written to stdout), or every character except the last one (normally
 * the closing '>') is alphabetic.
 * Returns non-zero if the text does not look like a tag; main() then
 * outputs it unchanged.
 */
int process_html(char *raw_html)
{
	unsigned n;
	char html[TAG_SIZE], *ptr;
	unsigned char c;		/* Unsigned so bytes >= 0x80 are safe to pass to isalpha/toupper */
	int xflag, x;

	xflag = x = 0;
	ptr = html;
	if(*raw_html)						/* Skip opening '<' (never step past an empty string) */
		++raw_html;
	if(*raw_html == '/')				/* Remove '/' off indicator */
		++raw_html;

	/*
	 * Build an upper-case copy of the tag for the table lookup, and note
	 * if any non-alphabetic characters occur. xflag lags one character
	 * behind x, so the final character does not count against the tag.
	 */
	while((c = *raw_html++) != 0) {
		xflag = x;
		if(!isalpha(c))
			x = -1;
		/* Text beyond the size of html[] is scanned but not stored */
		if(ptr < (html + (TAG_SIZE-1)))
			*ptr++ = toupper(c); }
	*ptr = 0;

	for(n=0; (ptr = html_tags[n]) != 0; ++n)	/* Lookup HTML tag in list */
		if(strbeg(html, ptr))
			break;

	/*
	 * HTML tag not found .. Assume it was a valid tag if it
	 * contained only alphabetic letters in its content
	 */
	if(!ptr)
		return xflag;

	/* Perform some very basic formatting of specific tags */
	switch(n) {
		case 0 :											/* P */
		case 1 :											/* BR */
			putc(lastc = '\n', stdout);
			break;
		case 2 : 											/* LI */
			if(lastc != '\n')			/* Only if not already on a new line */
				putc(lastc = '\n', stdout);
			break;
		case 3 :											/* HR */
			putc('\n', stdout);
			for(n=0; n < HRULE_SIZE; ++n)	/* n is free for reuse once the switch has dispatched */
				putc('-', stdout);
			putc(lastc = '\n', stdout);
			break;
		/*
		 * Headings: each case deliberately falls through into the next,
		 * so H6 outputs six '*' characters ... H1 outputs one.
		 */
		case 4 : putc('*', stdout);							/* H6 */
			/* fallthrough */
		case 5 : putc('*', stdout);							/* H5 */
			/* fallthrough */
		case 6 : putc('*', stdout);							/* H4 */
			/* fallthrough */
		case 7 : putc('*', stdout);							/* H3 */
			/* fallthrough */
		case 8 : putc('*', stdout);							/* H2 */
			/* fallthrough */
		case 9 : putc(lastc = '*', stdout);					/* H1 */
			break;
		case 10:											/* TD */
		case 11:											/* TH */
			putc(lastc = '\t', stdout);
			break;
	}
	/* Any other table entry is recognized but produces no output */
	return 0;
}

/*
 * Main program
 *
 * Each command line argument names a file which is read a character at
 * a time and copied to stdout. Text beginning with '<' is collected in a
 * buffer: if a '>' arrives it is passed to process_html(), and output
 * unchanged only if that function rejects it. If a newline or another
 * '<' arrives first, or the buffer fills, or the file ends, the collected
 * text is not a tag and is output unchanged.
 *
 * Returns 0 on success, 1 if no files were given or a file cannot be
 * opened.
 */
int main(int argc, char *argv[])
{
	int arg, c, flag;
	FILE *fp;
	char buffer[TAG_SIZE];

	if(argc < 2) {
		fputs("\nUse: UNHTML filename [>newfile]\n\n?COPY.TXT 1997-2003 Dave Dunfield\n -- see COPY.TXT --.\n", stderr);
		return 1; }

	for(arg = 1; arg < argc; ++arg) {
		if(!(fp = fopen(argv[arg], "r"))) {
			fputs("UNHTML: unable to open: ", stderr);
			fputs(argv[arg], stderr);
			putc('\n', stderr);
			return 1; }

		flag = 0;						/* Number of characters collected, 0 = not within <> */
		while((c = getc(fp)) >= 0) {
			/*
			 * No room left for this character and the terminator, so the
			 * collected text is too long to be a tag: output it as is.
			 */
			if(flag >= (TAG_SIZE-1)) {
				buffer[flag] = 0;
				fputs(buffer, stdout);
				lastc = buffer[flag-1];
				flag = 0; }

			if(!flag) {					/* Ordinary text */
				if(c == '<')
					buffer[flag++] = c;	/* Possible start of a tag */
				else
					putc(lastc = c, stdout);
				continue; }

			buffer[flag++] = c;
			switch(c) {
			case '\n' :		/* End of line */
			case '<' :		/* Nested brackets */
				buffer[flag] = 0;
				flag = 0;
				fputs(buffer, stdout);
				lastc = c;
				break;
			case '>' :		/* End of tag */
				buffer[flag] = 0;
				flag = 0;
				if(process_html(buffer)) {	/* Not recognized as a tag */
					fputs(buffer, stdout);
					lastc = c; }
				break; } }

		/* File ended inside an unterminated '<': do not lose the text */
		if(flag) {
			buffer[flag] = 0;
			fputs(buffer, stdout);
			lastc = buffer[flag-1]; }

		fclose(fp); }

	return 0;
}
