/* bad_tags.c   --   gcc -O2 bad_tags.c -o ~/bin/bad_tags

   written & Copyright 12/25/03 by Ray Lee <ray-badtags at madrabbit dot org>
   Latest version of this code will probably be linked to at
   http://madrabbit.org/~ray .



     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation; version 2, June 1991 of the License only.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     If you did not receive a copy of the GNU General Public License
     with this file, you may obtain one electronically by pointing your
     web browser at http://www.gnu.org/licenses/gpl.txt .



   Sample usage of bad_tags would be to drop it somewhere into your
   .procmailrc (or the system-wide /etc/procmailrc) as such:

	:0
	{
		BADTAG=`mimedecode | ${HOME}/bin/bad_tags`
		bad_tag_status = $?
		
		:0 Bfh
		* bad_tag_status ?? ^^1^^
		| formail -A "X-Bad-Tag-Detected: YES" -A "X-Bad-Tag: $BADTAG"
	}

	# do other things with the message here, such as run through
	# spamassassin, or whatever.
	
	:0
	* ^X-Bad-Tag-Detected: YES
	.spam/

   As you can see, bad_tags relies on mimedecode[1] being available. formail is
   optional, as all it's being used for is to tag the message -- the return
   code (bad_tag_status) could just as easily have been used, though you'd have
   to guess why the message was routed to your spam folder.

   (No guarantees that the above is optimal -- or even good -- procmail form.
   It works, but my procmail recipe-foo is too weak to know if there's a
   better way. The :0 and { } pair are purely for visual grouping. Remove
   'em if they bother you.)

          [1]	mimedecode is apt-getable on Debian systems, and should be
		available as an RPM in many places. As of this writing, sources
		can be obtained from http://www.freesoft.org/CIE/FAQ/mimedeco.c



   For people with spamassassin installed, you can integrate the result
   as a test in ~/.spamassassin/user_prefs:

	header   BAD_TAG X-Bad-Tag-Detected =~ /YES/
	describe BAD_TAG Body contains a bogus HTML tag
	score    BAD_TAG 5.0

   Note that for the above to work, your site administrator will have to enable
   custom user tests in /etc/spamassassin/local.cf ("allow_user_rules 1").
   Alternately, site administrators could place the test directly in the
   local site rulelist. See the spamassassin documentation for details.



   To test against a Maildir-style directory full of messages to see what
   fails, you can use something like this in bash:

   $ cd ~/Maildir/cur
   $ for i in *; do mimedecode <$i | bad_tags ; if [ $? != 0 ]; 
   >   then echo -n "$i: " && formail -c -x Subject <$i; fi; done 



   Though this has been tested on a corpus of ~10 000 interesting
   messages, the code will hit some false positives. The primary case
   is due to quoted HTML mail from Hotmail accounts where their code
   fails to properly escape text such as
   
    To:	Ray <ray@abc.def>         ....into the proper
    To:	Ray &lt;ray@abc.def&gt;

   There are other, less common failure modes which you may notice.
   I'll mention in passing that many of these are also due to
   Micro$oft products. Thank you, Micro$oft, may we have another?
   Regardless, this means that the results of processing should
   be used as another layer in spam filtering, rather than a
   definitive classification.



   No profiling of the code has been done beyond the squint-and-scowl
   method of optimization. But $5 says the code is spending most of
   its time inside getc().

*/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The notes at the top of this file guess that most of the run time is spent
   inside getc(), so under GCC every getc() call below is redirected to
   getc_unlocked().
   NOTE(review): getc_unlocked() comes from POSIX rather than from GCC
   itself -- confirm it is available on any non-POSIX GCC target. */
#ifdef __GNUC__
	#undef getc
	#define getc getc_unlocked
#endif

/* Number of elements in the array x.  x must be a real array object, not a
   pointer (a pointer would silently yield a meaningless count).  The
   argument is parenthesized so any array-valued expression expands safely. */
#define ARRAY_ENTRIES(x) (sizeof(x) / sizeof((x)[0]))

/* Tag names accepted as legitimate.  The list may not be complete: it was
   yanked quickly from http://www.w3.org/TR/REC-html40/index/elements.html,
   with a few others added by hand.

   INVARIANT: entries are upper case and strictly ascending in strcmp()
   order, because bad_tag() looks names up with bsearch().  Add new names at
   the position strcmp() dictates -- punctuation such as ':' and '-' sorts
   before letters, which is why "O:P" precedes "OBJECT" and "X-TAB"
   precedes "XML".

   Nothing writes to the table, so the array and its strings are const. */
const char *const valid_tags[] = {
	/* "!--" is absent on purpose: comments are parsed out before lookup */
	"!DOCTYPE",
	"![ENDIF]",	/* Thank you Micro$oft, may I please have another? */
	"![IF",		/* Hey, waitaminute! I was kidding! */
	"?XML:NAMESPACE",
	"A",
	"ABBR",
	"ACRONYM",
	"ADDRESS",
	"APPLET",
	"AREA",
	"B",
	"BASE",
	"BASEFONT",
	"BDO",
	"BIG",
	/* "BLINK" is absent on purpose: I can't bring myself to list this one. */
	"BLOCKQUOTE",
	"BODY",
	"BR",
	"BUTTON",
	"CAPTION",
	"CENTER",
	"CITE",
	"CODE",
	"COL",
	"COLGROUP",
	"DD",
	"DEL",
	"DFN",
	"DIR",
	"DIV",
	"DL",
	"DT",
	"EM",
	"EMBED",
	"FIELDSET",
	"FONT",
	"FORM",
	"FRAME",
	"FRAMESET",
	"H1",
	"H2",
	"H3",
	"H4",
	"H5",
	"H6",
	"HEAD",
	"HR",
	"HTML",
	"I",
	"IFRAME",
	"IMG",
	"INPUT",
	"INS",
	"ISINDEX",
	"KBD",
	/* "LAYER" is absent on purpose: obsolete now */
	"LI",
	"LINK",
	"MAP",
	/* "MARQUEE" is absent on purpose: also obsolete, I believe. */
	"MENU",
	"META",
	"NOBR",
	"NOFRAMES",
	"NOSCRIPT",
	"NOWRAP",
	"O:P",		/* Sigh, Them again */
	"O:SMARTTAGTYPE",	/* and again */
	"OBJECT",
	"OL",
	"OPTGROUP",
	"OPTION",
	"P",
	"PARAM",
	"PRE",
	"Q",
	"S",
	"SAMP",
	"SCRIPT",
	"SELECT",
	"SMALL",
	"SPACER",
	"SPAN",
	"STRIKE",
	"STRONG",
	"STYLE",
	"SUB",
	"SUP",
	"TABLE",
	"TBODY",
	"TD",
	"TEXTAREA",
	"TFOOT",
	"TH",
	"THEAD",
	"TITLE",
	"TR",
	"TT",
	"U",
	"UL",
	"VAR",
	"X-SIGSEP",
	"X-TAB",
	"XML",
};

/* Policy settings read by main(); nothing visible in this file modifies
   them after initialization. */

/* Non-zero: print the offending tag name before exiting with status 1. */
int show_failing_tag = 1;

/* How many '@'-containing pseudo-tags inside HTML are skipped without being
   looked up (mime content types look like email, sigh). */
int max_emails = 10;

/* Number of unknown tags that must be seen before the program gives up and
   reports the message as bad. */
int bad_tag_count_threshold = 1;

/* Scanner position used by main(): either collecting a tag name that
   follows a '<', or reading ordinary text. */
enum tag_states {
	inside_a_tag,
	outside_a_tag
};

/* bsearch() comparison callback for a table of string pointers.
   a: the search key, passed straight through as a NUL-terminated string.
   b: address of one table element, i.e. a pointer to a string pointer.
   Returns the strcmp() ordering of the key relative to that element. */
int compare(const void *a, const void *b) {
	const char *key = a;
	const char *const *element = b;

	return strcmp(key, *element);
}

/* Decide whether a tag name is unknown.
   tag: NUL-terminated tag name (main() upper-cases it before calling).
   Returns 0 when the name begins with "ST1:" or is present in valid_tags,
   and 1 otherwise. */
int bad_tag(char *tag) {
	static const char smart_tag_prefix[] = "ST1:";	/* sigh, microsoft 'Smart Tags' */
	const void *match;

	if (strncmp(smart_tag_prefix, tag, sizeof smart_tag_prefix - 1) == 0)
		return 0;

	/* binary search: depends on valid_tags being sorted in strcmp() order */
	match = bsearch(tag, valid_tags, ARRAY_ENTRIES(valid_tags),
			sizeof(valid_tags[0]), compare);
	return match == NULL;
}

/* Consume characters from fp up to and including the next "-->", or up to
   end of input when no terminator follows.

   The three most recently read bytes are kept packed in last_three (oldest
   in bits 16-23, newest in bits 0-7) and compared with the packed
   terminator after every read.  The window starts out empty, so dashes
   read before this call do not count toward the terminator. */
void skip_comment(FILE *fp) {
	const unsigned long end_pattern = ((unsigned long)'-' << 16)
					| ((unsigned long)'-' << 8)
					| (unsigned long)'>';
	unsigned long last_three = 0;
	int c;

	/* Test for EOF before touching the window so the EOF sentinel is never
	   folded in as if it were data. */
	while ((c = getc(fp)) != EOF) {
		/* Go through unsigned char, not plain char: where char is signed,
		   a byte >= 0x80 would sign-extend and smear ones over the rest
		   of the window. */
		last_three = ((last_three & 0xffffUL) << 8) | (unsigned char)c;
		if (last_three == end_pattern)
			break;
	}
}

/* Read and discard characters from fp until one equal to target has been
   consumed or the stream returns EOF, whichever happens first. */
void skip_stream_past( FILE *fp, int target ) {
	int ch;

	do {
		ch = getc(fp);
	} while (ch != EOF && ch != target);
}

/* Scan stdin for tags whose names are not accepted by bad_tag().

   Nothing is checked until a tag named HTML has been seen.  From then on:
     - "<!--" starts a comment, which is skipped through its "-->";
     - a tag containing '@' is skipped unchecked, up to max_emails times;
     - every other tag name (upper-cased, '/' removed, cut at the first
       whitespace or '>') is looked up with bad_tag().

   Returns 1 as soon as bad_tag_count_threshold unknown tags have been seen,
   first printing the latest offender followed by a space when
   show_failing_tag is set; returns 0 if end of input is reached first. */
int main(void) {
	char tag[25];		/* tag name being collected; longer names are truncated */
	size_t tag_pos = 0;	/* next free slot in tag[]; reset at every '<' */
	int in_html = 0;	/* becomes 1 once a tag named HTML has been seen */
	int emails_seen = 0;	/* '@' tags examined while in_html */
	int bad_tags_seen = 0;
	int c;
	enum tag_states state = outside_a_tag;
	FILE *fp = stdin;

	for (c = getc(fp); c != EOF; c = getc(fp)) {
		if (state == outside_a_tag) {
			if (c == '<') {
				state = inside_a_tag;
				tag_pos = 0;
			}
			continue;
		}

		/* state == inside_a_tag from here on */
		switch (c) {
			case '/':
				/* dropped, so </B> and <BR/> are looked up as B and BR */
				break;
			case '@':
				/* looks like <user@host>: forgive the first max_emails */
				if (in_html && emails_seen++ < max_emails) {
					skip_stream_past(fp, '>');
					state = outside_a_tag;
					break;
				}
				/* fallthrough -- otherwise '@' ends the name like whitespace */
			/* the next line is isspace() open-coded */
			case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
				/* the name is complete; discard the attributes */
				skip_stream_past(fp, '>');
				/* fallthrough */
			case '>':
				state = outside_a_tag;
				tag[tag_pos] = '\0';
				if (in_html && bad_tag(tag)) {
					if (++bad_tags_seen >= bad_tag_count_threshold) {
						if (show_failing_tag)
							fprintf(stdout, "%s ", tag);
						return 1;
					}
				}
				/* '/' was dropped above, so </HTML> matches here as well */
				if (!strcmp(tag, "HTML"))
					in_html = 1;
				break;
			default:
				/* c came from getc() and is not EOF, so it is a valid
				   toupper() argument; keep one slot for the terminator */
				if (tag_pos < sizeof(tag) - 1)
					tag[tag_pos++] = toupper(c);
				if (in_html && tag_pos == 3 && !strncmp("!--", tag, 3)) {
					skip_comment(fp);
					state = outside_a_tag;
				}
				break;
		}
	}
	return 0;
}

