/* * The contents of this file are subject to the Mozilla Public License Version 1.1 * (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at . * * Software distributed under the License is distributed on an "AS IS" basis, WITHOUT * WARRANTY OF ANY KIND, either express or implied. See the License for the specific * language governing rights and limitations under the License. * * The Original Code is the Venice Web Community System. * * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck.impl; import java.util.*; import com.silverwrist.venice.htmlcheck.*; class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServices { /*-------------------------------------------------------------------------------- * Wrapper for Rewriters to implement counting where necessary *-------------------------------------------------------------------------------- */ static class CountingRewriter implements Rewriter { private Rewriter inner; private int count = 0; public CountingRewriter(Rewriter inner) { this.inner = inner; } // end constructor public String getName() { return inner.getName(); } // end getName public MarkupData rewrite(String data, RewriterServices svc) { MarkupData rc = inner.rewrite(data,svc); if (rc!=null) count++; return rc; } // end rewrite public int getCount() { return count; } // end getCount public void reset() { count = 0; } // end reset } // end class CountingRewriter /*-------------------------------------------------------------------------------- * State machine constants *-------------------------------------------------------------------------------- */ private static final short ST_WHITESPACE = 0; private static final short ST_CHARS 
= 1;       // (value of ST_CHARS) character data is accumulating; finish() flushes it with doFlushString()
  // The initial state is ST_WHITESPACE; it is also re-entered after a tag or paren element is handled.
  private static final short ST_LEFTANGLE = 2;   // finish() emits a literal '<' when input ends in this state
  private static final short ST_TAG = 3;         // tag text is accumulating; ST_TAGQUOTE returns here on the close quote
  private static final short ST_PAREN = 4;       // inside parentheses, building up a paren element
  private static final short ST_TAGQUOTE = 5;    // inside a quoted string within a tag, until quote_char is seen again

  /*--------------------------------------------------------------------------------
   * Attributes
   *--------------------------------------------------------------------------------
   */

  private HTMLCheckerConfigImpl config;           // the configuration we're working from
  private boolean started = false;                // have we started parsing?
  private boolean finished = false;               // have we finished parsing?
  private short state = ST_WHITESPACE;            // current parser state
  private char quote_char;                        // quote character to match
  private int paren_level = 0;                    // parenthesization level
  private short columns = 0;                      // current number of columns
                                                  // (zeroed by emitLineBreak() when word wrap is enabled)
  private int lines = 0;                          // current number of lines
                                                  // (bumped by emitLineBreak() and once more in finish())
  private int nobreak_count = 0;                  // "no-break" count
                                                  // (raised/lowered by the "NOBR" and "/NOBR" tag messages)
  private boolean trigger_WBR = false;            // word break trigger
                                                  // (set by the "WBR" tag message, cleared at the start of handleAsHTML())
  private StringBuffer output_buffer = null;      // output buffer for characters
  private StringBuffer temp_buffer = null;        // temporary buffer used within one state
  private Vector tag_stack = null;                // stack of tags that have been opened
                                                  // (holds Integer tag indices; the last element is the most recent)
  private Hashtable counters = new Hashtable();   // the CountingRewriter instances
                                                  // (getCounter() looks them up by name)
  private Vector string_rewriters = new Vector(); // string rewriter instances
  private Vector word_rewriters = new Vector();   // word rewriter instances
  private Vector tag_rewriters = new Vector();    // tag rewriter instances
  private Vector paren_rewriters = new Vector();  // paren rewriter instances
  private Hashtable context_data = new Hashtable(); // context variables

  /*--------------------------------------------------------------------------------
   * Constructor
   *--------------------------------------------------------------------------------
   */

  // Builds a checker bound to the given configuration and copies that
  // configuration's rewriter lists into this instance's own vectors.
  HTMLCheckerImpl(HTMLCheckerConfigImpl config)
  {
    TagRepository.init();   // NOTE(review): presumably a one-time tag table setup - confirm in TagRepository
    this.config = config;
    copyRewriters(string_rewriters,config.getStringRewriters());
copyRewriters(word_rewriters,config.getWordRewriters()); copyRewriters(tag_rewriters,config.getTagRewriters()); copyRewriters(paren_rewriters,config.getParenRewriters()); } // end constructor /*-------------------------------------------------------------------------------- * Internal functions *-------------------------------------------------------------------------------- */ private static final boolean isWordChar(char ch) { return (Character.isUpperCase(ch) || Character.isLowerCase(ch) || (ch=='-')); } // end isWordChar private static final int getRunLength(StringBuffer buf) { boolean word_char = isWordChar(buf.charAt(0)); int l = 1; while (l0) { // look through all the output filters to see what we can do Iterator it = filters.iterator(); while (!handled && it.hasNext()) { // look for an output filter that matches this character OutputFilter of = (OutputFilter)(it.next()); handled = of.tryOutputCharacter(output_buffer,ch); } // end while } // end if if (!handled) // output the character output_buffer.append(ch); if (count_cols && (config.getWordWrapLength()>0)) columns++; // adjust column indicator } // end emitChar private void emitString(String str, List filters, boolean count_cols) { boolean real_count_cols = count_cols && (config.getWordWrapLength()>0); if (filters.size()==0) { // if there are no filters, just append the entire string directly output_buffer.append(str); if (real_count_cols) columns += (short)(str.length()); return; } // end if String temp = str; while ((temp!=null) && (temp.length()>0)) { // We output as much of the string as we possibly can at once. For starters, // assume we're going to output the whole thing. int output_len = temp.length(); // Now look at each of the output filters to see if we should try outputting // a lesser amount (i.e., does the string contain a "stopper" that one of the // output filters would like to mogrify?) 
Iterator it = filters.iterator(); OutputFilter stopper = null; while (it.hasNext() && (output_len>0)) { // look to find the length of characters that doesn't match this filter OutputFilter of = (OutputFilter)(it.next()); int lnm = of.lengthNoMatch(temp); if ((lnm>=0) && (lnm0) { // At least this many unaltered characters can be output, so copy them. output_buffer.append(temp.substring(0,output_len)); if (real_count_cols) columns += (short)output_len; } // end if if (stopper!=null) { // one of the output filters stopped us, so invoke it char tmpch = temp.charAt(output_len++); if (!(stopper.tryOutputCharacter(output_buffer,tmpch))) output_buffer.append(tmpch); if (real_count_cols) columns++; } // end if // Chop the string down the middle and go around again. if (output_len==temp.length()) temp = null; else if (output_len>0) temp = temp.substring(output_len); } // end while (still data left to append) } // end emitString private void emitLineBreak() { emitString("\r\n",config.getRawOutputFilters(),false); if (config.getWordWrapLength()>0) columns = 0; lines++; } // end emitLineBreak private void emitPossibleLineBreak() { if ((config.getWordWrapLength()>0) && (nobreak_count<=0) && (columns>=config.getWordWrapLength())) emitLineBreak(); } // end emitPossibleLineBreak private void ensureSpaceOnLine(int nchars) { if ((config.getWordWrapLength()>0) && (nobreak_count<=0)) { // line break might be required here int remain_space = (int)(config.getWordWrapLength() - columns); if (remain_space0) { // calculate where the next line break is int line_break = temp_buffer.toString().indexOf('\n'); int output_len = line_break; if (output_len<0) output_len = temp_buffer.length(); if ((config.getWordWrapLength()>0) && (nobreak_count<=0)) { // adjust output if necessary for word wrapping int remain_space = (int)(config.getWordWrapLength() - columns); if (remain_space0) emitString(temp_buffer.substring(0,output_len),config.getOutputFilters(),true); if (line_break>=0) { // there's a line 
break present - emit the line break emitLineBreak(); // output the line break character if (++line_break0) && (nobreak_count<=0)) { // we can output the line break anywhere in the subsequence... while (nchars>0) { // figure out how many characters we can output WITHOUT causing a line break int curlen = nchars; int remaining_space = (int)(config.getWordWrapLength() - columns); if (curlen>remaining_space) curlen = remaining_space; // output those characters emitString(temp_buffer.substring(0,curlen),config.getOutputFilters(),true); temp_buffer.delete(0,curlen); nchars -= curlen; if (columns==config.getWordWrapLength()) emitLineBreak(); // and line break us to get to the next line } // end while } // end if else { // just output the run of characters straight emitString(temp_buffer.substring(0,nchars),config.getOutputFilters(),true); temp_buffer.delete(0,nchars); } // end else } // end emitFromStartOfTempBuffer private void doFlushString() { MarkupData md = attemptRewrite(string_rewriters,temp_buffer.toString()); if (md!=null) { // we've got something marked up! output it... emitMarkupData(md); temp_buffer.setLength(0); return; } // end if while (temp_buffer.length()>0) { // find the length of the initial string of word or non-word characters int sublen = getRunLength(temp_buffer); if (isWordChar(temp_buffer.charAt(0))) { // we need to check the word...but first, we must eliminate leading hyphens int hyph_count = 0; while ((hyph_count0) && (temp_buffer.charAt(word_len-1)=='-')) { // decrement word length, increment hyphen count hyph_count++; word_len--; } // end while if (word_len>0) { // extract the word from the start of the buffer String word = temp_buffer.substring(0,word_len); temp_buffer.delete(0,word_len); // try to rewrite this word... 
md = attemptRewrite(word_rewriters,word); if (md!=null) emitMarkupData(md); else { // just output the word normally ensureSpaceOnLine(word.length()); emitString(word,config.getOutputFilters(),true); } // end else } // end if // now emit the rest of the hyphens emitFromStartOfTempBuffer(hyph_count); } // end if else // just emit this many characters, line-breaking where required emitFromStartOfTempBuffer(sublen); } // end while } // end doFlushString private boolean handleAsHTML() { trigger_WBR = false; // initialize // Figure out the place in the buffer where the command word starts. int start_cmd = 0; boolean closing_tag = false; if ((start_cmdTagRepository.getMaxTagLength())) return false; // the command word is empty or is just too long to be an HTML tag // Look up the tag name to get a tag index from the repository. int tag_index = TagRepository.tagNameToIndex(temp_buffer.substring(start_cmd,end_cmd)); if (tag_index<0) return false; // not a known HTML tag // Look up the tag object that corresponds to the tag index. SimpleTag tagobj = TagRepository.tagIndexToObject(tag_index); if (closing_tag && !(tagobj.allowClose())) return false; // this is a closing tag, and this tag doesn't permit the "close" form // Get the HTML tag set index for this tag, and see if we allow that set. int tag_set_id = TagRepository.tagIndexToSet(tag_index); if (!(config.isTagSetAllowed(tag_set_id)) && !(config.getDiscardHTMLTags())) return false; // we're not allowing it, we're not discarding it, so punt! boolean valid = false; if (!(config.getDiscardHTMLTags()) && tagobj.balanceTags()) { // this tag needs to be balanced - here is where we manipulate the stack if (closing_tag) { // hunt through the list to find the most recently-opened tag of this type int i = tag_stack.size() - 1; while (i>=0) { // look through the stack... 
Integer foo = (Integer)(tag_stack.get(i));
          if (foo.intValue()==tag_index) {
            // found it - remove it from the tag stack
            tag_stack.remove(i);
            valid = true;
            break;
          } // end if

          // Not this entry: step down toward the bottom of the stack.  Without this
          // decrement the loop would spin forever whenever the closing tag does not
          // match the most recently opened tag.
          i--;

        } // end while

      } // end if
      else {
        // push a new opening tag!
        tag_stack.add(new Integer(tag_index));
        valid = true;

      } // end else

    } // end if
    // else tag doesn't need to be auto-balanced, or is being discarded

    if (!valid && !(config.getDiscardHTMLTags()))
      return false;  // not validated by the stack code, and not being discarded

    // Give the tag object one last chance to dictate what we do with the tag.
    String real_tag_data = tagobj.rewriteTagContents(temp_buffer.toString(),closing_tag,this);
    if ((real_tag_data==null) || config.getDiscardHTMLTags())
      return true;  // tag is being erased by rewriter, or is being discarded anyway

    // Emit the tag to the output.  The raw output filters are used and the
    // characters are not counted against the current column.
    emitChar('<',config.getRawOutputFilters(),false);
    emitString(real_tag_data,config.getRawOutputFilters(),false);
    emitChar('>',config.getRawOutputFilters(),false);

    // Determine whether this tag causes a "logical line break."  A WBR trigger
    // raised while rewriting an opening tag inside a no-break region counts as
    // one; otherwise the tag object decides.
    boolean logical_line_break = false;
    if (trigger_WBR && !closing_tag && (nobreak_count>0))
      logical_line_break = true;
    else
      logical_line_break = tagobj.causeLineBreak(closing_tag);
    if (logical_line_break)
      columns = 0;  // a logical line break restarts the column count

    return true;  // handled!

  } // end handleAsHTML()

  /**
   * Called when the '>' that ends a tag has been seen.  Tries, in order, to treat
   * the buffered tag text as a real HTML tag, then as input for one of the tag
   * rewriters; if neither accepts it, the text is re-parsed as ordinary character
   * data (including the surrounding angle brackets).
   */
  private void finishTag() {
    if (handleAsHTML()) {
      // the tag has been handled as an HTML tag - bail out immediately
      temp_buffer.setLength(0);
      state = ST_WHITESPACE;
      return;

    } // end if

    // now try to handle it using a tag rewriter
    MarkupData md = attemptRewrite(tag_rewriters,temp_buffer.toString());
    if (md!=null) {
      // we've got something marked up! output it...
      emitMarkupData(md,'<','>');
      temp_buffer.setLength(0);
      state = ST_WHITESPACE;
      return;

    } // end if

    // This tag has been rejected! We need to process it normally, as character data.
String rejection = temp_buffer.toString(); temp_buffer.setLength(0); temp_buffer.append('<'); state = ST_CHARS; if (rejection.length()>0) parse(rejection); // just run it through the parser, now that we've fixed up the state parse(">"); } // end finishTag private void finishParen() { // Try to handle the paren element using a paren rewriter. MarkupData md = attemptRewrite(paren_rewriters,temp_buffer.toString()); if (md!=null) { // we've got something marked up! output it... emitMarkupData(md,'(',')'); temp_buffer.setLength(0); state = ST_WHITESPACE; paren_level = 0; return; } // end if // This tag has been rejected! We need to process it normally, as character data. String rejection = temp_buffer.toString(); temp_buffer.setLength(0); temp_buffer.append('('); state = ST_CHARS; paren_level = 0; if (rejection.length()>0) parse(rejection); // just run it through the parser, now that we've fixed up the state parse(")"); } // end finishParen private void parse(String str) { int i = 0; while (i': // end tag finishTag(); // this changes the state, and maybe calls parse() recursively i++; break; case '\'': // go into "quote string" mode inside tag case '\"': temp_buffer.append(ch); state = ST_TAGQUOTE; quote_char = ch; i++; break; default: // just append more data to the temp buffer temp_buffer.append(ch); i++; break; } // end switch } // end case break; case ST_PAREN: { // inside parentheses - try to build something up switch (ch) { case '(': // append the open parenthesis and kick it up a notch! 
temp_buffer.append(ch); paren_level++; i++; break; case ')': if (paren_level==0) finishParen(); // will change the parser state else { // append the close parenthesis and kick it DOWN a notch temp_buffer.append(ch); paren_level--; } // end else i++; break; default: temp_buffer.append(ch); i++; break; } // end switch } // end case break; case ST_TAGQUOTE: temp_buffer.append(ch); if (ch==quote_char) // close quote seen - go back to ST_TAG state state = ST_TAG; i++; break; default: throw new IllegalStateException("invalid parser state value"); } // end switch } // end while (looking through string) } // end parse /*-------------------------------------------------------------------------------- * Implementations from interface HTMLChecker *-------------------------------------------------------------------------------- */ public void append(String str) throws AlreadyFinishedException { if (finished) throw new AlreadyFinishedException(); if (!started) { // initialize the parser state initState(); started = true; } // end if parse(str); // parse things } // end append public void finish() throws AlreadyFinishedException { if (finished) throw new AlreadyFinishedException(); if (!started) { // set up the initial parser state (so we don't kill ourselves later) initState(); started = true; } // end if // This is the "end parse" loop, in which we resolve any funny state the parser has // found itself in and clear out the internal buffers. boolean running = false; do { // what we do depends on the parser state... 
switch (state) { case ST_WHITESPACE: break; // discard any whitespace at the end of output case ST_CHARS: doFlushString(); // flush out the temporary buffer break; case ST_LEFTANGLE: // just emit a '<' character emitPossibleLineBreak(); emitChar('<',config.getOutputFilters(),true); break; case ST_TAG: case ST_TAGQUOTE: { // we won't finish this tag, so it's automagically rejected String rejection = temp_buffer.toString(); temp_buffer.setLength(0); temp_buffer.append('<'); state = ST_CHARS; // now parse the tag contents again if (rejection.length()>0) parse(rejection); running = true; // go back around for another try } // end case break; case ST_PAREN: { // we won't finish this paren tag, so it's automagically rejected String rejection = temp_buffer.toString(); temp_buffer.setLength(0); temp_buffer.append('('); state = ST_CHARS; paren_level = 0; // now parse the parenthesis contents again if (rejection.length()>0) parse(rejection); running = true; // go back around for another try } // end case break; } // end switch } while (running); // end do // Now close all the HTML tags that were left open. for (int i=(tag_stack.size()-1); i>=0; i--) { // get each element in the tag stack and append the appropriate closing tag Integer foo = (Integer)(tag_stack.get(i)); SimpleTag tagobj = TagRepository.tagIndexToObject(foo.intValue()); output_buffer.append(tagobj.makeClosingTag()); } // end for // deallocate some excess crap and mark the object as finished killState(); lines++; // there's one extra line at the end finished = true; } // end finish public void reset() { started = false; finished = false; trigger_WBR = false; state = ST_WHITESPACE; quote_char = '\0'; columns = 0; lines = 0; paren_level = 0; output_buffer = null; killState(); // Also reset all the counters. 
Iterator it = counters.values().iterator();
    while (it.hasNext()) {
      // reset each counter in turn
      CountingRewriter cr = (CountingRewriter)(it.next());
      cr.reset();

    } // end while

  } // end reset

  /**
   * @return the complete checked output text
   * @throws NotYetFinishedException if finish() has not been called yet
   */
  public String getValue() throws NotYetFinishedException {
    if (!finished)
      throw new NotYetFinishedException();
    return output_buffer.toString();

  } // end getValue

  /**
   * @return the length, in characters, of the checked output text
   * @throws NotYetFinishedException if finish() has not been called yet
   */
  public int getLength() throws NotYetFinishedException {
    if (!finished)
      throw new NotYetFinishedException();
    return output_buffer.length();

  } // end getLength

  /**
   * @return the number of lines counted in the output
   * @throws NotYetFinishedException if finish() has not been called yet
   */
  public int getLines() throws NotYetFinishedException {
    if (!finished)
      throw new NotYetFinishedException();
    return lines;

  } // end getLines

  /**
   * Looks up the counting rewriter registered under the given name.
   *
   * @param name the counter name
   * @return that rewriter's count, or 0 if no counter is registered under the name
   * @throws NotYetFinishedException if finish() has not been called yet
   */
  public int getCounter(String name) throws NotYetFinishedException {
    if (!finished)
      throw new NotYetFinishedException();

    CountingRewriter cr = (CountingRewriter)(counters.get(name));
    if (cr==null)
      return 0;
    else
      return cr.getCount();

  } // end getCounter

  /** Returns the context variable stored under the given name, or null if there is none. */
  public Object getContextValue(String name) {
    return context_data.get(name);

  } // end getContextValue

  /** Stores a context variable under the given name, replacing any previous value. */
  public void setContextValue(String name, Object val) {
    context_data.put(name,val);

  } // end setContextValue

  /*--------------------------------------------------------------------------------
   * Implementations from interface HTMLCheckerBackend
   *--------------------------------------------------------------------------------
   */

  /**
   * Returns the value of a named checker attribute.  The only attribute defined
   * is "ANCHORTAIL", which comes from the configuration.
   *
   * @param name the attribute name
   * @return the attribute value
   * @throws IllegalArgumentException if the attribute name is not recognized
   */
  public String getCheckerAttrValue(String name) {
    // Compare by content, not by reference: a name that was built at run time
    // is not the same object as the literal even when the text is equal.
    if ("ANCHORTAIL".equals(name))
      return config.getAnchorTail();
    throw new IllegalArgumentException("attribute \"" + name + "\" is not defined");

  } // end getCheckerAttrValue

  /**
   * Receives a message sent by a tag object.  Recognized messages are "NOBR"
   * (enter a no-break region), "/NOBR" (leave a no-break region) and "WBR"
   * (trigger a word break).
   *
   * @param msg the message name
   * @throws IllegalArgumentException if the message is not recognized
   */
  public void sendTagMessage(String msg) {
    // Literal-first equals() compares by content and is also safe for a null msg,
    // which falls through to the IllegalArgumentException below.
    if ("NOBR".equals(msg)) {
      // increment the no-break count
      nobreak_count++;
      return;

    } // end if

    if ("/NOBR".equals(msg)) {
      // decrement the no-break count
      nobreak_count--;
      return;

    } // end if

    if ("WBR".equals(msg)) {
      // trigger a word break
      trigger_WBR = true;
      return;

    } // end if

    throw new IllegalArgumentException("message \"" + msg + "\" is not defined");

  } // end sendTagMessage

  /** Returns the context variable stored under the given name, or null if there is none. */
  public Object getCheckerContextValue(String name) {
    return context_data.get(name);

  } // end getCheckerContextValue

  /*--------------------------------------------------------------------------------
   * Implementations from interface RewriterServices
   *--------------------------------------------------------------------------------
   */

  /**
   * Same lookup as getCheckerAttrValue(), exposed to rewriters.
   *
   * @throws IllegalArgumentException if the attribute name is not recognized
   */
  public String getRewriterAttrValue(String name) {
    return getCheckerAttrValue(name);

  } // end getRewriterAttrValue

  /** Returns the context variable stored under the given name, or null if there is none. */
  public Object getRewriterContextValue(String name) {
    return context_data.get(name);

  } // end getRewriterContextValue

} // end class HTMLCheckerImpl