1112 lines
30 KiB
Java
1112 lines
30 KiB
Java
/*
|
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
|
* (the "License"); you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at <http://www.mozilla.org/MPL/>.
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis, WITHOUT
|
|
* WARRANTY OF ANY KIND, either express or implied. See the License for the specific
|
|
* language governing rights and limitations under the License.
|
|
*
|
|
* The Original Code is the Venice Web Community System.
|
|
*
|
|
* The Initial Developer of the Original Code is Eric J. Bowersox <erbo@silcom.com>,
|
|
* for Silverwrist Design Studios. Portions created by Eric J. Bowersox are
|
|
* Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
*/
|
|
package com.silverwrist.venice.htmlcheck.impl;
|
|
|
|
import java.util.*;
|
|
import com.silverwrist.venice.htmlcheck.*;
|
|
|
|
class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServices
|
|
{
|
|
/*--------------------------------------------------------------------------------
|
|
* Wrapper for Rewriters to implement counting where necessary
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
static class CountingRewriter implements Rewriter
|
|
{
|
|
private Rewriter inner;
|
|
private int count = 0;
|
|
|
|
public CountingRewriter(Rewriter inner)
|
|
{
|
|
this.inner = inner;
|
|
|
|
} // end constructor
|
|
|
|
public String getName()
|
|
{
|
|
return inner.getName();
|
|
|
|
} // end getName
|
|
|
|
public MarkupData rewrite(String data, RewriterServices svc)
|
|
{
|
|
MarkupData rc = inner.rewrite(data,svc);
|
|
if (rc!=null)
|
|
count++;
|
|
return rc;
|
|
|
|
} // end rewrite
|
|
|
|
public int getCount()
|
|
{
|
|
return count;
|
|
|
|
} // end getCount
|
|
|
|
public void reset()
|
|
{
|
|
count = 0;
|
|
|
|
} // end reset
|
|
|
|
} // end class CountingRewriter
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* State machine constants
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
private static final short ST_WHITESPACE = 0;
|
|
private static final short ST_CHARS = 1;
|
|
private static final short ST_LEFTANGLE = 2;
|
|
private static final short ST_TAG = 3;
|
|
private static final short ST_PAREN = 4;
|
|
private static final short ST_TAGQUOTE = 5;
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Attributes
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
private HTMLCheckerConfigImpl config; // the configuration we're working from
|
|
private boolean started = false; // have we started parsing?
|
|
private boolean finished = false; // have we finished parsing?
|
|
private short state = ST_WHITESPACE; // current parser state
|
|
private char quote_char; // quote character to match
|
|
private int paren_level = 0; // parenthesization level
|
|
private short columns = 0; // current number of columns
|
|
private int lines = 0; // current number of lines
|
|
private int nobreak_count = 0; // "no-break" count
|
|
private boolean trigger_WBR = false; // word break trigger
|
|
private StringBuffer output_buffer = null; // output buffer for characters
|
|
private StringBuffer temp_buffer = null; // temporary buffer used within one state
|
|
private Vector tag_stack = null; // stack of tags that have been opened
|
|
private Hashtable counters = new Hashtable(); // the CountingRewriter instances
|
|
private Vector string_rewriters = new Vector(); // string rewriter instances
|
|
private Vector word_rewriters = new Vector(); // word rewriter instances
|
|
private Vector tag_rewriters = new Vector(); // tag rewriter instances
|
|
private Vector paren_rewriters = new Vector(); // paren rewriter instances
|
|
private Hashtable context_data = new Hashtable(); // context variables
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Constructor
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
/**
 * Creates a checker that works from the given configuration.  Calls
 * TagRepository.init(), stores the configuration, and copies each of the
 * configuration's four rewriter lists into this instance.
 *
 * @param config the configuration to work from
 */
HTMLCheckerImpl(HTMLCheckerConfigImpl config)
{
  TagRepository.init();
  this.config = config;

  // one local list per rewriter category
  copyRewriters(string_rewriters,config.getStringRewriters());
  copyRewriters(word_rewriters,config.getWordRewriters());
  copyRewriters(tag_rewriters,config.getTagRewriters());
  copyRewriters(paren_rewriters,config.getParenRewriters());

} // end constructor
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Internal functions
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
private static final boolean isWordChar(char ch)
|
|
{
|
|
return (Character.isUpperCase(ch) || Character.isLowerCase(ch) || (ch=='-'));
|
|
|
|
} // end isWordChar
|
|
|
|
private static final int getRunLength(StringBuffer buf)
|
|
{
|
|
boolean word_char = isWordChar(buf.charAt(0));
|
|
int l = 1;
|
|
while (l<buf.length())
|
|
{ // see if there's a break from word characters to non-word characters
|
|
if (isWordChar(buf.charAt(l))!=word_char)
|
|
break;
|
|
l++;
|
|
|
|
} // end while
|
|
|
|
return l;
|
|
|
|
} // end getStringRunLength
|
|
|
|
private void copyRewriters(Vector dest, List source)
|
|
{
|
|
Iterator it = source.iterator();
|
|
while (it.hasNext())
|
|
{ // get each rewriter, and wrap it if it has a name
|
|
Rewriter r = (Rewriter)(it.next());
|
|
String name = r.getName();
|
|
if (r!=null)
|
|
{ // wrap it in a CountingRewriter and hash it...
|
|
CountingRewriter cr = new CountingRewriter(r);
|
|
counters.put(name,cr);
|
|
r = cr;
|
|
|
|
} // end if
|
|
|
|
dest.add(r);
|
|
|
|
} // end while
|
|
|
|
dest.trimToSize();
|
|
|
|
} // end copyRewriters
|
|
|
|
private MarkupData attemptRewrite(List rewriters, String data)
|
|
{
|
|
Iterator it = rewriters.iterator();
|
|
MarkupData rc = null;
|
|
while ((rc==null) && it.hasNext())
|
|
{ // look at each rewriter in turn and try seeing if it will mark this text up
|
|
Rewriter r = (Rewriter)(it.next());
|
|
rc = r.rewrite(data,this);
|
|
|
|
} // end while
|
|
|
|
return rc;
|
|
|
|
} // end attemptRewrite
|
|
|
|
private void initState()
|
|
{
|
|
output_buffer = new StringBuffer(1024);
|
|
temp_buffer = new StringBuffer(64);
|
|
tag_stack = new Vector();
|
|
|
|
} // end initState
|
|
|
|
private void killState()
|
|
{
|
|
temp_buffer = null;
|
|
if (tag_stack!=null)
|
|
tag_stack.removeAllElements();
|
|
tag_stack = null;
|
|
|
|
} // end killState
|
|
|
|
private void emitChar(char ch, List filters, boolean count_cols)
|
|
{
|
|
boolean handled = false;
|
|
if (filters.size()>0)
|
|
{ // look through all the output filters to see what we can do
|
|
Iterator it = filters.iterator();
|
|
while (!handled && it.hasNext())
|
|
{ // look for an output filter that matches this character
|
|
OutputFilter of = (OutputFilter)(it.next());
|
|
handled = of.tryOutputCharacter(output_buffer,ch);
|
|
|
|
} // end while
|
|
|
|
} // end if
|
|
|
|
if (!handled) // output the character
|
|
output_buffer.append(ch);
|
|
if (count_cols && (config.getWordWrapLength()>0))
|
|
columns++; // adjust column indicator
|
|
|
|
} // end emitChar
|
|
|
|
private void emitString(String str, List filters, boolean count_cols)
|
|
{
|
|
boolean real_count_cols = count_cols && (config.getWordWrapLength()>0);
|
|
|
|
if (filters.size()==0)
|
|
{ // if there are no filters, just append the entire string directly
|
|
output_buffer.append(str);
|
|
if (real_count_cols)
|
|
columns += (short)(str.length());
|
|
return;
|
|
|
|
} // end if
|
|
|
|
String temp = str;
|
|
while ((temp!=null) && (temp.length()>0))
|
|
{ // We output as much of the string as we possibly can at once. For starters,
|
|
// assume we're going to output the whole thing.
|
|
int output_len = temp.length();
|
|
|
|
// Now look at each of the output filters to see if we should try outputting
|
|
// a lesser amount (i.e., does the string contain a "stopper" that one of the
|
|
// output filters would like to mogrify?)
|
|
Iterator it = filters.iterator();
|
|
OutputFilter stopper = null;
|
|
while (it.hasNext() && (output_len>0))
|
|
{ // look to find the length of characters that doesn't match this filter
|
|
OutputFilter of = (OutputFilter)(it.next());
|
|
int lnm = of.lengthNoMatch(temp);
|
|
if ((lnm>=0) && (lnm<output_len))
|
|
{ // we've found a new stopper - record the length and the filter
|
|
output_len = lnm;
|
|
stopper = of;
|
|
|
|
} // end if
|
|
|
|
} // end while (looking through filters)
|
|
|
|
if (output_len>0)
|
|
{ // At least this many unaltered characters can be output, so copy them.
|
|
output_buffer.append(temp.substring(0,output_len));
|
|
if (real_count_cols)
|
|
columns += (short)output_len;
|
|
|
|
} // end if
|
|
|
|
if (stopper!=null)
|
|
{ // one of the output filters stopped us, so invoke it
|
|
char tmpch = temp.charAt(output_len++);
|
|
if (!(stopper.tryOutputCharacter(output_buffer,tmpch)))
|
|
output_buffer.append(tmpch);
|
|
if (real_count_cols)
|
|
columns++;
|
|
|
|
} // end if
|
|
|
|
// Chop the string down the middle and go around again.
|
|
if (output_len==temp.length())
|
|
temp = null;
|
|
else if (output_len>0)
|
|
temp = temp.substring(output_len);
|
|
|
|
} // end while (still data left to append)
|
|
|
|
} // end emitString
|
|
|
|
private void emitLineBreak()
|
|
{
|
|
emitString("\r\n",config.getRawOutputFilters(),false);
|
|
if (config.getWordWrapLength()>0)
|
|
columns = 0;
|
|
lines++;
|
|
|
|
} // end emitLineBreak
|
|
|
|
private void emitPossibleLineBreak()
|
|
{
|
|
if ((config.getWordWrapLength()>0) && (nobreak_count<=0) && (columns>=config.getWordWrapLength()))
|
|
emitLineBreak();
|
|
|
|
} // end emitPossibleLineBreak
|
|
|
|
private void ensureSpaceOnLine(int nchars)
|
|
{
|
|
if ((config.getWordWrapLength()>0) && (nobreak_count<=0))
|
|
{ // line break might be required here
|
|
int remain_space = (int)(config.getWordWrapLength() - columns);
|
|
if (remain_space<nchars)
|
|
emitLineBreak();
|
|
|
|
} // end if
|
|
|
|
} // end ensureSpaceOnLine
|
|
|
|
private void emitMarkupData(MarkupData md)
|
|
{
|
|
ensureSpaceOnLine(md.getText().length());
|
|
emitString(md.getBeginMarkup(),config.getRawOutputFilters(),false);
|
|
emitString(md.getText(),config.getOutputFilters(),true);
|
|
emitString(md.getEndMarkup(),config.getRawOutputFilters(),false);
|
|
|
|
} // end emitMarkupData
|
|
|
|
private void emitMarkupData(MarkupData md, char prefix, char suffix)
|
|
{
|
|
ensureSpaceOnLine(md.getText().length() + 2);
|
|
emitChar(prefix,config.getOutputFilters(),true);
|
|
emitString(md.getBeginMarkup(),config.getRawOutputFilters(),false);
|
|
emitString(md.getText(),config.getOutputFilters(),true);
|
|
emitString(md.getEndMarkup(),config.getRawOutputFilters(),false);
|
|
emitChar(suffix,config.getOutputFilters(),true);
|
|
|
|
} // end emitMarkupData
|
|
|
|
private void doFlushWhitespace()
|
|
{
|
|
while (temp_buffer.length()>0)
|
|
{ // calculate where the next line break is
|
|
int line_break = temp_buffer.toString().indexOf('\n');
|
|
int output_len = line_break;
|
|
if (output_len<0)
|
|
output_len = temp_buffer.length();
|
|
|
|
if ((config.getWordWrapLength()>0) && (nobreak_count<=0))
|
|
{ // adjust output if necessary for word wrapping
|
|
int remain_space = (int)(config.getWordWrapLength() - columns);
|
|
if (remain_space<output_len)
|
|
output_len = remain_space;
|
|
|
|
} // end if
|
|
|
|
if (output_len>0)
|
|
emitString(temp_buffer.substring(0,output_len),config.getOutputFilters(),true);
|
|
|
|
if (line_break>=0)
|
|
{ // there's a line break present - emit the line break
|
|
emitLineBreak(); // output the line break character
|
|
if (++line_break<temp_buffer.length())
|
|
temp_buffer.delete(0,line_break);
|
|
else
|
|
temp_buffer.setLength(0);
|
|
|
|
} // end if
|
|
else // no more line breaks on this line - clear out the buffer
|
|
temp_buffer.setLength(0);
|
|
|
|
} // end while (still data in temp buffer)
|
|
|
|
} // end doFlushWhitespace
|
|
|
|
private void emitFromStartOfTempBuffer(int nchars)
|
|
{
|
|
if (nchars<=0)
|
|
return;
|
|
|
|
if ((config.getWordWrapLength()>0) && (nobreak_count<=0))
|
|
{ // we can output the line break anywhere in the subsequence...
|
|
while (nchars>0)
|
|
{ // figure out how many characters we can output WITHOUT causing a line break
|
|
int curlen = nchars;
|
|
int remaining_space = (int)(config.getWordWrapLength() - columns);
|
|
if (curlen>remaining_space)
|
|
curlen = remaining_space;
|
|
|
|
// output those characters
|
|
emitString(temp_buffer.substring(0,curlen),config.getOutputFilters(),true);
|
|
temp_buffer.delete(0,curlen);
|
|
nchars -= curlen;
|
|
|
|
if (columns==config.getWordWrapLength())
|
|
emitLineBreak(); // and line break us to get to the next line
|
|
|
|
} // end while
|
|
|
|
} // end if
|
|
else
|
|
{ // just output the run of characters straight
|
|
emitString(temp_buffer.substring(0,nchars),config.getOutputFilters(),true);
|
|
temp_buffer.delete(0,nchars);
|
|
|
|
} // end else
|
|
|
|
} // end emitFromStartOfTempBuffer
|
|
|
|
private void doFlushString()
|
|
{
|
|
MarkupData md = attemptRewrite(string_rewriters,temp_buffer.toString());
|
|
if (md!=null)
|
|
{ // we've got something marked up! output it...
|
|
emitMarkupData(md);
|
|
temp_buffer.setLength(0);
|
|
return;
|
|
|
|
} // end if
|
|
|
|
while (temp_buffer.length()>0)
|
|
{ // find the length of the initial string of word or non-word characters
|
|
int sublen = getRunLength(temp_buffer);
|
|
|
|
if (isWordChar(temp_buffer.charAt(0)))
|
|
{ // we need to check the word...but first, we must eliminate leading hyphens
|
|
int hyph_count = 0;
|
|
while ((hyph_count<sublen) && (temp_buffer.charAt(hyph_count)=='-'))
|
|
hyph_count++;
|
|
emitFromStartOfTempBuffer(hyph_count);
|
|
sublen -= hyph_count;
|
|
|
|
// now determine how many hyphens there are at the end of the word...
|
|
int word_len = sublen;
|
|
hyph_count = 0;
|
|
while ((word_len>0) && (temp_buffer.charAt(word_len-1)=='-'))
|
|
{ // decrement word length, increment hyphen count
|
|
hyph_count++;
|
|
word_len--;
|
|
|
|
} // end while
|
|
|
|
if (word_len>0)
|
|
{ // extract the word from the start of the buffer
|
|
String word = temp_buffer.substring(0,word_len);
|
|
temp_buffer.delete(0,word_len);
|
|
|
|
// try to rewrite this word...
|
|
md = attemptRewrite(word_rewriters,word);
|
|
if (md!=null)
|
|
emitMarkupData(md);
|
|
else
|
|
{ // just output the word normally
|
|
ensureSpaceOnLine(word.length());
|
|
emitString(word,config.getOutputFilters(),true);
|
|
|
|
} // end else
|
|
|
|
} // end if
|
|
|
|
// now emit the rest of the hyphens
|
|
emitFromStartOfTempBuffer(hyph_count);
|
|
|
|
} // end if
|
|
else // just emit this many characters, line-breaking where required
|
|
emitFromStartOfTempBuffer(sublen);
|
|
|
|
} // end while
|
|
|
|
} // end doFlushString
|
|
|
|
private boolean handleAsHTML()
|
|
{
|
|
trigger_WBR = false; // initialize
|
|
|
|
// Figure out the place in the buffer where the command word starts.
|
|
int start_cmd = 0;
|
|
boolean closing_tag = false;
|
|
if ((start_cmd<temp_buffer.length()) && (temp_buffer.charAt(start_cmd)=='/'))
|
|
{ // this is a closing tag - move the command start pointer...
|
|
start_cmd++;
|
|
closing_tag = true;
|
|
|
|
} // end if
|
|
|
|
// Now figure out where it ends.
|
|
int end_cmd = start_cmd;
|
|
while ((end_cmd<temp_buffer.length()) && !(Character.isWhitespace(temp_buffer.charAt(end_cmd))))
|
|
end_cmd++;
|
|
|
|
if ((end_cmd==start_cmd) || ((end_cmd-start_cmd)>TagRepository.getMaxTagLength()))
|
|
return false; // the command word is empty or is just too long to be an HTML tag
|
|
|
|
// Look up the tag name to get a tag index from the repository.
|
|
int tag_index = TagRepository.tagNameToIndex(temp_buffer.substring(start_cmd,end_cmd));
|
|
if (tag_index<0)
|
|
return false; // not a known HTML tag
|
|
|
|
// Look up the tag object that corresponds to the tag index.
|
|
SimpleTag tagobj = TagRepository.tagIndexToObject(tag_index);
|
|
if (closing_tag && !(tagobj.allowClose()))
|
|
return false; // this is a closing tag, and this tag doesn't permit the "close" form
|
|
|
|
// Get the HTML tag set index for this tag, and see if we allow that set.
|
|
int tag_set_id = TagRepository.tagIndexToSet(tag_index);
|
|
if (!(config.isTagSetAllowed(tag_set_id)) && !(config.getDiscardHTMLTags()))
|
|
return false; // we're not allowing it, we're not discarding it, so punt!
|
|
|
|
boolean valid = false;
|
|
if (!(config.getDiscardHTMLTags()) && tagobj.balanceTags())
|
|
{ // this tag needs to be balanced - here is where we manipulate the stack
|
|
if (closing_tag)
|
|
{ // hunt through the list to find the most recently-opened tag of this type
|
|
int i = tag_stack.size() - 1;
|
|
while (i>=0)
|
|
{ // look through the stack...
|
|
Integer foo = (Integer)(tag_stack.get(i));
|
|
if (foo.intValue()==tag_index)
|
|
{ // found it - remove it from the tag stack
|
|
tag_stack.remove(i);
|
|
valid = true;
|
|
break;
|
|
|
|
} // end if
|
|
|
|
} // end while
|
|
|
|
} // end if
|
|
else
|
|
{ // push a new opening tag!
|
|
tag_stack.add(new Integer(tag_index));
|
|
valid = true;
|
|
|
|
} // end else
|
|
|
|
} // end if
|
|
// else tag doesn't need to be auto-balanced, or is being discarded
|
|
|
|
if (!valid && !(config.getDiscardHTMLTags()))
|
|
return false; // not validated by the stack code, and not being discarded
|
|
|
|
// Give the tag object one last chance to dictate what we do with the tag.
|
|
String real_tag_data = tagobj.rewriteTagContents(temp_buffer.toString(),closing_tag,this);
|
|
if ((real_tag_data==null) || config.getDiscardHTMLTags())
|
|
return true; // tag is being erased by rewriter, or is being discarded anyway
|
|
|
|
// Emit the tag to the output.
|
|
emitChar('<',config.getRawOutputFilters(),false);
|
|
emitString(real_tag_data,config.getRawOutputFilters(),false);
|
|
emitChar('>',config.getRawOutputFilters(),false);
|
|
|
|
// Determine whether this tag causes a "logical line break."
|
|
boolean logical_line_break = false;
|
|
if (trigger_WBR && !closing_tag && (nobreak_count>0))
|
|
logical_line_break = true;
|
|
else
|
|
logical_line_break = tagobj.causeLineBreak(closing_tag);
|
|
if (logical_line_break)
|
|
columns = 0;
|
|
|
|
return true; // handled!
|
|
|
|
} // end handleAsHTML()
|
|
|
|
private void finishTag()
|
|
{
|
|
if (handleAsHTML())
|
|
{ // the tag has been handled as an HTML tag - bail out immediately
|
|
temp_buffer.setLength(0);
|
|
state = ST_WHITESPACE;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
// now try to handle it using a tag rewriter
|
|
MarkupData md = attemptRewrite(tag_rewriters,temp_buffer.toString());
|
|
if (md!=null)
|
|
{ // we've got something marked up! output it...
|
|
emitMarkupData(md,'<','>');
|
|
temp_buffer.setLength(0);
|
|
state = ST_WHITESPACE;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
// This tag has been rejected! We need to process it normally, as character data.
|
|
String rejection = temp_buffer.toString();
|
|
temp_buffer.setLength(0);
|
|
temp_buffer.append('<');
|
|
state = ST_CHARS;
|
|
if (rejection.length()>0)
|
|
parse(rejection); // just run it through the parser, now that we've fixed up the state
|
|
parse(">");
|
|
|
|
} // end finishTag
|
|
|
|
private void finishParen()
|
|
{
|
|
// Try to handle the paren element using a paren rewriter.
|
|
MarkupData md = attemptRewrite(paren_rewriters,temp_buffer.toString());
|
|
if (md!=null)
|
|
{ // we've got something marked up! output it...
|
|
emitMarkupData(md,'(',')');
|
|
temp_buffer.setLength(0);
|
|
state = ST_WHITESPACE;
|
|
paren_level = 0;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
// This tag has been rejected! We need to process it normally, as character data.
|
|
String rejection = temp_buffer.toString();
|
|
temp_buffer.setLength(0);
|
|
temp_buffer.append('(');
|
|
state = ST_CHARS;
|
|
paren_level = 0;
|
|
if (rejection.length()>0)
|
|
parse(rejection); // just run it through the parser, now that we've fixed up the state
|
|
parse(")");
|
|
|
|
} // end finishParen
|
|
|
|
private void parse(String str)
|
|
{
|
|
int i = 0;
|
|
while (i<str.length())
|
|
{ // get the character from the string
|
|
char ch = str.charAt(i);
|
|
|
|
// now process differently based on the current parser state
|
|
switch (state)
|
|
{
|
|
case ST_WHITESPACE:
|
|
{ // Whitespace handling - look at the character
|
|
switch (ch)
|
|
{
|
|
case ' ': // append spaces, tabs, and newlines verbatim to the temp buffer
|
|
case '\t':
|
|
case '\n':
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
case '\r': // compress 1 or more \r's followe by optional \n to a single \n
|
|
if ( (i==(str.length()-1))
|
|
|| ((str.charAt(i+1)!='\r') && (str.charAt(i+1)!='\n')))
|
|
temp_buffer.append('\n');
|
|
i++;
|
|
break;
|
|
|
|
case '<':
|
|
doFlushWhitespace(); // flush the whitespace we've already got
|
|
if (config.getProcessAngles())
|
|
state = ST_LEFTANGLE;
|
|
else
|
|
{ // just process as an ordinary character
|
|
state = ST_CHARS;
|
|
temp_buffer.append(ch);
|
|
|
|
} // end else
|
|
i++;
|
|
break;
|
|
|
|
case '(':
|
|
doFlushWhitespace(); // flush the whitespace we've already got
|
|
if (config.getProcessParens())
|
|
state = ST_PAREN;
|
|
else
|
|
{ // just process as an ordinary character
|
|
state = ST_CHARS;
|
|
temp_buffer.append(ch);
|
|
|
|
} // end else
|
|
i++;
|
|
break;
|
|
|
|
case '\\': // backslash processing is complext - shift to ST_CHARS state to handle it
|
|
doFlushWhitespace();
|
|
state = ST_CHARS;
|
|
break;
|
|
|
|
default:
|
|
doFlushWhitespace(); // flush the whitespace we've already got
|
|
state = ST_CHARS;
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_CHARS:
|
|
{ // Character data handling - look at the character
|
|
switch (ch)
|
|
{
|
|
case ' ': // whitespace - drop back to whitespace mode
|
|
case '\t':
|
|
case '\n':
|
|
doFlushString();
|
|
state = ST_WHITESPACE;
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
case '\r': // handle \r processing in ST_WHITESPACE 'cause it's complicated
|
|
doFlushString();
|
|
state = ST_WHITESPACE;
|
|
break;
|
|
|
|
case '<': // left angle bracket - may be a start-of-tag
|
|
if (config.getProcessAngles())
|
|
{ // this is a tag start - go to LEFTANGLE state
|
|
doFlushString();
|
|
state = ST_LEFTANGLE;
|
|
|
|
} // end if
|
|
else // just handle it normally
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
case '(': // left parenthesis - may be a start-of-paren
|
|
if (config.getProcessParens())
|
|
{ // we're going into Parens mode...
|
|
doFlushString();
|
|
state = ST_PAREN;
|
|
|
|
} // end if
|
|
else // just handle it normally
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
case '\\':
|
|
if (i<(str.length()-1))
|
|
{ // look at the character following the backslash
|
|
ch = str.charAt(++i);
|
|
if (((ch=='(') && config.getProcessParens()) || ((ch=='<') && config.getProcessAngles()))
|
|
{ // append the escaped character, omitting the backslash
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
|
|
} // end if
|
|
else // append the backslash and hit the new character
|
|
temp_buffer.append('\\');
|
|
|
|
} // end if
|
|
else
|
|
{ // just append the backslash normally
|
|
temp_buffer.append('\\');
|
|
i++;
|
|
|
|
} // end else
|
|
break;
|
|
|
|
default: // just append the next non-white character
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_LEFTANGLE:
|
|
{ // Left Angle processing - this isn't very difficult
|
|
switch (ch)
|
|
{
|
|
case ' ':
|
|
case '\t':
|
|
case '\r':
|
|
case '\n': // output the < and click back to whitespace mode
|
|
emitChar('<',config.getOutputFilters(),true);
|
|
state = ST_WHITESPACE;
|
|
break;
|
|
|
|
case '<': // output the < and stay in this mode
|
|
emitChar('<',config.getOutputFilters(),true);
|
|
i++;
|
|
break;
|
|
|
|
default: // click over to TAG mode
|
|
state = ST_TAG;
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_TAG:
|
|
{ // inside a tag - process the character here
|
|
switch (ch)
|
|
{
|
|
case '>': // end tag
|
|
finishTag(); // this changes the state, and maybe calls parse() recursively
|
|
i++;
|
|
break;
|
|
|
|
case '\'': // go into "quote string" mode inside tag
|
|
case '\"':
|
|
temp_buffer.append(ch);
|
|
state = ST_TAGQUOTE;
|
|
quote_char = ch;
|
|
i++;
|
|
break;
|
|
|
|
default: // just append more data to the temp buffer
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_PAREN:
|
|
{ // inside parentheses - try to build something up
|
|
switch (ch)
|
|
{
|
|
case '(': // append the open parenthesis and kick it up a notch!
|
|
temp_buffer.append(ch);
|
|
paren_level++;
|
|
i++;
|
|
break;
|
|
|
|
case ')':
|
|
if (paren_level==0)
|
|
finishParen(); // will change the parser state
|
|
else
|
|
{ // append the close parenthesis and kick it DOWN a notch
|
|
temp_buffer.append(ch);
|
|
paren_level--;
|
|
|
|
} // end else
|
|
i++;
|
|
break;
|
|
|
|
default:
|
|
temp_buffer.append(ch);
|
|
i++;
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_TAGQUOTE:
|
|
temp_buffer.append(ch);
|
|
if (ch==quote_char) // close quote seen - go back to ST_TAG state
|
|
state = ST_TAG;
|
|
i++;
|
|
break;
|
|
|
|
default:
|
|
throw new IllegalStateException("invalid parser state value");
|
|
|
|
} // end switch
|
|
|
|
} // end while (looking through string)
|
|
|
|
} // end parse
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Implementations from interface HTMLChecker
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
public void append(String str) throws AlreadyFinishedException
|
|
{
|
|
if (finished)
|
|
throw new AlreadyFinishedException();
|
|
if (!started)
|
|
{ // initialize the parser state
|
|
initState();
|
|
started = true;
|
|
|
|
} // end if
|
|
|
|
parse(str); // parse things
|
|
|
|
} // end append
|
|
|
|
public void finish() throws AlreadyFinishedException
|
|
{
|
|
if (finished)
|
|
throw new AlreadyFinishedException();
|
|
if (!started)
|
|
{ // set up the initial parser state (so we don't kill ourselves later)
|
|
initState();
|
|
started = true;
|
|
|
|
} // end if
|
|
|
|
// This is the "end parse" loop, in which we resolve any funny state the parser has
|
|
// found itself in and clear out the internal buffers.
|
|
boolean running = false;
|
|
do
|
|
{ // what we do depends on the parser state...
|
|
switch (state)
|
|
{
|
|
case ST_WHITESPACE:
|
|
break; // discard any whitespace at the end of output
|
|
|
|
case ST_CHARS:
|
|
doFlushString(); // flush out the temporary buffer
|
|
break;
|
|
|
|
case ST_LEFTANGLE: // just emit a '<' character
|
|
emitPossibleLineBreak();
|
|
emitChar('<',config.getOutputFilters(),true);
|
|
break;
|
|
|
|
case ST_TAG:
|
|
case ST_TAGQUOTE:
|
|
{ // we won't finish this tag, so it's automagically rejected
|
|
String rejection = temp_buffer.toString();
|
|
temp_buffer.setLength(0);
|
|
temp_buffer.append('<');
|
|
state = ST_CHARS;
|
|
|
|
// now parse the tag contents again
|
|
if (rejection.length()>0)
|
|
parse(rejection);
|
|
|
|
running = true; // go back around for another try
|
|
|
|
} // end case
|
|
break;
|
|
|
|
case ST_PAREN:
|
|
{ // we won't finish this paren tag, so it's automagically rejected
|
|
String rejection = temp_buffer.toString();
|
|
temp_buffer.setLength(0);
|
|
temp_buffer.append('(');
|
|
state = ST_CHARS;
|
|
paren_level = 0;
|
|
|
|
// now parse the parenthesis contents again
|
|
if (rejection.length()>0)
|
|
parse(rejection);
|
|
|
|
running = true; // go back around for another try
|
|
|
|
} // end case
|
|
break;
|
|
|
|
} // end switch
|
|
|
|
} while (running); // end do
|
|
|
|
// Now close all the HTML tags that were left open.
|
|
for (int i=(tag_stack.size()-1); i>=0; i--)
|
|
{ // get each element in the tag stack and append the appropriate closing tag
|
|
Integer foo = (Integer)(tag_stack.get(i));
|
|
SimpleTag tagobj = TagRepository.tagIndexToObject(foo.intValue());
|
|
output_buffer.append(tagobj.makeClosingTag());
|
|
|
|
} // end for
|
|
|
|
// deallocate some excess crap and mark the object as finished
|
|
killState();
|
|
lines++; // there's one extra line at the end
|
|
finished = true;
|
|
|
|
} // end finish
|
|
|
|
public void reset()
|
|
{
|
|
started = false;
|
|
finished = false;
|
|
trigger_WBR = false;
|
|
state = ST_WHITESPACE;
|
|
quote_char = '\0';
|
|
columns = 0;
|
|
lines = 0;
|
|
paren_level = 0;
|
|
output_buffer = null;
|
|
killState();
|
|
|
|
// Also reset all the counters.
|
|
Iterator it = counters.values().iterator();
|
|
while (it.hasNext())
|
|
{ // reset each counter in turn
|
|
CountingRewriter cr = (CountingRewriter)(it.next());
|
|
cr.reset();
|
|
|
|
} // end while
|
|
|
|
} // end reset
|
|
|
|
public String getValue() throws NotYetFinishedException
|
|
{
|
|
if (!finished)
|
|
throw new NotYetFinishedException();
|
|
return output_buffer.toString();
|
|
|
|
} // end getValue
|
|
|
|
public int getLength() throws NotYetFinishedException
|
|
{
|
|
if (!finished)
|
|
throw new NotYetFinishedException();
|
|
return output_buffer.length();
|
|
|
|
} // end getLength
|
|
|
|
public int getLines() throws NotYetFinishedException
|
|
{
|
|
if (!finished)
|
|
throw new NotYetFinishedException();
|
|
return lines;
|
|
|
|
} // end getLines
|
|
|
|
public int getCounter(String name) throws NotYetFinishedException
|
|
{
|
|
if (!finished)
|
|
throw new NotYetFinishedException();
|
|
CountingRewriter cr = (CountingRewriter)(counters.get(name));
|
|
if (cr==null)
|
|
return 0;
|
|
else
|
|
return cr.getCount();
|
|
|
|
} // end getCounter
|
|
|
|
public Object getContextValue(String name)
|
|
{
|
|
return context_data.get(name);
|
|
|
|
} // end getContextValue
|
|
|
|
public void setContextValue(String name, Object val)
|
|
{
|
|
context_data.put(name,val);
|
|
|
|
} // end setContextValue
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Implementations from interface HTMLCheckerBackend
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
public String getCheckerAttrValue(String name)
|
|
{
|
|
if (name=="ANCHORTAIL")
|
|
return config.getAnchorTail();
|
|
|
|
throw new IllegalArgumentException("attribute \"" + name + "\" is not defined");
|
|
|
|
} // end getCheckerAttrValue
|
|
|
|
public void sendTagMessage(String msg)
|
|
{
|
|
if (msg=="NOBR")
|
|
{ // increment the no-break count
|
|
nobreak_count++;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
if (msg=="/NOBR")
|
|
{ // decrement the no-break count
|
|
nobreak_count--;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
if (msg=="WBR")
|
|
{ // trigger a word break
|
|
trigger_WBR = true;
|
|
return;
|
|
|
|
} // end if
|
|
|
|
throw new IllegalArgumentException("message \"" + msg + "\" is not defined");
|
|
|
|
} // end sendTagMessage
|
|
|
|
public Object getCheckerContextValue(String name)
|
|
{
|
|
return context_data.get(name);
|
|
|
|
} // end getCheckerContextValue
|
|
|
|
/*--------------------------------------------------------------------------------
|
|
* Implementations from interface RewriterServices
|
|
*--------------------------------------------------------------------------------
|
|
*/
|
|
|
|
public String getRewriterAttrValue(String name)
|
|
{
|
|
return getCheckerAttrValue(name);
|
|
|
|
} // end getRewriterAttrValue
|
|
|
|
public Object getRewriterContextValue(String name)
|
|
{
|
|
return context_data.get(name);
|
|
|
|
} // end getRewriterContextValue
|
|
|
|
} // end class HTMLCheckerImpl
|