From 4c5c7ffe85d717ca4ebfe4c0acde56aca0501b1f Mon Sep 17 00:00:00 2001 From: "Eric J. Bowersox" Date: Thu, 30 Dec 2004 08:08:13 +0000 Subject: [PATCH] added the ability for the HTML checker to keep track of internal and external references in any post, so we can do trackbacks --- .../venice/db/PostLinkRewriter.java | 18 +- .../venice/htmlcheck/HTMLChecker.java | 6 + .../venice/htmlcheck/RewriterServices.java | 14 +- .../htmlcheck/filters/EmailRewriter.java | 51 ++++- .../venice/htmlcheck/filters/URLRewriter.java | 151 ++++++++++----- .../htmlcheck/impl/HTMLCheckerBackend.java | 16 +- .../htmlcheck/impl/HTMLCheckerImpl.java | 57 +++++- .../venice/htmlcheck/impl/TagA.java | 179 +++++++++++------- 8 files changed, 341 insertions(+), 151 deletions(-) diff --git a/src/com/silverwrist/venice/db/PostLinkRewriter.java b/src/com/silverwrist/venice/db/PostLinkRewriter.java index b94b9ac..f60d034 100644 --- a/src/com/silverwrist/venice/db/PostLinkRewriter.java +++ b/src/com/silverwrist/venice/db/PostLinkRewriter.java @@ -9,9 +9,9 @@ * * The Original Code is the Venice Web Communities System. * - * The Initial Developer of the Original Code is Eric J. Bowersox , + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001-02 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ @@ -40,7 +40,7 @@ public class PostLinkRewriter implements Rewriter *-------------------------------------------------------------------------------- */ - private GlobalSite globalsite; // global site containing utilities + private final GlobalSite globalsite; // global site containing utilities /*-------------------------------------------------------------------------------- * Constructor @@ -60,7 +60,7 @@ public class PostLinkRewriter implements Rewriter private static final String buildPostLink(PostLinkDecoder pl, PostLinkDecoderContext ctxt) { - StringBuffer b = new StringBuffer(URI_PREFIX); + StringBuffer b = new StringBuffer(); boolean started = false; if (pl.getCommunity()==null) b.append(ctxt.getCommunityName()); @@ -180,14 +180,18 @@ public class PostLinkRewriter implements Rewriter } // end catch + // build the post link and add it as an internal reference + String link = buildPostLink(pl,ctxt); + svc.addInternalReference(link); + // build the necessary markup and return it - StringBuffer open_a = new StringBuffer("'); - return new MarkupData(open_a.toString(),data,""); + return new MarkupData(open_a.toString(),data,""); } // end rewrite diff --git a/src/com/silverwrist/venice/htmlcheck/HTMLChecker.java b/src/com/silverwrist/venice/htmlcheck/HTMLChecker.java index 448b0ce..f24e783 100644 --- a/src/com/silverwrist/venice/htmlcheck/HTMLChecker.java +++ b/src/com/silverwrist/venice/htmlcheck/HTMLChecker.java @@ -17,6 +17,8 @@ */ package com.silverwrist.venice.htmlcheck; +import java.util.Set; + public interface HTMLChecker { public void append(String str) throws AlreadyFinishedException; @@ -37,4 +39,8 @@ public interface HTMLChecker public void setContextValue(String name, Object val); + public Set getExternalReferences() throws NotYetFinishedException; + + public Set getInternalReferences() throws NotYetFinishedException; + } // end interface HTMLChecker diff --git a/src/com/silverwrist/venice/htmlcheck/RewriterServices.java b/src/com/silverwrist/venice/htmlcheck/RewriterServices.java index b9e1152..7c98bb4 100644 --- a/src/com/silverwrist/venice/htmlcheck/RewriterServices.java +++ b/src/com/silverwrist/venice/htmlcheck/RewriterServices.java @@ -9,18 +9,24 @@ * * The Original Code is the Venice Web Community System. * - * The Initial Developer of the Original Code is Eric J. Bowersox , + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck; +import java.net.URL; + public interface RewriterServices { - public abstract String getRewriterAttrValue(String name); + public String getRewriterAttrValue(String name); - public abstract Object getRewriterContextValue(String name); + public Object getRewriterContextValue(String name); + + public void addExternalReference(URL ref); + + public void addInternalReference(String ref); } // end interface RewriterServices diff --git a/src/com/silverwrist/venice/htmlcheck/filters/EmailRewriter.java b/src/com/silverwrist/venice/htmlcheck/filters/EmailRewriter.java index dc788d9..0b4fe95 100644 --- a/src/com/silverwrist/venice/htmlcheck/filters/EmailRewriter.java +++ b/src/com/silverwrist/venice/htmlcheck/filters/EmailRewriter.java @@ -9,21 +9,33 @@ * * The Original Code is the Venice Web Community System. * - * The Initial Developer of the Original Code is Eric J. Bowersox , + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck.filters; import java.util.*; +import org.apache.log4j.Logger; +import org.apache.regexp.*; +import com.silverwrist.util.*; import com.silverwrist.venice.htmlcheck.Rewriter; import com.silverwrist.venice.htmlcheck.RewriterServices; import com.silverwrist.venice.htmlcheck.MarkupData; public class EmailRewriter implements Rewriter { + /*-------------------------------------------------------------------------------- + * Static data members + *-------------------------------------------------------------------------------- + */ + + private static Logger logger = Logger.getLogger(EmailRewriter.class); + + private static REProgram s_match = null; + /*-------------------------------------------------------------------------------- * Constructor *-------------------------------------------------------------------------------- @@ -46,23 +58,42 @@ public class EmailRewriter implements Rewriter public MarkupData rewrite(String data, RewriterServices svc) { - int at_pos = data.indexOf('@'); - if ((at_pos<=0) || (at_pos==(data.length()-1))) - return null; - - // TODO: put in more validation checking + RE m = new RE(s_match,RE.MATCH_CASEINDEPENDENT); + if (!(m.match(data))) + return null; // not a valid E-mail address // build the tag (the gnarliest part) - StringBuffer open_a = new StringBuffer("0)) + if (!(StringUtil.isStringEmpty(catenate))) open_a.append(' ').append(catenate); open_a.append('>'); // return the markup data back to the checker - return new MarkupData(open_a.toString(),data,""); + return new MarkupData(open_a.toString(),data,""); } // end rewrite + /*-------------------------------------------------------------------------------- + * Static initializer + *-------------------------------------------------------------------------------- + */ + + static + { + try + { // compile our regular expression + RECompiler compiler = new RECompiler(); + s_match = compiler.compile("^[A-Za-z0-9!#$%*+-/=?^_`{|}~.]+@[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+$"); + + } // end try + catch (RESyntaxException e) + { // shouldn't happen + logger.fatal("caught RESyntaxException in EmailRewriter initializer",e); + + } // end catch + + } // end static initializer + } // end class EmailRewriter diff --git a/src/com/silverwrist/venice/htmlcheck/filters/URLRewriter.java b/src/com/silverwrist/venice/htmlcheck/filters/URLRewriter.java index ea18a00..f65fc86 100644 --- a/src/com/silverwrist/venice/htmlcheck/filters/URLRewriter.java +++ b/src/com/silverwrist/venice/htmlcheck/filters/URLRewriter.java @@ -9,29 +9,87 @@ * * The Original Code is the Venice Web Community System. * - * The Initial Developer of the Original Code is Eric J. Bowersox , + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck.filters; +import java.net.*; import java.util.*; +import org.apache.log4j.Logger; +import org.apache.regexp.*; +import com.silverwrist.util.*; import com.silverwrist.venice.htmlcheck.Rewriter; import com.silverwrist.venice.htmlcheck.RewriterServices; import com.silverwrist.venice.htmlcheck.MarkupData; public class URLRewriter implements Rewriter { + /*-------------------------------------------------------------------------------- + * Internal class containing URL elements. + *-------------------------------------------------------------------------------- + */ + + private static class URLElement + { + private REProgram m_match; + private String m_prefix; + + URLElement(String pattern, String prefix) + { + try + { // fill the classes + m_match = COMPILER.compile(pattern); + m_prefix = prefix; + + } // end try + catch (RESyntaxException e) + { // shouldn't happen + logger.fatal("got RESyntaxException in URLElement",e); + + } // end catch + + } // end constructor + + String eval(String input) + { + RE m = new RE(m_match,RE.MATCH_CASEINDEPENDENT); + if (m.match(input)) + return m_prefix + input; + else + return null; + + } // end eval + + } // end class URLElement + /*-------------------------------------------------------------------------------- * Static data members *-------------------------------------------------------------------------------- */ - private static final String NULLSTRING = ""; - private static Hashtable prefix_list = null; - private static boolean set_up = true; + private static Logger logger = Logger.getLogger(URLRewriter.class); + + private static final RECompiler COMPILER = new RECompiler(); + private static final String[] SETUP_DATA = + { + "^http://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^ftp://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^gopher://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^mailto:[A-Za-z0-9!#$%*+-/=?^_`{|}~.]+@[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+$", "", + "^news:[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+$", "", + "^nntp://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^telnet://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^tn3270://[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)+", "", + "^www\\.[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)*", "http://", + "^ftp\\.[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)*", "ftp://", + "^gopher\\.[A-Za-z0-9_-]+(?:\\.[A-Za-z0-9_-]+)*", "gopher://" + }; + + private static final List KNOWN_ELEMENTS; /*-------------------------------------------------------------------------------- * Constructor @@ -39,40 +97,9 @@ public class URLRewriter implements Rewriter */ public URLRewriter() - { - setUpPrefixes(); // make sure the prefix data is set up - + { // do nothing } // end constructor - /*-------------------------------------------------------------------------------- - * Internal functions - *-------------------------------------------------------------------------------- - */ - - private static void setUpPrefixes() - { - if (set_up) - { // allocate the hash table - set_up = false; - prefix_list = new Hashtable(10,0.9F); - - // fill it with the proper URL prefixes - prefix_list.put("http:",NULLSTRING); - prefix_list.put("ftp:",NULLSTRING); - prefix_list.put("gopher:",NULLSTRING); - prefix_list.put("mailto:",NULLSTRING); - prefix_list.put("news:",NULLSTRING); - prefix_list.put("nntp:",NULLSTRING); - prefix_list.put("telnet:",NULLSTRING); - prefix_list.put("tn3270:",NULLSTRING); - prefix_list.put("www.",new String("http://")); - prefix_list.put("ftp.",new String("ftp://")); - prefix_list.put("gopher.",new String("gopher://")); - - } // end if - - } // end setUpPrefixes - /*-------------------------------------------------------------------------------- * Implementations from interface Rewriter *-------------------------------------------------------------------------------- @@ -86,29 +113,53 @@ public class URLRewriter implements Rewriter public MarkupData rewrite(String data, RewriterServices svc) { - Enumeration prefixes = prefix_list.keys(); - while (prefixes.hasMoreElements()) - { // get the next prefix and compare against the beginning of the string - String pfx = (String)(prefixes.nextElement()); - if (data.regionMatches(true,0,pfx,0,pfx.length())) - { // good enough! build the open tag (the gnarliest part of the markup) - StringBuffer open_a = new StringBuffer("0)) + for (Iterator it=KNOWN_ELEMENTS.iterator(); it.hasNext(); ) + { // test each element in turn + URLElement ue = (URLElement)(it.next()); + String s = ue.eval(data); + if (s!=null) + { // got a match! record the external reference and build the open tag + try + { // create URL and add it + if (s.toLowerCase().startsWith("http:")) + svc.addExternalReference(new URL(s)); + + } // end try + catch (MalformedURLException e) + { // forget it + } // end catch + + StringBuffer open_a = new StringBuffer("'); // here's how you mark it up! - return new MarkupData(open_a.toString(),data,""); + return new MarkupData(open_a.toString(),data,""); } // end if - } // end while + } // end for return null; // sorry, no can do } // end rewrite + /*-------------------------------------------------------------------------------- + * Static initializer + *-------------------------------------------------------------------------------- + */ + + static + { + ArrayList tmp = new ArrayList(); + for (int i=0; i, + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck.impl; +import java.net.URL; + public interface HTMLCheckerBackend { - public abstract String getCheckerAttrValue(String name); + public String getCheckerAttrValue(String name); - public abstract void sendTagMessage(String msg); + public void sendTagMessage(String msg); - public abstract Object getCheckerContextValue(String name); + public Object getCheckerContextValue(String name); + + public void addExternalReference(URL ref); + + public void addInternalReference(String ref); } // end interface HTMLCheckerBackend diff --git a/src/com/silverwrist/venice/htmlcheck/impl/HTMLCheckerImpl.java b/src/com/silverwrist/venice/htmlcheck/impl/HTMLCheckerImpl.java index a2687a9..c8adee4 100644 --- a/src/com/silverwrist/venice/htmlcheck/impl/HTMLCheckerImpl.java +++ b/src/com/silverwrist/venice/htmlcheck/impl/HTMLCheckerImpl.java @@ -17,6 +17,7 @@ */ package com.silverwrist.venice.htmlcheck.impl; +import java.net.URL; import java.util.*; import org.apache.log4j.*; import com.silverwrist.venice.htmlcheck.*; @@ -139,6 +140,8 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic private ArrayList m_tag_rewriters = new ArrayList(); // tag rewriter instances private ArrayList m_paren_rewriters = new ArrayList(); // paren rewriter instances private HashMap m_context_data = new HashMap(); // context variables + private HashSet m_external_references = new HashSet(); // list of external references + private HashSet m_internal_references = new HashSet(); // list of internal references /*-------------------------------------------------------------------------------- * Constructor @@ -161,7 +164,7 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic */ /** - * Returns true if this character belongs as part of a word, false if not. + * Returns true if this character belongs as part of a word, false if not. * * @param ch Character to be tested. * @return See above. @@ -210,8 +213,8 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end getRunLength /** - * Copies the Rewriter objects from an outside list to an internal list, wrapping - * named rewriters in CountingRewriter objects as appropriate. + * Copies the Rewriter objects from an outside list to an internal list, wrapping + * named rewriters in CountingRewriter objects as appropriate. * * @param dest Destination to copy rewriters to. * @param source List to copy rewriters from. @@ -290,8 +293,8 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic * * @param ch Character to output. * @param filters List of filters to use to attempt to process the character. - * @param count_cols true if the character output adds to the column counter, - * false if not. + * @param count_cols true if the character output adds to the column counter, + * false if not. */ private final void emitChar(char ch, List filters, boolean count_cols) { @@ -841,7 +844,7 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end handleAsHTML /** - * Returns true if the temporary buffer contains the start of an HTML comment. (The + * Returns true if the temporary buffer contains the start of an HTML comment. (The * leading and trailing angle brackets are assumed.) * * @return See above. @@ -853,7 +856,7 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end containsHTMLComment /** - * Returns true if the temporary buffer contains a complete HTML comment. (The leading + * Returns true if the temporary buffer contains a complete HTML comment. (The leading * and trailing angle brackets are assumed.) * * @return See above. @@ -869,7 +872,7 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end containsCompleteHTMLComment /** - * Returns true if the temporary buffer contains an XML construct, i.e. a tag that + * Returns true if the temporary buffer contains an XML construct, i.e. a tag that * contains a ':', and may or may not have a leading '/'. (The leading and trailing angle brackets * are assumed.) * @@ -1381,6 +1384,8 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic m_lines = 0; m_paren_level = 0; m_output_buffer = null; + m_external_references.clear(); + m_internal_references.clear(); killState(); // Also reset all the counters. @@ -1441,6 +1446,28 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end setContextValue + public Set getExternalReferences() throws NotYetFinishedException + { + if (!m_finished) + throw new NotYetFinishedException(); + if (m_external_references.isEmpty()) + return Collections.EMPTY_SET; + HashSet rc = new HashSet(m_external_references); + return Collections.unmodifiableSet(rc); + + } // end getExternalReferences + + public Set getInternalReferences() throws NotYetFinishedException + { + if (!m_finished) + throw new NotYetFinishedException(); + if (m_internal_references.isEmpty()) + return Collections.EMPTY_SET; + HashSet rc = new HashSet(m_internal_references); + return Collections.unmodifiableSet(rc); + + } // end getInternalReferences + /*-------------------------------------------------------------------------------- * Implementations from interface HTMLCheckerBackend *-------------------------------------------------------------------------------- @@ -1488,6 +1515,18 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end getCheckerContextValue + public void addExternalReference(URL ref) + { + m_external_references.add(ref); + + } // end addExternalReference + + public void addInternalReference(String ref) + { + m_internal_references.add(ref); + + } // end addInternalReference + /*-------------------------------------------------------------------------------- * Implementations from interface RewriterServices *-------------------------------------------------------------------------------- @@ -1505,4 +1544,6 @@ class HTMLCheckerImpl implements HTMLChecker, HTMLCheckerBackend, RewriterServic } // end getRewriterContextValue + // addExternalReference is implemented as part of HTMLCheckerBackend + } // end class HTMLCheckerImpl diff --git a/src/com/silverwrist/venice/htmlcheck/impl/TagA.java b/src/com/silverwrist/venice/htmlcheck/impl/TagA.java index f6391fb..03343ba 100644 --- a/src/com/silverwrist/venice/htmlcheck/impl/TagA.java +++ b/src/com/silverwrist/venice/htmlcheck/impl/TagA.java @@ -9,14 +9,19 @@ * * The Original Code is the Venice Web Community System. * - * The Initial Developer of the Original Code is Eric J. Bowersox , + * The Initial Developer of the Original Code is Eric J. Bowersox , * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are - * Copyright (C) 2001 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * Copyright (C) 2001-2004 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. * * Contributor(s): */ package com.silverwrist.venice.htmlcheck.impl; +import java.net.*; +import org.apache.log4j.Logger; +import org.apache.regexp.*; +import com.silverwrist.util.*; + class TagA extends BalancedTag { /*-------------------------------------------------------------------------------- @@ -24,21 +29,94 @@ class TagA extends BalancedTag *-------------------------------------------------------------------------------- */ - private static final String TARGET_ATTR = "TARGET"; + /** The instance of {@link org.apache.log4j.Logger Logger} for use by this class. */ + private static Logger logger = Logger.getLogger(TagA.class); + + /*-------------------------------------------------------------------------------- + * Attributes + *-------------------------------------------------------------------------------- + */ + + /** Regular expression program to look for "HREF=" attribute. */ + private REProgram m_href = null; + + /** Regular expression program to look for "TARGET=" attribute. */ + private REProgram m_target = null; /*-------------------------------------------------------------------------------- * Constructor *-------------------------------------------------------------------------------- */ + /** + * Creates a new instance of TagA. + */ TagA() { super("A",false); + try + { // compile regular expressions + RECompiler compiler = new RECompiler(); + m_href = compiler.compile("href\\s*="); + m_target = compiler.compile("target\\s*="); + + } // end try + catch (RESyntaxException e) + { // shouldn't happen + logger.fatal("got RESyntaxException in TagA",e); + + } // end catch } // end constructor /*-------------------------------------------------------------------------------- - * External operations + * Internal operations + *-------------------------------------------------------------------------------- + */ + + /** + * Extracts an attribute value from the start of the string. The attribute value may be enclosed + * in quotes, or may simply be a series of nonblank characters delimited by blanks. + * + * @param s The string to extract the attribute value from. + * @return The attribute value extracted. + */ + private static final String extractAttribute(String s) + { + char[] a = s.toCharArray(); + int i = 0; + while ((i