From 7da81049547a205274260297f3371e13ab600675 Mon Sep 17 00:00:00 2001 From: "Eric J. Bowersox" Date: Sat, 14 Jun 2003 06:59:11 +0000 Subject: [PATCH] added a simple query language (with a JavaCC-compiled parser) and the appropriate query methods in IndexService --- INSTALL | 2 + build.properties.sample | 5 + build.xml | 22 +- .../dynamo/iface/IndexService.java | 8 + .../dynamo/index/IndexMessages.properties | 2 + .../dynamo/index/IndexServiceImpl.java | 291 +++++++++++++++ .../dynamo/index/ItemAndScore.java | 59 +++ .../com/silverwrist/dynamo/index/Parser.jj | 350 ++++++++++++++++++ .../dynamo/index/StaticCharStream.java | 189 ++++++++++ 9 files changed, 926 insertions(+), 2 deletions(-) create mode 100644 src/dynamo-framework/com/silverwrist/dynamo/index/ItemAndScore.java create mode 100644 src/dynamo-framework/com/silverwrist/dynamo/index/Parser.jj create mode 100644 src/dynamo-framework/com/silverwrist/dynamo/index/StaticCharStream.java diff --git a/INSTALL b/INSTALL index 823a123..4f3fe3c 100644 --- a/INSTALL +++ b/INSTALL @@ -24,12 +24,14 @@ Venice. 
PACKAGES REQUIRED FOR VENICE ---------------------------- The following packages must be referenced from within build.properties: +- JavaCC 3.0 (parser generator tool) - Java Servlet API 2.3 (use the servlet.jar file from Tomcat) - Jakarta Bean Scripting Framework 2.3 - Jakarta Commons Collections Library, 2.1 - Jakarta Commons Lang Library, 1.0.1 - Mozilla.org Rhino, 1.5R3 - Jakarta Log4J, 1.2.7 +- Jakarta Lucene, 1.3RC1 - Jakarta Velocity, 1.3.1 Optionally: diff --git a/build.properties.sample b/build.properties.sample index 0c87661..628c9e6 100644 --- a/build.properties.sample +++ b/build.properties.sample @@ -24,6 +24,11 @@ # [Logging directory] # logfile.dir=${user.home} +# [Location of JavaCC 3.0] +javacc.base=/usr/local/java/javacc-3.0 +# javacc.lib=${javacc.base}/bin/lib +# javacc.jarfile=javacc.jar + # [Location of Servlet API 2.3] servlet.base=/usr/local/jakarta/jakarta-tomcat-4.1.24-LE-jdk14 servlet.lib=${servlet.base}/common/lib diff --git a/build.xml b/build.xml index 623f277..e2e0d91 100644 --- a/build.xml +++ b/build.xml @@ -37,6 +37,11 @@ + + + + + @@ -140,9 +145,22 @@ "build-dynamo" - Builds the Dynamo framework classes (com.silverwrist.dynamo.*). 
============================================================================ --> + + + + + + + + + - + diff --git a/src/dynamo-framework/com/silverwrist/dynamo/iface/IndexService.java b/src/dynamo-framework/com/silverwrist/dynamo/iface/IndexService.java index 904d259..9abbf0f 100644 --- a/src/dynamo-framework/com/silverwrist/dynamo/iface/IndexService.java +++ b/src/dynamo-framework/com/silverwrist/dynamo/iface/IndexService.java @@ -17,6 +17,8 @@ */ package com.silverwrist.dynamo.iface; +import java.util.Date; +import java.util.List; import com.silverwrist.dynamo.except.IndexException; public interface IndexService @@ -26,4 +28,10 @@ public interface IndexService public boolean deleteItem(String item_namespace, String item_name, Object item) throws IndexException; + public List query(String query_string, java.util.Date date_low, java.util.Date date_high, DynamoUser match_owner, + String match_scope, int offset, int count) throws IndexException; + + public int queryCount(String query_string, java.util.Date date_low, java.util.Date date_high, DynamoUser match_owner, + String match_scope) throws IndexException; + } // end interface IndexService diff --git a/src/dynamo-framework/com/silverwrist/dynamo/index/IndexMessages.properties b/src/dynamo-framework/com/silverwrist/dynamo/index/IndexMessages.properties index a2d1239..13016c2 100644 --- a/src/dynamo-framework/com/silverwrist/dynamo/index/IndexMessages.properties +++ b/src/dynamo-framework/com/silverwrist/dynamo/index/IndexMessages.properties @@ -22,3 +22,5 @@ analyzer.noCreate=Unable to create an instance of the analyzer class {0}. analyzer.badType=The specified analyzer class {0} is of the wrong type. addItem.fail=Unable to add a new item (namespace {0}, name {1}) to index {2}. deleteItem.fail=Unable to remove an item (namespace {0}, name {1}) from index {2}. +query.syntax=Parse error in query string: {0} +query.fail=Unable to execute search query. 
diff --git a/src/dynamo-framework/com/silverwrist/dynamo/index/IndexServiceImpl.java b/src/dynamo-framework/com/silverwrist/dynamo/index/IndexServiceImpl.java index 79e86e2..fbea77a 100644 --- a/src/dynamo-framework/com/silverwrist/dynamo/index/IndexServiceImpl.java +++ b/src/dynamo-framework/com/silverwrist/dynamo/index/IndexServiceImpl.java @@ -19,16 +19,156 @@ package com.silverwrist.dynamo.index; import java.io.*; import java.lang.ref.*; +import java.util.*; +import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.*; import org.apache.lucene.index.*; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; +import com.silverwrist.util.*; import com.silverwrist.dynamo.except.*; import com.silverwrist.dynamo.iface.*; import com.silverwrist.dynamo.util.*; class IndexServiceImpl implements IndexService { + /*-------------------------------------------------------------------------------- + * Internal counting HitCollector + *-------------------------------------------------------------------------------- + */ + + private static class CountingCollector extends HitCollector + { + /*==================================================================== + * Attributes + *==================================================================== + */ + + private int m_count = 0; + + /*==================================================================== + * Constructor + *==================================================================== + */ + + CountingCollector() + { // do nothing + } // end constructor + + /*==================================================================== + * Abstract implementations from class HitCollector + *==================================================================== + */ + + public void collect(int doc, float score) + { + m_count++; + + } // end collect + + /*==================================================================== + * External operations + 
*==================================================================== + */ + + int getCount() + { + return m_count; + + } // end getCount + + } // end class CountingCollector + + /*-------------------------------------------------------------------------------- + * Internal HitCollector that gathers a request subset + *-------------------------------------------------------------------------------- + */ + + private class SubsetCollector extends HitCollector + { + /*==================================================================== + * Attributes + *==================================================================== + */ + + private int[] m_docs; + private float[] m_scores; + private int m_offset; + private int m_size = 0; + + /*==================================================================== + * Constructor + *==================================================================== + */ + + SubsetCollector(int offset, int count) + { + m_docs = new int[count]; + m_scores = new float[count]; + m_offset = offset; + + } // end constructor + + /*==================================================================== + * Abstract implementations from class HitCollector + *==================================================================== + */ + + public void collect(int doc, float score) + { + if (m_offset>0) + { // skip documents at beginning of list + m_offset--; + return; + + } // end if + + if (m_size=0) + queries.add(new WildcardQuery(new Term("scope",match_scope))); + else if (match_scope.indexOf('*')>=0) + { // append another query + String s = match_scope.substring(0,match_scope.length()-1); + if (s.indexOf('*')<0) + queries.add(new PrefixQuery(new Term("scope",s))); + else + queries.add(new WildcardQuery(new Term("scope",match_scope))); + + } // end else if + else // match the scope directly + queries.add(new TermQuery(new Term("scope",match_scope))); + + } // end if + + // Boil down all the queries for me. 
+ if (queries.size()==0) + return null; + if (queries.size()==1) + return (Query)(queries.get(0)); + BooleanQuery rc = new BooleanQuery(); + for (int i=0; i. + * + * Software distributed under the License is distributed on an "AS IS" basis, WITHOUT + * WARRANTY OF ANY KIND, either express or implied. See the License for the specific + * language governing rights and limitations under the License. + * + * The Original Code is the Venice Web Communities System. + * + * The Initial Developer of the Original Code is Eric J. Bowersox , + * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are + * Copyright (C) 2003 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * + * Contributor(s): + */ +package com.silverwrist.dynamo.index; + +public final class ItemAndScore +{ + /*-------------------------------------------------------------------------------- + * Attributes + *-------------------------------------------------------------------------------- + */ + + private Object m_item; + private float m_score; + + /*-------------------------------------------------------------------------------- + * Constructor + *-------------------------------------------------------------------------------- + */ + + public ItemAndScore(Object item, float score) + { + m_item = item; + m_score = score; + + } // end constructor + + /*-------------------------------------------------------------------------------- + * External getters + *-------------------------------------------------------------------------------- + */ + + public Object getItem() + { + return m_item; + + } // end getItem + + public float getScore() + { + return m_score; + + } // end getScore + +} // end class ItemAndScore diff --git a/src/dynamo-framework/com/silverwrist/dynamo/index/Parser.jj b/src/dynamo-framework/com/silverwrist/dynamo/index/Parser.jj new file mode 100644 index 0000000..fafe835 --- /dev/null +++ b/src/dynamo-framework/com/silverwrist/dynamo/index/Parser.jj @@ -0,0 
+1,350 @@ +/* + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at . + * + * Software distributed under the License is distributed on an "AS IS" basis, WITHOUT + * WARRANTY OF ANY KIND, either express or implied. See the License for the specific + * language governing rights and limitations under the License. + * + * The Original Code is the Venice Web Communities System. + * + * The Initial Developer of the Original Code is Eric J. Bowersox , + * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are + * Copyright (C) 2003 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. + * + * Contributor(s): + */ + +options +{ + STATIC = false; + JAVA_UNICODE_ESCAPE = true; + USER_CHAR_STREAM = true; +} + +PARSER_BEGIN(Parser) + +package com.silverwrist.dynamo.index; + +import java.io.*; +import java.util.*; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.*; +import org.apache.lucene.search.*; + +/** + * N.B.: A lot of this is based on the Lucene QueryParser code, but streamlined to fit the needs of the Dynamo + * indexing system. 
+ */ +class Parser +{ + /*-------------------------------------------------------------------------------- + * Attributes + *-------------------------------------------------------------------------------- + */ + + private Analyzer m_analyzer; + + /*-------------------------------------------------------------------------------- + * Internal operations + *-------------------------------------------------------------------------------- + */ + + private static final Query createWildcardQuery(String data) + { + Term t = new Term("text",data.toLowerCase()); + return new WildcardQuery(t); + + } // end createWildcardQuery + + private static final Query createPrefixQuery(String data) + { + Term t = new Term("text",data.toLowerCase()); + return new PrefixQuery(t); + + } // end createPrefixQuery + + private static final Query createFuzzyQuery(String data) + { + Term t = new Term("text",data); + return new FuzzyQuery(t); + + } // end createFuzzyQuery + + private final Query createNormalQuery(String data) + { + TokenStream tstm = m_analyzer.tokenStream("text",new StringReader(data)); + ArrayList toks = new ArrayList(); + org.apache.lucene.analysis.Token t = null; + + for (;;) + { // use the Lucene TokenStream to find all the tokens and eliminate stopwords + try + { // get the next token from the input + t = tstm.next(); + + } // end try + catch (IOException e) + { // whoops! + t = null; + + } // end catch + + if (t==null) + break; // done scanning the string + + toks.add(t.termText()); + + } // end for (ever) + + if (toks.size()==0) + return null; // no query + if (toks.size()==1) // single term query + return new TermQuery(new Term("text",(String)(toks.get(0)))); + + // Build a PhraseQuery and return that. 
+ PhraseQuery rc = new PhraseQuery(); + rc.setSlop(0); + for (int i=0; i TOKEN: +{ + <#_DIGIT: ["0"-"9"]> + | <#_WHITESPACE: [" ", "\t"]> + | <#_RESERVED: ["+", "-", "(", ")", "~", "^", "\"", "*", "?"]> + | <#_ESCAPED: "\\" ( <_RESERVED> | "\\" )> + | <#_VALID: ~["+", "-", "(", ")", "~", "^", "\"", "*", "?", " ", "\t"]> + | <#_TERMCHAR: <_VALID> | <_ESCAPED>> + +} // end token definitions + + SKIP: +{ + <<_WHITESPACE>> + +} // end skip definition + + TOKEN: +{ + // AND query + | // OR query + | // plus sign + | // minus sign + | // left parenthesis + | // right parenthesis + | // "fuzzy" operator + | : Weight // "weighting" operator + | // quoted string + | (<_TERMCHAR>)*> // simple term + | (<_TERMCHAR>)* "*"> // prefix term + | (<_TERMCHAR> | "?" | "*")*> // term with wildcards + +} // end default token definitions + + TOKEN: +{ + )+ ( "." (<_DIGIT>)+ )?> : DEFAULT +} + +/*-------------------------------------------------------------------------------- + * BNF (parser) definitions + *-------------------------------------------------------------------------------- + */ + +Query search(): +{ + Query rc = null; + +} // end search declarations +{ + ( + + | rc=search_expression() + ) + { return rc; } + +} // end search + +Query search_expression(): +{ + ArrayList clauses = new ArrayList(); + BooleanClause x = null; + +} // end search_expression declarations +{ + x=or_expression() { if (x!=null) clauses.add(x); } + ( ()? 
x=or_expression() { if (x!=null) clauses.add(x); } )* + { + if (clauses.size()==0) + return null; + if (clauses.size()==1) + { + x = (BooleanClause)(clauses.get(0)); + if (!(x.prohibited)) + return x.query; + BooleanQuery rc = new BooleanQuery(); + rc.add(x); + return rc; + + } // end if + + BooleanQuery q = new BooleanQuery(); + for (int i=0; i x=boolean_expression() { if (x!=null) clauses.add(x); } )* + { + if (clauses.size()==0) + return null; + if (clauses.size()==1) + return (BooleanClause)(clauses.get(0)); + BooleanQuery q = new BooleanQuery(); + for (int i=0; i { required = true; } | { prohibited = true; } ] q=simple_expression() + { + return (q==null) ? null : new BooleanClause(q,required,prohibited); + + } // end block + +} // end boolean_expression + +Query simple_expression(): +{ + Query rc = null; + Token wght = null; + Token data = null; + boolean is_prefix = false; + boolean is_wildcard = false; + boolean is_fuzzy = false; + +} // end simple_expression declarations +{ + ( + ( + data= + | data= { is_prefix = true; } + | data= { is_wildcard = true; } + | data= + ) + [ { is_fuzzy = true; } ] [ wght= [ { is_fuzzy = true; } ] ] + { // "data" contains the search term value + if (is_wildcard) + rc = createWildcardQuery(data.image); + else if (is_prefix) + rc = createPrefixQuery(data.image.substring(0,data.image.length()-1)); + else if (is_fuzzy) + rc = createFuzzyQuery(data.image); + else + rc = createNormalQuery(data.image); + + } // end block + + | data= [ wght= ] + { // "data" contains the search term value + rc = createNormalQuery(data.image.substring(1,data.image.length()-1)); + + } // end block + + | rc=search_expression() [ wght= ] + ) + { + if (wght!=null) + { // set the weight of this query + float weightval = 1.0F; + try + { // parse the float value + weightval = Float.parseFloat(wght.image); + + } // end try + catch (Exception e) + { // ignore exceptions + } // end catch + + if (rc!=null) + rc.setBoost(weightval); + + } // end if + + return rc; + 
+ } // end block + +} // end simple_expression diff --git a/src/dynamo-framework/com/silverwrist/dynamo/index/StaticCharStream.java b/src/dynamo-framework/com/silverwrist/dynamo/index/StaticCharStream.java new file mode 100644 index 0000000..6c38682 --- /dev/null +++ b/src/dynamo-framework/com/silverwrist/dynamo/index/StaticCharStream.java @@ -0,0 +1,189 @@ +/* + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at . + * + * Software distributed under the License is distributed on an "AS IS" basis, WITHOUT + * WARRANTY OF ANY KIND, either express or implied. See the License for the specific + * language governing rights and limitations under the License. + * + * The Original Code is the Venice Web Communities System. + * + * The Initial Developer of the Original Code is Eric J. Bowersox , + * for Silverwrist Design Studios. Portions created by Eric J. Bowersox are + * Copyright (C) 2003 Eric J. Bowersox/Silverwrist Design Studios. All Rights Reserved. 
+ * + * Contributor(s): + */ +package com.silverwrist.dynamo.index; + +class StaticCharStream implements CharStream +{ + /*-------------------------------------------------------------------------------- + * Attributes + *-------------------------------------------------------------------------------- + */ + + private char[] m_array; // array full of characters to be read + int m_pos = 0; // index of next character to be read + int m_tokenstart = 0; // index of start of token + + /*-------------------------------------------------------------------------------- + * Constructor + *-------------------------------------------------------------------------------- + */ + + StaticCharStream(String s) + { + m_array = s.toCharArray(); + + } // end constructor + + /*-------------------------------------------------------------------------------- + * Implementations from interface CharStream + *-------------------------------------------------------------------------------- + */ + + /** + * Returns the next character from the selected input. The method + * of selecting the input is the responsibility of the class + * implementing this interface. Can throw any java.io.IOException. + */ + public char readChar() throws java.io.IOException + { + if (m_pos==m_array.length) + throw new java.io.IOException("read past EOF"); + return m_array[m_pos++]; + + } // end readChar + + /** + * Returns the column position of the character last read. + * @deprecated + * @see #getEndColumn + */ + public int getColumn() + { + return m_pos; + + } // end getColumn + + /** + * Returns the line number of the character last read. + * @deprecated + * @see #getEndLine + */ + public int getLine() + { + return 1; + + } // end getLine + + /** + * Returns the column number of the last character for current token (being + * matched after the last call to BeginTOken). 
+ */ + public int getEndColumn() + { + return m_pos; + + } // end getEndColumn + + /** + * Returns the line number of the last character for current token (being + * matched after the last call to BeginToken). + */ + public int getEndLine() + { + return 1; + + } // end getEndLine + + /** + * Returns the column number of the first character for current token (being + * matched after the last call to BeginToken). + */ + public int getBeginColumn() + { + return m_tokenstart; + + } // end getBeginColumn + + /** + * Returns the line number of the first character for current token (being + * matched after the last call to BeginToken). + */ + public int getBeginLine() + { + return 1; + + } // end getBeginLine + + /** + * Backs up the input stream by amount steps. Lexer calls this method if it + * had already read some characters, but could not use them to match a + * (longer) token. So, they will be used again as the prefix of the next + * token and it is the implementation's responsibility to do this right. + */ + public void backup(int amount) + { + m_pos -= amount; + + } // end backup + + /** + * Returns the next character that marks the beginning of the next token. + * All characters must remain in the buffer between two successive calls + * to this method to implement backup correctly. + */ + public char BeginToken() throws java.io.IOException + { + m_tokenstart = m_pos; + return this.readChar(); + + } // end BeginToken + + /** + * Returns a string made up of characters from the marked token beginning + * to the current buffer position. Implementations have the choice of returning + * anything that they want to. For example, for efficiency, one might decide + * to just return null, which is a valid implementation. + */ + public String GetImage() + { + return new String(m_array,m_tokenstart,m_pos - m_tokenstart); + + } // end GetImage + + /** + * Returns an array of characters that make up the suffix of length 'len' for + * the currently matched token. 
This is used to build up the matched string + * for use in actions in the case of MORE. A simple and inefficient + * implementation of this is as follows: + * + * { + * String t = GetImage(); + * return t.substring(t.length() - len, t.length()).toCharArray(); + * } + */ + public char[] GetSuffix(int len) + { + char[] rc = new char[len]; + System.arraycopy(m_array,m_pos - len,rc,0,len); + return rc; + + } // end GetSuffix + + /** + * The lexer calls this function to indicate that it is done with the stream + * and hence implementations can free any resources held by this class. + * Again, the body of this function can be just empty and it will not + * affect the lexer's operation. + */ + public void Done() + { + m_array = null; + + } // end Done + +} // end class StaticCharStream