001 /**
002 * Copyright 2011 The Buzz Media, LLC
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016 package com.thebuzzmedia.sjxp.rule;
017
018 import com.thebuzzmedia.sjxp.XMLParser;
019
020 /**
021 * Interface used to describe a "rule" in SJXP.
022 * <p/>
023 * The most important part of a rule is its <code>locationPath</code>, this
024 * literal {@link String} value is how the {@link XMLParser} matches up its
025 * current position inside of an XML doc with any {@link IRule}s that want
026 * information from that location.
027 * <p/>
028 * The <code>type</code> of the {@link IRule} indicates to the executing
029 * {@link XMLParser} when the rule should be queried for a match against its
030 * current position.
031 * <p/>
032 * All implementors must provide an implementation for the
033 * <code>handleParsedXXX</code> method matching the <code>type</code> of rule
034 * they have created. More specifically, if you are creating a
035 * {@link Type#ATTRIBUTE} rule, you need to implement the
036 * {@link #handleParsedAttribute(XMLParser, int, String, Object)} method; if you
037 * are implementing a {@link Type#CHARACTER} rule, you need to implement the
038 * {@link #handleParsedCharacters(XMLParser, String, Object)} method.
039 * <h3>Rule Matching</h3>
040 * Rules will execute every single time they match an element in an XML
041 * document. There is no XPath-like expression system to tell them to only get
042 * you the first, or 10th or every-other value from a document; you must
043 * implement that logic yourself inside of the <code>handleParsedXXX</code>
044 * handlers.
045 * <h3>Instance Reuse</h3>
046 * Instances of {@link IRule} are meant to be immutable and maintain no internal
047 * state which makes them safe for reuse among multiple instances of
048 * {@link XMLParser}.
049 * <h3>Rule Format</h3>
050 * The format of a location path is like a simple XPath rule with no
051 * expressions, for example:
052 *
053 * <pre>
054 * /library/book/title
055 * </pre>
056 *
057 * would point the "title" element inside of the "book" element which is inside
058 * the "library" element. If you are after a specific attribute of that element,
059 * simply provide its name as an attribute argument.
060 * <h3>Rule Format - Namespaces</h3>
061 * Referring to a namespace-qualified element in an XML doc is easy; whether it
062 * is part of the location path or an attribute name, all you have to do is
063 * prefix the local name of the element with brackets ([]) and the full
064 * namespace URI within the brackets, like:
065 *
066 * <pre>
067 * /library/[http://w3.org/texts]book/title
068 * </pre>
069 *
070 * In the example above, the "book" element is from a namespace defined by
071 * "http://w3.org/texts". Inside the actual XML markup, it is likely written
072 * with a friendly URI prefix that is defined at the top of the file, and would
073 * look more like this: <em>
074 * <txt:books>
075 * </em> but using the URI prefixes is not exact, as they can change from
076 * document to document, so SJXP requires that you reference the namespace using
077 * the URI itself, and not a prefix.
078 * <p/>
079 * In the case where the attribute itself is namespace-qualified, like
080 * <em><item rdf:about="blah" /></em>, you use the same notation for the
081 * attribute name, in this case (assuming the official RDF namespace) the
082 * attribute name argument you would actually return would look like this:
083 *
084 * <pre>
085 * [http://www.w3.org/1999/02/22-rdf-syntax-ns#]about
086 * </pre>
087 *
088 * It can look a little confusing, but it is exact and won't lead to
089 * impossible-to-debug scenarios.
090 * <h3>Rule Format - Default Namespaces</h3>
091 * Some XML files will define a default namespace using the <code>xmlns</code>
092 * argument, by itself, in the header. If your document does this, any tag in
093 * the document that isn't defined with a namespace prefix, will have to be
094 * referenced with the default namespace because that is how the XML file is
095 * technically defined.
096 * <p/>
097 * An example of this is Slashdot's RDF feed
098 * (http://rss.slashdot.org/Slashdot/slashdot); a default namespace of
099 * "http://purl.org/rss/1.0/" is defined, so all un-prefixed tags in the
100 * document (like <title>, <link> or <description>) all need
101 * to be qualified with that default URI, looking like this:
102 *
103 * <pre>
104 * [http://purl.org/rss/1.0/]title
105 * </pre>
106 *
107 * when you define the location path for those parse elements.
108 * <p/>
109 * It is important to be aware of this aspect of XML files otherwise you will
110 * run into scenarios where you can't understand why the parse value isn't being
111 * passed to you.
112 * <h3>Location Path & Attribute Name Strictness</h3>
113 * The implementation of SJXP is all based around strict name and namespace URI
114 * matching. If you do not specify a namespace URI for your element or attribute
115 * names, then only non-namespace-qualified elements will be looked for and
116 * matched; and visa-versa.
117 * <p/>
118 * If the XML content you are parsing is sloppy and you aren't sure if the
119 * values will be qualified correctly in every case, you will need to define 2
120 * {@link IRule}s; 1 for non-namespace-qualified values and 1 for
121 * namespace-qualified values.
122 * <p/>
123 * The SJXP library was purposefully designed to be pedantic to avoid "fuzzy"
124 * behavior that becomes maddening to debug in edge-case scenarios where you
125 * can't figure out why it is working one minute and breaking the next.
126 * <p/>
127 * Given the need of XML parsing in everything from video games to banking
128 * applications, SJXP had to take a very conservative approach and be as
129 * pedantic as possible so as not to hide any behavior from the caller.
130 *
131 * @param <T>
132 * The class type of any user-supplied object that the caller wishes
133 * to be passed through from one of the {@link XMLParser}'s
134 * <code>parse</code> methods directly to the handler when an
135 * {@link IRule} matches. This is typically a data storage mechanism
136 * like a DAO or cache used to store the parsed value in some
137 * valuable way, but it can ultimately be anything. If you do not
138 * need to make use of the user object, there is no need to
139 * parameterize the class.
140 *
141 * @author Riyad Kalla (software@thebuzzmedia.com)
142 */
143 public interface IRule<T> {
144 /**
145 * Used to describe the type of the parse rule.
146 */
147 public static enum Type {
148 /**
149 * Type used to indicate a rule interested in START_TAG and END_TAG
150 * events for the matching location path.
151 * <p/>
152 * This can be handy when no parsed data is needed from the underlying
153 * XML, but rather a simple notification that the location path existed
154 * in the XML (e.g. counting element occurrences).
155 */
156 TAG,
157 /**
158 * Type used to indicate that this rule describes 1 or more attribute
159 * values that the caller wants parsed.
160 */
161 ATTRIBUTE,
162 /**
163 * Used to describe a rule that will be called
164 *
165 * Type used to indicate that this rule describes the character data
166 * between an open and close tag that the caller wants parsed.
167 */
168 CHARACTER;
169 }
170
171 /**
172 * Used to get the type of the rule.
173 * <p/>
174 * The {@link XMLParser} uses this value to decide when to call this rule to
175 * see if it matches the current position inside the doc and how to parse
176 * out the values the rule wants.
177 *
178 * @return the type of the rule.
179 */
180 public Type getType();
181
182 /**
183 * Used to get the location path of the element inside the XML document that
184 * this rule is interested in.
185 * <p/>
186 * This value is compared literally against the internal path state of the
187 * {@link XMLParser} to see if they match before processing the rule. If you
188 * have a rule that isn't executing, chances are your location path is
189 * incorrect or mistyped or it is possible that your location path is
190 * correct but you have implemented the wrong <code>handleXXX</code> method
191 * so the default no-op one in {@link DefaultRule} is getting called.
192 * <h3>Namespaces</h3>
193 * Please refer to the class notes on the correct format used to define a
194 * path element that is namespace-qualified by using brackets.
195 * <p/>
196 * Namespace qualifiers can be specified for both element paths and
197 * attribute names.
198 *
199 * @return the location path of the element inside the XML document that
200 * this rule is interested in.
201 */
202 public String getLocationPath();
203
204 /**
205 * Used to get a list of attribute names that are to be parsed from the
206 * element located at {@link #getLocationPath()}.
207 * <p/>
208 * If the rule type is {@link Type#CHARACTER}, the attribute name list
209 * should be ignored.
210 * <h3>Namespaces</h3>
211 * Please refer to the class notes on the correct format used to define a
212 * path element that is namespace-qualified by using brackets.
213 * <p/>
214 * Namespace qualifiers can be specified for both element paths and
215 * attribute names.
216 *
217 * @return a list of attribute names that are to be parsed from the element
218 * located at {@link #getLocationPath()}.
219 */
220 public String[] getAttributeNames();
221
222 /**
223 * Handler method called by the {@link XMLParser} when an {@link IRule} of
224 * type {@link Type#TAG} matches the parser's current location in the
225 * document.
226 * <p/>
227 * This is a notification-style method, no data is parsed from the
228 * underlying document, the handler is merely called to give custom handling
229 * code a chance to respond to the matching open or close tag.
230 *
231 * @param parser
232 * The source {@link XMLParser} currently executing this rule.
233 * Providing access to the originating parser is handy if the
234 * rule wants to stop parsing by calling {@link XMLParser#stop()}
235 * .
236 * @param isStartTag
237 * Used to indicate if this notification is being made because
238 * the START_TAG (<code>true</code>) was encountered or the
239 * END_TAG (<code>false</code>) was encountered.
240 * @param userObject
241 * The user-supplied object passed through from the
242 * {@link XMLParser}'s <code>parse</code> method directly to this
243 * handler. This is typically a data storage mechanism like a DAO
244 * or cache used to hold parsed data or <code>null</code> if you
245 * do not need to make use of this pass-through mechanism and
246 * passed nothing to the {@link XMLParser} when you initiated the
247 * parse.
248 */
249 public void handleTag(XMLParser<T> parser, boolean isStartTag, T userObject);
250
251 /**
252 * Handler method called by the {@link XMLParser} when an {@link IRule} of
253 * type {@link Type#ATTRIBUTE} matches the parser's current location in the
254 * document.
255 *
256 * @param parser
257 * The source {@link XMLParser} currently executing this rule.
258 * Providing access to the originating parser is handy if the
259 * rule wants to stop parsing by calling {@link XMLParser#stop()}
260 * .
261 * @param index
262 * The index of the attribute name (from
263 * {@link #getAttributeNames()}) that this value belongs to.
264 * @param value
265 * The value for the given attribute.
266 * @param userObject
267 * The user-supplied object passed through from the
268 * {@link XMLParser}'s <code>parse</code> method directly to this
269 * handler. This is typically a data storage mechanism like a DAO
270 * or cache used to hold parsed data or <code>null</code> if you
271 * do not need to make use of this pass-through mechanism and
272 * passed nothing to the {@link XMLParser} when you initiated the
273 * parse.
274 *
275 * @see #getLocationPath()
276 * @see #getAttributeNames()
277 */
278 public void handleParsedAttribute(XMLParser<T> parser, int index,
279 String value, T userObject);
280
281 /**
282 * Handler method called by the {@link XMLParser} when an {@link IRule} of
283 * type {@link Type#CHARACTER} matches the parser's current location in the
284 * document.
285 * <p/>
286 * This method is not called by the {@link XMLParser} until all the
287 * character data has been coalesced together into a single {@link String}.
288 * You don't need to worry about re-combining chunked text elements.
289 *
290 * @param parser
291 * The source {@link XMLParser} currently executing this rule.
292 * Providing access to the originating parser is handy if the
293 * rule wants to stop parsing by calling {@link XMLParser#stop()}
294 * .
295 * @param text
296 * The character data contained between the open and close tags
297 * described by {@link #getLocationPath()}.
298 * @param userObject
299 * The user-supplied object passed through from the
300 * {@link XMLParser}'s <code>parse</code> method directly to this
301 * handler. This is typically a data storage mechanism like a DAO
302 * or cache used to hold parsed data or <code>null</code> if you
303 * do not need to make use of this pass-through mechanism and
304 * passed nothing to the {@link XMLParser} when you initiated the
305 * parse.
306 *
307 * @see #getLocationPath()
308 */
309 public void handleParsedCharacters(XMLParser<T> parser, String text,
310 T userObject);
311 }