From 197c4a7869ca009581f9ddd75f118bc3320ac832 Mon Sep 17 00:00:00 2001 From: Manish Singh Date: Wed, 8 Jan 2020 12:51:46 -0800 Subject: [PATCH] use ParserContext and add more element types in HTMLSchema --- build.xml | 8 +- core/definitions/{html.tssl => html5.tssl} | 33 ++++++ core/pom.xml | 10 +- .../com/yahoo/tagchowder/CommandLine.java | 6 +- .../com/yahoo/tagchowder/ElementType.java | 39 +++---- .../java/com/yahoo/tagchowder/Parser.java | 59 ++++++++-- .../com/yahoo/tagchowder/ParserContext.java | 105 ++++++++++++++++++ .../java/com/yahoo/tagchowder/Schema.java | 18 +-- .../java/com/yahoo/tagchowder/ParserTest.java | 78 ++++++++++++- core/src/test/resources/html/html5_badtag.txt | 13 +++ core/templates/HTMLSchema.java | 13 ++- pom.xml | 16 +-- 12 files changed, 331 insertions(+), 67 deletions(-) rename core/definitions/{html.tssl => html5.tssl} (97%) create mode 100644 core/src/main/java/com/yahoo/tagchowder/ParserContext.java create mode 100644 core/src/test/resources/html/html5_badtag.txt diff --git a/build.xml b/build.xml index cd175d9..8f9b469 100644 --- a/build.xml +++ b/build.xml @@ -86,21 +86,21 @@ - + Using ${transformer.factory} as the TransformerFactory - - - diff --git a/core/definitions/html.tssl b/core/definitions/html5.tssl similarity index 97% rename from core/definitions/html.tssl rename to core/definitions/html5.tssl index bce340b..32b933b 100644 --- a/core/definitions/html.tssl +++ b/core/definitions/html5.tssl @@ -2192,6 +2192,17 @@ + + @@ -2520,6 +2531,17 @@ + + @@ -2715,6 +2737,17 @@ + + diff --git a/core/pom.xml b/core/pom.xml index fd7fe4d..4252469 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -7,7 +7,7 @@ com.yahoo.tagchowder tagchowder - 2.0.17 + 2.0.18 tagchowder.core ${project.artifactId} @@ -49,7 +49,7 @@ ${project.basedir}/definitions - html.tssl + html5.tssl ${project.basedir}/tssl/tssl-model.xslt ${project.build.directory}/generated-resources/xml/xslt/model/ @@ -57,7 +57,7 @@ ${project.basedir}/definitions - html.tssl + html5.tssl ${project.basedir}/tssl/tssl-schema.xslt ${project.build.directory}/generated-resources/xml/xslt/schema/ @@ -88,7 +88,7 @@ ${project.build.directory}/generated-sources/com/yahoo/tagchowder/templates/HTMLModels.java false @@MODEL_DEFINITIONS@@ - ${project.build.directory}/generated-resources/xml/xslt/model/html.tssl + ${project.build.directory}/generated-resources/xml/xslt/model/html5.tssl @@ -102,7 +102,7 @@ ${project.build.directory}/generated-sources/com/yahoo/tagchowder/templates/HTMLSchema.java false @@SCHEMA_CALLS@@ - ${project.build.directory}/generated-resources/xml/xslt/schema/html.tssl + ${project.build.directory}/generated-resources/xml/xslt/schema/html5.tssl diff --git a/core/src/main/java/com/yahoo/tagchowder/CommandLine.java b/core/src/main/java/com/yahoo/tagchowder/CommandLine.java index 83934df..1c6a218 100644 --- a/core/src/main/java/com/yahoo/tagchowder/CommandLine.java +++ b/core/src/main/java/com/yahoo/tagchowder/CommandLine.java @@ -142,7 +142,6 @@ private static void doHelp() { private static String theOutputEncoding = null; // Process one source onto an output stream. - private static void process(final String src, final OutputStream os) throws IOException, SAXException { XMLReader r; if (hasOption(options, "--reuse")) { @@ -153,8 +152,9 @@ private static void process(final String src, final OutputStream os) throws IOEx } else { r = new Parser(); } - theSchema = new HTMLSchema(true); - r.setProperty(Parser.SCHEMA_PROPERTY, theSchema); + + r.setProperty(Parser.SCHEMA_PROPERTY, HTMLSchema.class); + theSchema = (HTMLSchema) ((Parser) r).getTheSchema(); if (hasOption(options, "--nocdata")) { r.setFeature(Parser.CDATA_ELEMENTS_FEATURE, false); diff --git a/core/src/main/java/com/yahoo/tagchowder/ElementType.java b/core/src/main/java/com/yahoo/tagchowder/ElementType.java index c59ad0f..89b9f0b 100644 --- a/core/src/main/java/com/yahoo/tagchowder/ElementType.java +++ b/core/src/main/java/com/yahoo/tagchowder/ElementType.java @@ -39,31 +39,40 @@ public class ElementType { private AttributesImpl theAtts; // default attributes private ElementType theParent; // parent of this element type private Schema theSchema; // schema to which this belongs - private boolean useIntern = true; // whether to use string intern or not + private ParserContext parserContext; // parser context /** * Construct an ElementType: but it's better to use Schema.element() instead. The content model, member-of, and flags vectors are specified as * ints. - * @param name The element type name + * @param name The element type name * @param model ORed-together bits representing the content models allowed in the content of this element type * @param memberOf ORed-together bits representing the content models to which this element type belongs * @param flags ORed-together bits representing the flags associated with this element type * @param schema The schema with which this element type will be associated - * @param useIntern whether to use string intern. + * @param parserContext the parser context */ - public ElementType(final String name, final int model, final int memberOf, final int flags, final Schema schema, final boolean useIntern) { + public ElementType(final String name, final int model, final int memberOf, + final int flags, final Schema schema, final ParserContext parserContext) { theName = name; theModel = model; theMemberOf = memberOf; theFlags = flags; theAtts = new AttributesImpl(); theSchema = schema; - this.useIntern = useIntern; + this.parserContext = parserContext; theNamespace = namespace(name, false); theLocalName = localName(name); } + /** + * Clear the state. + */ + public void clear() { + parserContext = null; + theSchema = null; + } + /** * Return a namespace name from a Qname. The attribute flag tells us whether to return an empty namespace name if there is no prefix, or use the * schema default instead. @@ -81,7 +90,7 @@ public String namespace(final String name, final boolean attribute) { if (prefix.equals("xml")) { return "http://www.w3.org/XML/1998/namespace"; } else { - return getReference(useIntern, "urn:x-prefix:" + prefix); + return parserContext.getReference("urn:x-prefix:" + prefix); } } @@ -96,7 +105,7 @@ public String localName(final String name) { if (colon == -1) { return name; } else { - return getReference(useIntern, name.substring(colon + 1)); + return parserContext.getReference(name.substring(colon + 1)); } } @@ -204,20 +213,6 @@ public boolean canContain(final ElementType other) { return (theModel & other.theMemberOf) != 0; } - /** - * Method to get reference with or without interning. - * @param useIntern whether to use string intern or not. - * @param input the input string. - * @return reference to the string. - */ - public String getReference(final boolean useIntern, final String input) { - if (useIntern) { - return input.intern(); - } else { - return input; // TODO: will put the hashmap here. - } - } - /** * Sets an attribute and its value into an AttributesImpl object. Attempts to set a namespace declaration are ignored. * @param atts The AttributesImpl object @@ -238,7 +233,7 @@ public void setAttribute(final AttributesImpl atts, final String name, final Str String localName = localName(n); int i = atts.getIndex(n); if (i == -1) { - n = getReference(useIntern, n); + n = parserContext.getReference(n); if (t == null) { t = "CDATA"; } diff --git a/core/src/main/java/com/yahoo/tagchowder/Parser.java b/core/src/main/java/com/yahoo/tagchowder/Parser.java index c47fb6c..4c419e9 100644 --- a/core/src/main/java/com/yahoo/tagchowder/Parser.java +++ b/core/src/main/java/com/yahoo/tagchowder/Parser.java @@ -28,6 +28,8 @@ import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; @@ -67,7 +69,6 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le private Schema theSchema; private Scanner theScanner; private AutoDetector theAutoDetector; - private boolean useIntern = true; /** Logger. */ private Logger logger = LoggerFactory.getLogger(Parser.class); @@ -97,6 +98,11 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; private boolean cdataElements = DEFAULT_CDATA_ELEMENTS; + /** + * Parser Context. + */ + private ParserContext theParserContext = new ParserContext.Builder(true).build(); + /** * A value of "true" indicates namespace URIs and unprefixed local names for element and attribute names will be available. **/ @@ -305,6 +311,33 @@ public int getDefaultBufferSize() { return defaultBufferSize; } + /** + * Getter for parser context. + * @return the parser context + */ + public ParserContext getTheParserContext() { + return theParserContext; + } + + /** + * Getter for theSchema object. + * @return theSchema + */ + public Schema getTheSchema() { + return theSchema; + } + + + /** + * Clear the state. + */ + public void clear() { + theParserContext.clear(); + theParserContext = null; + theSchema.clear(); + theSchema = null; + } + @Override public boolean getFeature(final String name) throws SAXNotRecognizedException, SAXNotSupportedException { Boolean b = (Boolean) theFeatures.get(name); @@ -345,7 +378,7 @@ public void setFeature(final String name, final boolean value) throws SAXNotReco } else if (name.equals(CDATA_ELEMENTS_FEATURE)) { cdataElements = value; } else if (name.equals(STRING_INTERNING_FEATURE)) { - useIntern = value; + theParserContext.setUseIntern(value); } } @@ -381,10 +414,18 @@ public void setProperty(final String name, final Object value) throws SAXNotReco throw new SAXNotSupportedException("Your scanner is not a Scanner"); } } else if (name.equals(SCHEMA_PROPERTY)) { - if (value instanceof Schema) { - theSchema = (Schema) value; + if (value instanceof Class && Schema.class.isAssignableFrom((Class) value)) { + try { + String className = ((Class) value).getName(); // Get the class name + Class clazz = Class.forName(className); // Class object + Constructor constructor = clazz.getConstructor(Parser.class); + theSchema = (Schema) constructor.newInstance(this); //Invoke the constructor to get new object + } catch (IllegalAccessException | InstantiationException + | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) { + throw new SAXNotSupportedException("Not able to create schema object"); + } } else { - throw new SAXNotSupportedException("Your schema is not a Schema"); + throw new SAXNotSupportedException("Either your schema is not a Schema or you did not pass schema class"); } } else if (name.equals(AUTO_DETECTOR_PROPERTY)) { if (value instanceof AutoDetector) { @@ -450,6 +491,7 @@ public void parse(final InputSource input) throws IOException, SAXException { theContentHandler.startPrefixMapping(theSchema.getPrefix(), theSchema.getURI()); } theScanner.scan(r, this); + clear(); } @Override @@ -466,7 +508,7 @@ public void parse(final String systemid) throws IOException, SAXException { // Sets up instance variables that haven't been set by setFeature private void setup() { if (theSchema == null) { - theSchema = new HTMLSchema(useIntern); + theSchema = new HTMLSchema(this); } if (theScanner == null) { theScanner = new HTMLScanner(defaultBufferSize); @@ -501,7 +543,7 @@ private Reader getReader(final InputSource s) throws SAXException, IOException { if (r == null) { if (i == null) { i = getInputStream(publicid, systemid); - // i = new BufferedInputStream(i); + // i = new BufferedInputStream(i); } if (encoding == null) { r = theAutoDetector.autoDetectingReader(i); @@ -1191,7 +1233,7 @@ private String makeName(final char[] buff, final int offset, final int length) { if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') { dst.append('_'); } - return dst.toString().intern(); + return theParserContext.getReference(dst.toString()); } // Default LexicalHandler implementation @@ -1223,5 +1265,4 @@ public void startDTD(final String name, final String publicid, final String syst @Override public void startEntity(final String name) throws SAXException { } - } diff --git a/core/src/main/java/com/yahoo/tagchowder/ParserContext.java b/core/src/main/java/com/yahoo/tagchowder/ParserContext.java new file mode 100644 index 0000000..4c48e00 --- /dev/null +++ b/core/src/main/java/com/yahoo/tagchowder/ParserContext.java @@ -0,0 +1,105 @@ +/* + * + * ==================================================================== + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ + +/* + * Changes to the original project are Copyright 2019 Oath Inc. + */ + +package com.yahoo.tagchowder; + +import javax.annotation.Nonnull; +import java.util.HashMap; +import java.util.Map; + +/** + * Class to store the current context/state of the Parser object. + * @author manishsingh + */ +final class ParserContext { + /** + * Flag to decide whether to do string intern or not. + */ + private boolean useIntern = true; + /** + * Hash Map to be used when not using string intern. + */ + private Map stringPoolMap = new HashMap<>(); + + /** + * Constructor. + * @param builder the builder object + */ + private ParserContext(final Builder builder) { + this.useIntern = builder.useIntern; + } + + /** + * Set the use intern flag. + * @param useIntern whether to use string intern or not + */ + void setUseIntern(final boolean useIntern) { + this.useIntern = useIntern; + } + + /** + * Method to get reference with or without interning. + * @param input the input string. + * @return reference to the string. + */ + String getReference(final String input) { + if (useIntern) { + return input.intern(); + } else { + return input; // TODO: will put the hashmap here. + } + } + + /** + * Builder for various parameters to create instance of the parser context. + * @author manishsingh + * + */ + static class Builder { + /** + * Flag to decide whether to do string intern or not. + */ + private boolean useIntern; + + /** + * Constructor for the builder. + * @param useIntern whether to use intern or not. + */ + Builder(@Nonnull final boolean useIntern) { + this.useIntern = useIntern; + } + /** + * Build the ParserContext object. + * @return parser context object + */ + ParserContext build() { + return new ParserContext(this); + } + } + + /** + * Clear the state. + */ + void clear() { + stringPoolMap.clear(); + stringPoolMap = null; + } +} diff --git a/core/src/main/java/com/yahoo/tagchowder/Schema.java b/core/src/main/java/com/yahoo/tagchowder/Schema.java index 19ce2fa..79d2df5 100644 --- a/core/src/main/java/com/yahoo/tagchowder/Schema.java +++ b/core/src/main/java/com/yahoo/tagchowder/Schema.java @@ -52,14 +52,19 @@ public abstract class Schema { private String theURI = ""; private String thePrefix = ""; private ElementType theRoot = null; - private boolean useIntern = true; + private ParserContext theParserContext; + + protected Schema(final Parser parser) { + this.theParserContext = parser.getTheParserContext(); + } /** - * Method to set useIntern. - * @param useIntern whether to use string intern or not + * Clear the state. */ - protected void setUseIntern(final boolean useIntern) { - this.useIntern = useIntern; + public void clear() { + theParserContext = null; + theElementTypes.clear(); + theElementTypes = null; } /** @@ -72,7 +77,7 @@ protected void setUseIntern(final boolean useIntern) { **/ public void elementType(final String name, final int model, final int memberOf, final int flags) { - ElementType e = new ElementType(name, model, memberOf, flags, this, useIntern); + ElementType e = new ElementType(name, model, memberOf, flags, this, theParserContext); theElementTypes.put(name.toLowerCase(), e); if (memberOf == M_ROOT) { theRoot = e; @@ -202,5 +207,4 @@ public void setURI(final String uri) { public void setPrefix(final String prefix) { thePrefix = prefix; } - } diff --git a/core/src/test/java/com/yahoo/tagchowder/ParserTest.java b/core/src/test/java/com/yahoo/tagchowder/ParserTest.java index d195dc2..de2fb6f 100644 --- a/core/src/test/java/com/yahoo/tagchowder/ParserTest.java +++ b/core/src/test/java/com/yahoo/tagchowder/ParserTest.java @@ -14,7 +14,6 @@ * limitations under the License. * ==================================================================== */ - /* * Changes to the original project are Copyright 2019 Oath Inc. */ @@ -27,6 +26,8 @@ import java.io.StringReader; import java.nio.charset.StandardCharsets; +import com.yahoo.tagchowder.templates.HTMLSchema; +import org.testng.Assert; import org.testng.annotations.Test; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -65,6 +66,58 @@ public void testSampleHtmlWithDoubleQuotePublicId() throws IOException, SAXExcep parser.parse(inSource); } + /** + * Parse an sample html with only double quote in public id inside without string intern !DOCTYPE. + * + * @throws IOException IOException when error + * @throws SAXException SAXException when error + */ + @Test + public void testSampleHtmlWithDoubleQuotePublicIdWithoutIntern() throws IOException, SAXException { + final String html = getSampleHtml("html5_badtag.txt"); + final Parser parser = new Parser(); + parser.setFeature(Parser.STRING_INTERNING_FEATURE, false); + // Create a test schema + // Set the intern to false with parser context object + parser.setProperty(Parser.SCHEMA_PROPERTY, HTMLSchema.class); + // Below feature will set the intern to true in parser context object + parser.setFeature(Parser.STRING_INTERNING_FEATURE, true); + final InputSource inSource = new InputSource(new StringReader(html)); + parser.parse(inSource); + Assert.assertNull(parser.getTheParserContext(), "Parser context should be null"); + Assert.assertNull(parser.getTheSchema(), "Parser schema object should be null"); + } + + /** + * Test for checking if the elements "a", "ul" and "ol" have correct model and memberships for html version 5. + * + */ + @Test + public void testSchemaModelAndMembershipsForHTML5Support() { + // Create a test schema + final Schema testSchema = new HTMLSchema(new Parser()); + + int currentModelForA = testSchema.getElementType("a").model(); + int expectedModelForA = 1073774596; + Assert.assertEquals(currentModelForA, expectedModelForA, "Unexpected model found for \"a\""); + + int currentModelForUl = testSchema.getElementType("ul").model(); + int expectedModelForUl = 16384; + Assert.assertEquals(currentModelForUl, expectedModelForUl, "Unexpected model found for \"ul\""); + + int currentModelForOl = testSchema.getElementType("ol").model(); + int expectedModelForOl = 16384; + Assert.assertEquals(currentModelForOl, expectedModelForOl, "Unexpected model found for \"ol\""); + + int currentMembershipForUl = testSchema.getElementType("ul").memberOf(); + int expectedMembershipForUl = 16388; + Assert.assertEquals(currentMembershipForUl, expectedMembershipForUl, "Unexpected membership found for \"ul\""); + + int currentMembershipForOl = testSchema.getElementType("ol").memberOf(); + int expectedMembershipForOl = 16388; + Assert.assertEquals(currentMembershipForOl, expectedMembershipForOl, "Unexpected membership found for \"ol\""); + } + /** * Parse a sample html with only single quote in public id inside !DOCTYPE. * @@ -86,11 +139,30 @@ public void testSampleHtmlWithSingleQuotePublicId() throws IOException, SAXExcep * @throws SAXException SAXException */ @Test - public void testSampleHtml5() throws IOException, SAXException { + public void test1SampleHtml5() throws IOException, SAXException { + final String html = getSampleHtml("html5.txt"); + final Parser parser = new Parser(); + parser.setFeature(Parser.STRING_INTERNING_FEATURE, false); + final InputSource inSource = new InputSource(new StringReader(html)); + parser.parse(inSource); + } + + /** + * Parse an sample html5. + * Verify that the same parser object cannot be used to parse twice. + * + * @throws IOException IOException when error + * @throws SAXException SAXException when error + */ + @Test(expectedExceptions = NullPointerException.class) + public void test2SampleHtml5() throws IOException, SAXException { final String html = getSampleHtml("html5.txt"); final Parser parser = new Parser(); final InputSource inSource = new InputSource(new StringReader(html)); parser.parse(inSource); + final String htmlAgain = getSampleHtml("html.txt"); + final InputSource inSourceAgain = new InputSource(new StringReader(htmlAgain)); + parser.parse(inSourceAgain); } /** @@ -112,4 +184,4 @@ private String getSampleHtml(final String filename) throws IOException { } return retString.toString(); } -} +} \ No newline at end of file diff --git a/core/src/test/resources/html/html5_badtag.txt b/core/src/test/resources/html/html5_badtag.txt new file mode 100644 index 0000000..faf7cc4 --- /dev/null +++ b/core/src/test/resources/html/html5_badtag.txt @@ -0,0 +1,13 @@ + + + + +Title of the document +flsdjflksdj + + + +Content of the document...... + + + \ No newline at end of file diff --git a/core/templates/HTMLSchema.java b/core/templates/HTMLSchema.java index 47e1847..42c1670 100644 --- a/core/templates/HTMLSchema.java +++ b/core/templates/HTMLSchema.java @@ -25,18 +25,19 @@ package com.yahoo.tagchowder.templates; import com.yahoo.tagchowder.Schema; +import com.yahoo.tagchowder.Parser; public class HTMLSchema extends Schema implements HTMLModels { /** - * Returns a newly constructed HTMLSchema object independent of any existing ones. - * @param useIntern enable jvm string intern method - */ - public HTMLSchema(final boolean useIntern) { - setUseIntern(useIntern); + Returns a newly constructed HTMLSchema object independent of + any existing ones. + @param parser the parser object + */ + public HTMLSchema(final Parser parser) { + super(parser); // Start of Schema calls @@SCHEMA_CALLS@@ // End of Schema calls } - } diff --git a/pom.xml b/pom.xml index ca1a4e9..7fc4845 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.yahoo.tagchowder tagchowder pom - 2.0.17 + 2.0.18 ${project.artifactId} https://github.com/yahoo/tagchowder Parent POM file for tagchowder project @@ -219,7 +219,7 @@ - org.eclipse.m2e lifecycle-mapping @@ -384,10 +384,10 @@ 2.1.1 - true @@ -405,7 +405,7 @@ false - check-compile @@ -449,7 +449,7 @@ -Djdk.attach.allowAttachSelf=true notIsolate,EventListenersRegression - true false