Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,21 @@
</target>

<!-- prepare generation of the parser classes based on the definition files -->
<target depends="init,prepare" description="Prepare generation of parser classes." name="prepare-parser">
<target depends="init,prepare" description="Prepare generation of parser classes." name="prepare-parser">

<echo>
Using ${transformer.factory} as the TransformerFactory
</echo>

<xslt in="${src}/definitions/html.tssl" out="${tmp}/HTMLModels.i"
<xslt in="${src}/definitions/html5.tssl" out="${tmp}/HTMLModels.i"
style="tssl/tssl-models.xslt">
<factory name="${transformer.factory}"/>
</xslt>
<xslt in="${src}/definitions/html.tssl" out="${tmp}/HTMLSchema.i"
<xslt in="${src}/definitions/html5.tssl" out="${tmp}/HTMLSchema.i"
style="tssl/tssl.xslt">
<factory name="${transformer.factory}"/>
</xslt>
<xslt in="${src}/definitions/html.stml" out="${tmp}/HTMLScanner.i"
<xslt in="${src}/definitions/html.stml" out="${tmp}/HTMLScanner.i"
style="stml/stml.xslt">
<factory name="${transformer.factory}"/>
</xslt>
Expand Down
33 changes: 33 additions & 0 deletions core/definitions/html.tssl → core/definitions/html5.tssl
Original file line number Diff line number Diff line change
Expand Up @@ -2192,6 +2192,17 @@
<element name='a' type='mixed'>
<memberOf group='M_INLINE'/>
<contains group='M_NOLINK'/>
<!--
// This is a fix class for Tagsoup's HTMLSchema implementation. It was created to fix an issue where lists were flattened due to the incorrect
// detection of "End-Tag Omission".
// Internally, Tagsoup's HTMLSchema stores information about each element type, such as its name, what types it contains, its "membership" types, and
// option flags (in that order). In this class, the membership types of [ul] and [ol] are changed so that they are also members of M_LI. Since those
// elements can contain M_LI-type, this allows them to also contain [ul]s within [ul]s, [ol]s within [ul]s, etc.
// The element "ul" is already defined in HTMLSchema. However, the method elementType sets the mapping between the lowercase name of the element type and
// the four parameters provided. By setting it again, it overrides the previous definition safely.
// We also configure the element "a" to be a block level element so that it can contain other elements. This is allowed in HTML5.
-->
<contains group='M_BLOCK'/>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add unit tests (UTs) for these 3 changes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need comments that explain why these changes are here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

<attribute name='hreflang' type='NMTOKEN' />
<attribute name='shape' default='rect'/>
<attribute name='tabindex' type='NMTOKEN' />
Expand Down Expand Up @@ -2520,6 +2531,17 @@
</element>
<element name='ol' type='element'>
<memberOf group='M_BLOCK'/>
<!--
// This is a fix class for Tagsoup's HTMLSchema implementation. It was created to fix an issue where lists were flattened due to the incorrect
// detection of "End-Tag Omission".
// Internally, Tagsoup's HTMLSchema stores information about each element type, such as its name, what types it contains, its "membership" types, and
// option flags (in that order). In this class, the membership types of [ul] and [ol] are changed so that they are also members of M_LI. Since those
// elements can contain M_LI-type, this allows them to also contain [ul]s within [ul]s, [ol]s within [ul]s, etc.
// The element "ul" is already defined in HTMLSchema. However, the method elementType sets the mapping between the lowercase name of the element type and
// the four parameters provided. By setting it again, it overrides the previous definition safely.
// We also configure the element "a" to be a block level element so that it can contain other elements. This is allowed in HTML5.
-->
<memberOf group='M_LI'/>
<contains group='M_LI'/>
<attribute name='compact' type='BOOLEAN' />
<attribute name='start' type='NMTOKEN' />
Expand Down Expand Up @@ -2715,6 +2737,17 @@
</element>
<element name='ul' type='element'>
<memberOf group='M_BLOCK'/>
<!--
// This is a fix class for Tagsoup's HTMLSchema implementation. It was created to fix an issue where lists were flattened due to the incorrect
// detection of "End-Tag Omission".
// Internally, Tagsoup's HTMLSchema stores information about each element type, such as its name, what types it contains, its "membership" types, and
// option flags (in that order). In this class, the membership types of [ul] and [ol] are changed so that they are also members of M_LI. Since those
// elements can contain M_LI-type, this allows them to also contain [ul]s within [ul]s, [ol]s within [ul]s, etc.
// The element "ul" is already defined in HTMLSchema. However, the method elementType sets the mapping between the lowercase name of the element type and
// the four parameters provided. By setting it again, it overrides the previous definition safely.
// We also configure the element "a" to be a block level element so that it can contain other elements. This is allowed in HTML5.
-->
<memberOf group='M_LI'/>
<contains group='M_LI'/>
<attribute name='compact' type='BOOLEAN' />
<attribute name='type' type='NMTOKEN' />
Expand Down
10 changes: 5 additions & 5 deletions core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<parent>
<groupId>com.yahoo.tagchowder</groupId>
<artifactId>tagchowder</artifactId>
<version>2.0.17</version>
<version>2.0.18</version>
</parent>
<artifactId>tagchowder.core</artifactId>
<name>${project.artifactId}</name>
Expand Down Expand Up @@ -49,15 +49,15 @@
<transformationSet>
<dir>${project.basedir}/definitions</dir>
<includes>
<include>html.tssl</include>
<include>html5.tssl</include>
</includes>
<stylesheet>${project.basedir}/tssl/tssl-model.xslt</stylesheet>
<outputDir>${project.build.directory}/generated-resources/xml/xslt/model/</outputDir>
</transformationSet>
<transformationSet>
<dir>${project.basedir}/definitions</dir>
<includes>
<include>html.tssl</include>
<include>html5.tssl</include>
</includes>
<stylesheet>${project.basedir}/tssl/tssl-schema.xslt</stylesheet>
<outputDir>${project.build.directory}/generated-resources/xml/xslt/schema/</outputDir>
Expand Down Expand Up @@ -88,7 +88,7 @@
<outputFile>${project.build.directory}/generated-sources/com/yahoo/tagchowder/templates/HTMLModels.java</outputFile>
<regex>false</regex>
<token>@@MODEL_DEFINITIONS@@</token>
<valueFile>${project.build.directory}/generated-resources/xml/xslt/model/html.tssl</valueFile>
<valueFile>${project.build.directory}/generated-resources/xml/xslt/model/html5.tssl</valueFile>
</configuration>
</execution>
<execution>
Expand All @@ -102,7 +102,7 @@
<outputFile>${project.build.directory}/generated-sources/com/yahoo/tagchowder/templates/HTMLSchema.java</outputFile>
<regex>false</regex>
<token>@@SCHEMA_CALLS@@</token>
<valueFile>${project.build.directory}/generated-resources/xml/xslt/schema/html.tssl</valueFile>
<valueFile>${project.build.directory}/generated-resources/xml/xslt/schema/html5.tssl</valueFile>
</configuration>
</execution>
<execution>
Expand Down
6 changes: 3 additions & 3 deletions core/src/main/java/com/yahoo/tagchowder/CommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ private static void doHelp() {
private static String theOutputEncoding = null;

// Process one source onto an output stream.

private static void process(final String src, final OutputStream os) throws IOException, SAXException {
XMLReader r;
if (hasOption(options, "--reuse")) {
Expand All @@ -153,8 +152,9 @@ private static void process(final String src, final OutputStream os) throws IOEx
} else {
r = new Parser();
}
theSchema = new HTMLSchema(true);
r.setProperty(Parser.SCHEMA_PROPERTY, theSchema);

r.setProperty(Parser.SCHEMA_PROPERTY, HTMLSchema.class);
theSchema = (HTMLSchema) ((Parser) r).getTheSchema();

if (hasOption(options, "--nocdata")) {
r.setFeature(Parser.CDATA_ELEMENTS_FEATURE, false);
Expand Down
39 changes: 17 additions & 22 deletions core/src/main/java/com/yahoo/tagchowder/ElementType.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,40 @@ public class ElementType {
private AttributesImpl theAtts; // default attributes
private ElementType theParent; // parent of this element type
private Schema theSchema; // schema to which this belongs
private boolean useIntern = true; // whether to use string intern or not
private ParserContext parserContext; // parser context

/**
* Construct an ElementType: but it's better to use Schema.element() instead. The content model, member-of, and flags vectors are specified as
* ints.
* @param name The element type name
* @param name The element type name
* @param model ORed-together bits representing the content models allowed in the content of this element type
* @param memberOf ORed-together bits representing the content models to which this element type belongs
* @param flags ORed-together bits representing the flags associated with this element type
* @param schema The schema with which this element type will be associated
* @param useIntern whether to use string intern.
* @param parserContext the parser context
*/

public ElementType(final String name, final int model, final int memberOf, final int flags, final Schema schema, final boolean useIntern) {
public ElementType(final String name, final int model, final int memberOf,
final int flags, final Schema schema, final ParserContext parserContext) {
theName = name;
theModel = model;
theMemberOf = memberOf;
theFlags = flags;
theAtts = new AttributesImpl();
theSchema = schema;
this.useIntern = useIntern;
this.parserContext = parserContext;
theNamespace = namespace(name, false);
theLocalName = localName(name);
}

/**
 * Drops the references this element type holds to its owning schema and to the parser context by setting both fields to null.
 */
public void clear() {
    this.theSchema = null;
    this.parserContext = null;
}

/**
* Return a namespace name from a Qname. The attribute flag tells us whether to return an empty namespace name if there is no prefix, or use the
* schema default instead.
Expand All @@ -81,7 +90,7 @@ public String namespace(final String name, final boolean attribute) {
if (prefix.equals("xml")) {
return "http://www.w3.org/XML/1998/namespace";
} else {
return getReference(useIntern, "urn:x-prefix:" + prefix);
return parserContext.getReference("urn:x-prefix:" + prefix);
}
}

Expand All @@ -96,7 +105,7 @@ public String localName(final String name) {
if (colon == -1) {
return name;
} else {
return getReference(useIntern, name.substring(colon + 1));
return parserContext.getReference(name.substring(colon + 1));
}
}

Expand Down Expand Up @@ -204,20 +213,6 @@ public boolean canContain(final ElementType other) {
return (theModel & other.theMemberOf) != 0;
}

/**
 * Returns a reference to the given string, interned on request.
 * @param useIntern when true the interned instance of the input is returned, when false the input itself is returned.
 * @param input the string to obtain a reference for.
 * @return the interned string, or the input unchanged when interning is not requested.
 */
public String getReference(final boolean useIntern, final String input) {
    // TODO: will put the hashmap here (for the non-interning case).
    return useIntern ? input.intern() : input;
}

/**
* Sets an attribute and its value into an AttributesImpl object. Attempts to set a namespace declaration are ignored.
* @param atts The AttributesImpl object
Expand All @@ -238,7 +233,7 @@ public void setAttribute(final AttributesImpl atts, final String name, final Str
String localName = localName(n);
int i = atts.getIndex(n);
if (i == -1) {
n = getReference(useIntern, n);
n = parserContext.getReference(n);
if (t == null) {
t = "CDATA";
}
Expand Down
59 changes: 50 additions & 9 deletions core/src/main/java/com/yahoo/tagchowder/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
Expand Down Expand Up @@ -67,7 +69,6 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
private Schema theSchema;
private Scanner theScanner;
private AutoDetector theAutoDetector;
private boolean useIntern = true;
/** Logger. */
private Logger logger = LoggerFactory.getLogger(Parser.class);

Expand Down Expand Up @@ -97,6 +98,11 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
private boolean cdataElements = DEFAULT_CDATA_ELEMENTS;

/**
* Parser Context.
*/
private ParserContext theParserContext = new ParserContext.Builder(true).build();

/**
* A value of "true" indicates namespace URIs and unprefixed local names for element and attribute names will be available.
**/
Expand Down Expand Up @@ -305,6 +311,33 @@ public int getDefaultBufferSize() {
return defaultBufferSize;
}

/**
 * Returns the parser context currently held by this parser.
 * @return the parser context; null once clear() has been called
 */
public ParserContext getTheParserContext() {
    return this.theParserContext;
}

/**
 * Returns the schema currently held by this parser.
 * @return the schema; null if none has been assigned yet or once clear() has been called
 */
public Schema getTheSchema() {
    return this.theSchema;
}


/**
 * Clear the state: clears and then drops the parser context and the schema held by this parser.
 * Each reference is checked for null first, so calling this method more than once, or before a schema
 * has been assigned, does not throw a NullPointerException.
 * NOTE(review): parse() calls this method after scanning, while setFeature(STRING_INTERNING_FEATURE, ...) and
 * makeName() dereference theParserContext without a null check - confirm whether a parser is meant to be
 * usable again after clear().
 */
public void clear() {
    if (theParserContext != null) {
        theParserContext.clear();
        theParserContext = null;
    }
    if (theSchema != null) {
        theSchema.clear();
        theSchema = null;
    }
}

@Override
public boolean getFeature(final String name) throws SAXNotRecognizedException, SAXNotSupportedException {
Boolean b = (Boolean) theFeatures.get(name);
Expand Down Expand Up @@ -345,7 +378,7 @@ public void setFeature(final String name, final boolean value) throws SAXNotReco
} else if (name.equals(CDATA_ELEMENTS_FEATURE)) {
cdataElements = value;
} else if (name.equals(STRING_INTERNING_FEATURE)) {
useIntern = value;
theParserContext.setUseIntern(value);
}
}

Expand Down Expand Up @@ -381,10 +414,18 @@ public void setProperty(final String name, final Object value) throws SAXNotReco
throw new SAXNotSupportedException("Your scanner is not a Scanner");
}
} else if (name.equals(SCHEMA_PROPERTY)) {
if (value instanceof Schema) {
theSchema = (Schema) value;
if (value instanceof Class && Schema.class.isAssignableFrom((Class) value)) {
try {
String className = ((Class) value).getName(); // Get the class name
Class<?> clazz = Class.forName(className); // Class object
Constructor<?> constructor = clazz.getConstructor(Parser.class);
theSchema = (Schema) constructor.newInstance(this); //Invoke the constructor to get new object
} catch (IllegalAccessException | InstantiationException
| InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
throw new SAXNotSupportedException("Not able to create schema object");
}
} else {
throw new SAXNotSupportedException("Your schema is not a Schema");
throw new SAXNotSupportedException("Either your schema is not a Schema or you did not pass schema class");
}
} else if (name.equals(AUTO_DETECTOR_PROPERTY)) {
if (value instanceof AutoDetector) {
Expand Down Expand Up @@ -450,6 +491,7 @@ public void parse(final InputSource input) throws IOException, SAXException {
theContentHandler.startPrefixMapping(theSchema.getPrefix(), theSchema.getURI());
}
theScanner.scan(r, this);
clear();
}

@Override
Expand All @@ -466,7 +508,7 @@ public void parse(final String systemid) throws IOException, SAXException {
// Sets up instance variables that haven't been set by setFeature
private void setup() {
if (theSchema == null) {
theSchema = new HTMLSchema(useIntern);
theSchema = new HTMLSchema(this);
}
if (theScanner == null) {
theScanner = new HTMLScanner(defaultBufferSize);
Expand Down Expand Up @@ -501,7 +543,7 @@ private Reader getReader(final InputSource s) throws SAXException, IOException {
if (r == null) {
if (i == null) {
i = getInputStream(publicid, systemid);
// i = new BufferedInputStream(i);
// i = new BufferedInputStream(i);
}
if (encoding == null) {
r = theAutoDetector.autoDetectingReader(i);
Expand Down Expand Up @@ -1191,7 +1233,7 @@ private String makeName(final char[] buff, final int offset, final int length) {
if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') {
dst.append('_');
}
return dst.toString().intern();
return theParserContext.getReference(dst.toString());
}

// Default LexicalHandler implementation
Expand Down Expand Up @@ -1223,5 +1265,4 @@ public void startDTD(final String name, final String publicid, final String syst
@Override
public void startEntity(final String name) throws SAXException {
}

}
Loading