Saturday, June 8, 2013

SvgClean.js

If I edit an SVG file in Inkscape and then save it, Inkscape pollutes the SVG with all manner of Inkscape-specific attributes and elements. It also needlessly adds RDF, Creative Commons and Dublin Core metadata, even when there is no real metadata to export. If I use "Plain SVG" instead of "Inkscape SVG" as the save file type, it excludes the sodipodi and inkscape namespace attributes, but it still adds the needless metadata. So this is a script which cleans that up. It removes:
  • All comments, processing instructions, etc.
  • All attributes and elements from non-SVG namespaces.
  • Metadata and defs elements if they have no content after the above.
  • In addition, the script pretty-prints the resulting XML.
// Namespace URIs the cleaning passes compare against: the reserved namespace
// that xmlns declarations belong to, and the SVG namespace itself.
// Declared with var so they are not created as implicit globals.
var nsXMLNS = "http://www.w3.org/2000/xmlns/";
var nsSVG = "http://www.w3.org/2000/svg";

// Serialises a DOM document to a pretty-printed XML string, using the
// DOM Load/Save ("LS") implementation obtained from the registry.
xmlToString = function(doc) {
   var registry = org.w3c.dom.bootstrap.DOMImplementationRegistry.newInstance();
   var lsImpl = registry.getDOMImplementation("LS");

   // Destination: an in-memory character stream
   var buffer = new java.io.StringWriter();
   var target = lsImpl.createLSOutput();
   target.setCharacterStream(buffer);

   // Serialiser with pretty-printing switched on
   var serializer = lsImpl.createLSSerializer();
   serializer.getDomConfig().setParameter("format-pretty-print", true);

   serializer.write(doc, target);
   buffer.close();
   // Concatenating with "" yields a JavaScript string value
   return "" + buffer.toString();
};

// Creates a namespace-aware DocumentBuilder with the Xerces
// "load-external-dtd" feature switched off.
getDocBuilder = function() {
   var factory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
   factory.setNamespaceAware(true);
   factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
   return factory.newDocumentBuilder();
};

// Parses the XML file at the given path and returns the resulting DOM document
loadXMLFile = function(file) {
   var source = new java.io.File(file);
   var builder = getDocBuilder();
   return builder.parse(source);
};

// Returns a plain array holding the current children of the given node.
// The array is a snapshot, so it can be walked while the tree is modified.
getKids = function(node) {
   var list = node.childNodes;
   var snapshot = [];
   var idx = 0;
   while (idx < list.length) {
      snapshot[idx] = list.item(idx);
      idx++;
   }
   return snapshot;
};

// Remove all comments, processing instructions, document types
//
// Walks the whole subtree below `node`. Each child that is a comment,
// document type or processing instruction is detached from `node`; every
// other child is cleaned recursively. Returns nothing; the tree is modified
// in place.
var cleanDocument1 = function(node) {
   var Node = org.w3c.dom.Node;
   // Work on a snapshot of the children so that removing one does not
   // disturb the iteration.
   var kids = getKids(node);
   // Indexed loop rather than for...in: for...in would also visit any
   // enumerable property added to Array.prototype.
   for (var i = 0; i < kids.length; i++) {
      var kid = kids[i];
      if (kid.nodeType === Node.COMMENT_NODE ||
              kid.nodeType === Node.DOCUMENT_TYPE_NODE ||
              kid.nodeType === Node.PROCESSING_INSTRUCTION_NODE)
         node.removeChild(kid);
      else
         cleanDocument1(kid);
   }
};

// Removes all elements and attributes in non-SVG namespace
//
// An element whose namespace is not the SVG namespace is detached from its
// parent together with its whole subtree. On the elements that stay, every
// attribute that has a namespace other than the xmlns or SVG namespace is
// removed; attributes without a namespace are kept. Modifies the tree in
// place and returns nothing.
//
// NOTE(review): this also strips namespaced attributes such as xlink:href
// and xml:space, which SVG documents commonly use - confirm that is intended.
var cleanDocument2 = function(node) {
   var Node = org.w3c.dom.Node;
   var i;
   if (node.nodeType === Node.ELEMENT_NODE) {
      if (!nsSVG.equals(node.namespaceURI)) {
         node.parentNode.removeChild(node);
         return;
      }
      // Collect first, remove afterwards, so the attribute list is not
      // modified while it is being walked.
      var attrs = node.attributes;
      var toRemove = [];
      for (i = 0; i < attrs.length; i++) {
         var attr = attrs.item(i);
         if (attr.namespaceURI !== null &&
                 !nsXMLNS.equals(attr.namespaceURI) &&
                 !nsSVG.equals(attr.namespaceURI))
            toRemove.push(attr);
      }
      // Indexed loops rather than for...in, which would also visit any
      // enumerable property added to Array.prototype.
      for (i = 0; i < toRemove.length; i++)
         node.removeAttributeNode(toRemove[i]);
   }
   var kids = getKids(node);
   for (i = 0; i < kids.length; i++)
      cleanDocument2(kids[i]);
};

// Remove empty metadata and defs elements
//
// A metadata or defs element counts as empty when it has no element
// children; text and other non-element children do not keep it alive.
// Children are processed before their parent, so a container whose only
// element children were themselves empty metadata/defs elements is removed
// as well. Modifies the tree in place and returns nothing.
var cleanDocument3 = function(node) {
   var Node = org.w3c.dom.Node;
   var i;
   // Indexed loop rather than for...in, which would also visit any
   // enumerable property added to Array.prototype.
   var kids = getKids(node);
   for (i = 0; i < kids.length; i++)
      cleanDocument3(kids[i]);
   if (node.nodeType !== Node.ELEMENT_NODE)
      return;
   if (!"metadata".equals(node.localName) &&
           !"defs".equals(node.localName))
      return;
   // Any remaining element child means the container still has content
   for (i = 0; i < node.childNodes.length; i++)
      if (node.childNodes.item(i).nodeType === Node.ELEMENT_NODE)
         return;
   node.parentNode.removeChild(node);
};

// Remove non-SVG XMLNS declarations
//
// On every element in the subtree, removes each attribute that belongs to
// the xmlns namespace (i.e. a namespace declaration) unless its value is the
// SVG namespace URI. Modifies the tree in place and returns nothing.
var cleanDocument4 = function(node) {
   var Node = org.w3c.dom.Node;
   var i;
   if (node.nodeType === Node.ELEMENT_NODE) {
      // Collect first, remove afterwards, so the attribute list is not
      // modified while it is being walked.
      var attrs = node.attributes;
      var toRemove = [];
      for (i = 0; i < attrs.length; i++) {
         var attr = attrs.item(i);
         if (attr.namespaceURI !== null &&
                 nsXMLNS.equals(attr.namespaceURI) &&
                 !nsSVG.equals(attr.value))
            toRemove.push(attr);
      }
      // Indexed loops rather than for...in, which would also visit any
      // enumerable property added to Array.prototype.
      for (i = 0; i < toRemove.length; i++)
         node.removeAttributeNode(toRemove[i]);
   }
   var kids = getKids(node);
   for (i = 0; i < kids.length; i++)
      cleanDocument4(kids[i]);
};

// Entry point: load the SVG file named by the first script argument, run the
// four cleaning passes over it and print the result to standard output.
try {
   // Fail early with a clear message when no input file was given
   if (arguments.length < 1)
      throw "Usage: SvgClean.js <file.svg>";
   var doc = loadXMLFile(arguments[0]);
   if (!nsSVG.equals(doc.documentElement.namespaceURI) ||
           !"svg".equals(doc.documentElement.localName))
      throw "Not an SVG document";
   cleanDocument1(doc);
   cleanDocument2(doc);
   cleanDocument3(doc);
   cleanDocument4(doc);
   println(xmlToString(doc));
} catch (e) {
   // All diagnostics go to standard error, like printStackTrace() does, so
   // that they never end up inside redirected XML output.
   if (e.rhinoException)
      e.rhinoException.printStackTrace();
   else if (e.javaException)
      e.javaException.printStackTrace();
   else
      java.lang.System.err.println("" + e);
}

XML pretty-printing and XSLT processing in Rhino JavaScript

One of my favourite utilities on Linux is xmllint, in particular xmllint --format. Ugly, unreadable XML in, nice and pretty XML out. Unfortunately, xmllint is not usually installed on Windows. Oh, I could just download the Windows executable. Or, I could write my own little XML pretty printer in JavaScript (Mozilla Rhino bundled with the JDK). The advantage of this approach is that it can be used on any platform where the JDK is installed, even when you are not allowed to compile or install other software.
// Converts a DOM document to a pretty-printed XML string
xmlToString = function(doc) {
   var impl = org.w3c.dom.bootstrap.DOMImplementationRegistry
         .newInstance()
         .getDOMImplementation("LS");
   var writer = impl.createLSSerializer();
   var sink = new java.io.StringWriter();
   var output = impl.createLSOutput();
   writer.getDomConfig().setParameter("format-pretty-print", true);
   // The serialised text is collected in memory
   output.setCharacterStream(sink);
   writer.write(doc, output);
   sink.close();
   return "" + sink.toString();
};

// Entry point: parse the XML file named by the first script argument and
// print it, pretty-printed, to standard output.
try {
   // Fail early with a clear message when no input file was given
   if (arguments.length < 1)
      throw new Error("Usage: <script> <file.xml>");
   var inp = new java.io.File(arguments[0]);
   var dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
   dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
   dbf.setNamespaceAware(true);
   var db = dbf.newDocumentBuilder();
   var doc = db.parse(inp);
   println(xmlToString(doc));
} catch (e) {
   // Show the underlying Java stack trace, when there is one, then re-throw
   if (e.javaException)
      e.javaException.printStackTrace();
   throw e;
}
Another tool I often use is xsltproc, for applying XSLT stylesheets. Here is a JavaScript equivalent for that:
// Renders a DOM document as pretty-printed XML text
xmlToString = function(doc) {
   var ls = org.w3c.dom.bootstrap.DOMImplementationRegistry.newInstance()
         .getDOMImplementation("LS");

   var text = new java.io.StringWriter();
   var dest = ls.createLSOutput();
   dest.setCharacterStream(text);

   var printer = ls.createLSSerializer();
   var config = printer.getDomConfig();
   config.setParameter("format-pretty-print", true);

   printer.write(doc, dest);
   text.close();
   return "" + text.toString();
};

// Builds a namespace-aware DocumentBuilder with external DTD loading disabled
getDocBuilder = function() {
   var LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
   var builderFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
   builderFactory.setFeature(LOAD_EXTERNAL_DTD, false);
   builderFactory.setNamespaceAware(true);
   var builder = builderFactory.newDocumentBuilder();
   return builder;
};

// Reads the XML file at the given path into a DOM document
loadXMLFile = function(file) {
   var xmlFile = new java.io.File(file);
   var parser = getDocBuilder();
   var parsed = parser.parse(xmlFile);
   return parsed;
};

// Applies an XSLT stylesheet to a document.
//
// doc   - DOM node (normally a Document) to transform
// xslt  - DOM node (normally a Document) holding the stylesheet
// Returns a new DOM document containing the transformation result.
var applyXSLT = function(doc, xslt) {
   var dsDoc = new javax.xml.transform.dom.DOMSource(doc);
   var dsXSLT = new javax.xml.transform.dom.DOMSource(xslt);
   // Hand the documents' own URIs to the transformer as system ids so that
   // relative references (e.g. xsl:import / xsl:include) have a base to
   // resolve against. Skipped when a node carries no document URI.
   if (doc.documentURI)
      dsDoc.setSystemId(doc.documentURI);
   if (xslt.documentURI)
      dsXSLT.setSystemId(xslt.documentURI);
   var tf = javax.xml.transform.TransformerFactory.newInstance();
   var xf = tf.newTransformer(dsXSLT);
   // The result is built into a fresh, empty DOM document
   var out = getDocBuilder().newDocument();
   var drOut = new javax.xml.transform.dom.DOMResult(out);
   xf.transform(dsDoc, drOut);
   return out;
};

// Entry point: apply the stylesheet named by the second script argument to
// the XML file named by the first, and print the result to standard output.
try {
   // Fail early with a clear message when an argument is missing
   if (arguments.length < 2)
      throw new Error("Usage: <script> <input.xml> <stylesheet.xsl>");
   var doc = loadXMLFile(arguments[0]);
   var xslt = loadXMLFile(arguments[1]);
   var out = applyXSLT(doc, xslt);
   println(xmlToString(out));
} catch (e) {
   // Show the underlying Java stack trace, when there is one, then re-throw
   if (e.javaException)
      e.javaException.printStackTrace();
   throw e;
}