Replace XSS filter with sanitize-html

2015-01-06 12:20:48 -05:00 · 2015-01-06 12:20:48 -05:00 · 1c3a669279
parent 8630c5972c
commit 1c3a669279
3 changed files with 61 additions and 253 deletions
--- a/lib/channel/customization.js
+++ b/lib/channel/customization.js
@ -36,9 +36,11 @@ CustomizationModule.prototype.load = function (data) {
    if ("motd" in data) {
        // Fall back to empty strings when either MOTD field is missing
        this.motd = {
            motd: data.motd.motd || "",
            html: data.motd.html || ""
        };
        // Sanitize the loaded MOTD text, then rebuild the HTML form from the
        // sanitized text by turning newlines into <br> tags
        this.motd.motd = XSS.sanitizeHTML(this.motd.motd);
        this.motd.html = this.motd.motd.replace(/\n/g, "<br>");
    }
 };
--- a/lib/xss.js
+++ b/lib/xss.js
@ -1,260 +1,62 @@
-/*
+var sanitizeHTML = require("sanitize-html");
    WARNING
-    This file contains an XSS prevention module I wrote myself.  It has not
+const ALLOWED_TAGS = [
-    been verified by any external agency, and due to the nature of XSS I cannot
+    "button",
-    guarantee that it will filter correctly.  Feel free to send me bug reports
+    "center",
-    and I will do my best to fix them, but use at your own risk.
+    "details",
    "font",
    "h1",
    "h2",
    "img",
    "marquee", // It pains me to do this, but a lot of people use it...
    "section",
    "span",
    "summary"
 ];
-*/
+const ALLOWED_ATTRIBUTES = [
-
+    "id",
-/* Prototype for a basic XML tag parser */
+    "aria-hidden",
-function TagParser(text) {
+    "border",
-    this.text = text;
+    "class",
-    this.i = 0;
+    "color",
-    this.tag = this.parse();
+    "data-dismiss",
-}
+    "data-target",
-
+    "height",
-/* Moves the position marker past any whitespace characters */
+    "role",
 /* Advances the position marker (this.i) until it points at a
    non-whitespace character or reaches the end of this.text */
 TagParser.prototype.skipWhitespace = function () {
    var end = this.text.length;
    for (; this.i < end; this.i++) {
        if (!/\s/.test(this.text[this.i])) {
            break;
        }
    }
 };
 /* Consumes characters from the current position for as long as each one
    matches the given regexp and returns them, with numeric character
    references (&#65; / &#x41;) decoded and control characters
    (\x00-\x1f) removed.  When no regexp is given, /[^\s>]/ is used; i.e.
    reading stops at whitespace or at the end of tag character '>'
 */
 TagParser.prototype.readLiteral = function (regexp) {
    var pattern = regexp === void 0 ? /[^\s>]/ : regexp;
    var begin = this.i;
    while (this.i < this.text.length && this.text[this.i].match(pattern)) {
        this.i++;
    }
    var raw = this.text.substring(begin, this.i);
    // Decimal references are decoded before hexadecimal ones, and control
    // characters are stripped last so decoded ones are removed as well
    var decoded = raw.replace(/&#([0-9]{2,7});?/g, function (whole, digits) {
        return String.fromCharCode(parseInt(digits, 10));
    });
    decoded = decoded.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (whole, digits) {
        return String.fromCharCode(parseInt(digits, 16));
    });
    return decoded.replace(/[\x00-\x1f]/g, "");
 };
 /* If the character at the current position is a quote (", ' or `), read
    a string.  Otherwise, read a literal matching the given regexp (see
    readLiteral for the default).

    When the position is already at or past the end of the text (e.g. the
    input ends right after an '=', as in '<a href='), there is no character
    to inspect.  In that case fall through to readLiteral, whose loop does
    not run at the end of the text, instead of throwing a TypeError.
 */
 TagParser.prototype.readLiteralOrString = function (regexp) {
    var current = this.text[this.i];
    if (current !== void 0 && current.match(/["'`]/)) {
        return this.readString();
    }
    return this.readLiteral(regexp);
 };
 /* Reads a quoted string.  The character at the current position is
    taken as the delimiter (for XML tags, " or '), and everything up to
    the next occurrence of it is collected.  A backslash directly followed
    by the delimiter yields a literal delimiter character instead of
    ending the string.  The result has numeric character references
    decoded and control characters (\x00-\x1f) removed.
 */
 TagParser.prototype.readString = function () {
    var source = this.text;
    var quote = source[this.i];
    this.i += 1;
    var pieces = [];
    while (this.i < source.length) {
        var ch = source[this.i];
        if (ch === quote) {
            break;
        }
        if (ch === "\\" && source[this.i + 1] === quote) {
            // Escaped delimiter: skip the backslash, keep the delimiter
            this.i += 1;
            ch = quote;
        }
        pieces.push(ch);
        this.i += 1;
    }
    // Step over the closing delimiter
    this.i += 1;
    var decoded = pieces.join("").replace(/&#([0-9]{2,7});?/g, function (whole, digits) {
        return String.fromCharCode(parseInt(digits, 10));
    });
    decoded = decoded.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (whole, digits) {
        return String.fromCharCode(parseInt(digits, 16));
    });
    return decoded.replace(/[\x00-\x1f]/g, "");
 };
 /* Parses the tag name and attributes of the first XML tag in this.text.
    Only the part between '<' and '>' is examined; whatever lies between
    a tag and its end tag is not parsed here.

    Returns null when the text contains no '<'.  Otherwise returns an
    object with the tag name, a map of attribute names to values
    (attributes without a value map to ""), and the text consumed up to
    and including the closing '>' (used by callers for replacement).
 */
 TagParser.prototype.parse = function () {
    var start = this.text.indexOf("<");
    this.i = start;
    if (start === -1) {
        // Not a tag
        return null;
    }
    this.i += 1;
    this.skipWhitespace();
    // The tag name is the first non-whitespace string after the '<'
    var name = this.readLiteral();
    var attributes = {};
    // Read attributes until the end of the text or the end of the tag
    for (;;) {
        if (this.i >= this.text.length || this.text[this.i] === ">") {
            break;
        }
        // An attribute name is any string without '=', possibly quoted
        var attrName = this.readLiteralOrString(/[^\s=>]/);
        var named = attrName.trim().length > 0;
        this.skipWhitespace();
        if (this.text[this.i] === "=") {
            this.i += 1;
            // Whitespace after the '=' is deliberately not skipped
            var attrValue = this.readLiteralOrString();
            if (named) {
                attributes[attrName] = attrValue;
            }
            this.skipWhitespace();
        } else if (named) {
            // Attributes may appear without '=' and a value
            attributes[attrName] = "";
        }
    }
    // Consume the closing '>' unless the text ended first
    if (this.i < this.text.length) {
        this.i += 1;
    }
    return {
        tagName: name,
        attributes: attributes,
        text: this.text.substring(0, this.i)
    };
 };
 /* Some of these may not even be HTML tags, I borrowed them from the
   [now deprecated] XSS module of node-validator
 */
 const badTags = new RegExp([
    "alert",
    "applet",
    "audio",
    "basefont",
    "base",
    "behavior",
    "bgsound",
    "blink",
    "body",
    "embed",
    "expression",
    "form",
    "frameset",
    "frame",
    "head",
    "html",
    "ilayer",
    "iframe",
    "input",
    "layer",
    "link",
    "meta",
    "object",
    "style",
    "script",
    "textarea",
    "title",
-    "video",
+    "valign",
-    "xml",
+    "width"
-    "xss"
+];
 ].join("|"), "i");
-/* Nasty attributes.  Anything starting with "on" is probably a javascript
+var ATTRIBUTE_MAP = {
-   callback, and I hope you see why formaction is a bad idea.
+    a: ["href", "name", "target"],
-*/
+    font: ["size"],
-const badAttrs = new RegExp([
+    img: ["src"],
-    "\\bon\\S*",
+    marquee: ["behavior", "behaviour", "direction", "scrollamount"],
-    "\\bformaction",
+    table: ["cellpadding", "cellspacing"],
-    "\\baction"
+    th: ["colspan", "rowspan"],
-].join("|"), "i");
+    td: ["colspan", "rowspan"]
 /* Sanitizes every tag in an HTML string and returns the result.

    Each '<' is treated as the start of a tag and parsed with TagParser:
      - a tag whose name (ignoring '/') matches badTags is replaced with
        the text "[tag removed]";
      - otherwise the tag is rebuilt from its name and attributes, where
        attribute names are reduced to word characters, attributes
        matching badAttrs are dropped, and values containing
        "javascript:" (ignoring whitespace and letter case) become
        "[removed]".

    A string without any '<' is returned unchanged.
 */
 function sanitizeHTML(str) {
    var i = str.indexOf("<");
    // Loop across all tag delimiters '<' in string, parse each one,
    // and replace the results with sanitized tags
    while (i !== -1) {
        var t = new TagParser(str.substring(i)).tag;
        // t.text is the raw text of the tag and starts exactly at index i,
        // so the tag is replaced by position.  String.prototype.replace
        // would instead hit the first identical text anywhere in the string
        // (e.g. inside an earlier attribute value, leaving this tag
        // unsanitized) and would expand "$&"-style replacement patterns.
        var head = str.substring(0, i);
        var tail = str.substring(i + t.text.length);
        if (t.tagName.replace("/", "").match(badTags)) {
            // Note: Important that the tag is replaced with a nonempty value,
            // otherwise <scr<script>ipt> would possibly defeat the filter.
            str = head + "[tag removed]" + tail;
            i = str.indexOf("<", i + 1);
            continue;
        }
        for (var k in t.attributes) {
            // Keys should not contain non-word characters.
            var k2 = k.replace(/[^\w]/g, "");
            if (k2 !== k) {
                t.attributes[k2] = t.attributes[k];
                delete t.attributes[k];
                k = k2;
            }
            // If it's an evil attribute, just nuke it entirely
            if (k.match(badAttrs)) {
                delete t.attributes[k];
            } else {
                // URL schemes are case-insensitive, so compare in lowercase
                var compact = t.attributes[k].replace(/\s/g, "").toLowerCase();
                if (compact.indexOf("javascript:") !== -1) {
                    t.attributes[k] = "[removed]";
                }
            }
        }
        // Build the sanitized tag
        var fmt = "<" + t.tagName;
        for (var k in t.attributes) {
            if (k.trim().length > 0) {
                fmt += " " + k;
                if (t.attributes[k].trim().length > 0) {
                    // Pick a delimiter that the value does not contain.
                    // NOTE(review): /[^\\]"/ needs a character before the
                    // quote, so a value whose only '"' is its first
                    // character is still wrapped in '"' -- confirm whether
                    // attribute values should be escaped instead.
                    var delim = '"';
                    if (t.attributes[k].match(/[^\\]"/)) {
                        delim = "'";
                        if (t.attributes[k].match(/[^\\]'/)) {
                            delim = "`";
                        }
                    }
                    fmt += "=" + delim + t.attributes[k] + delim;
                }
            }
        }
        str = head + fmt + ">" + tail;
        i = str.indexOf("<", i + fmt.length + 1);
    }
    return str;
 }
-/* WIP: Sanitize a string where HTML is prohibited */
+for (var key in ATTRIBUTE_MAP) {
    // Append every attribute in ALLOWED_ATTRIBUTES to the list already
    // stored in ATTRIBUTE_MAP under this key
    ALLOWED_ATTRIBUTES.forEach(function (attr) {
        ATTRIBUTE_MAP[key].push(attr);
    });
 }
 /* Every allowed tag (sanitizeHTML.defaults.allowedTags followed by
    ALLOWED_TAGS) that has no entry of its own in ATTRIBUTE_MAP gets the
    shared ALLOWED_ATTRIBUTES list */
 sanitizeHTML.defaults.allowedTags.concat(ALLOWED_TAGS).forEach(function (tag) {
    if (tag in ATTRIBUTE_MAP) {
        return;
    }
    ATTRIBUTE_MAP[tag] = ALLOWED_ATTRIBUTES;
 });
 /* Options object handed to sanitizeHTML as its second argument */
 const SETTINGS = {
    // sanitizeHTML.defaults.allowedTags followed by ALLOWED_TAGS
    allowedTags: sanitizeHTML.defaults.allowedTags.concat(ALLOWED_TAGS),
    // Allowed attribute names, keyed by tag name
    allowedAttributes: ATTRIBUTE_MAP
 };
 function sanitizeText(str) {
    str = str.replace(/&/g, "&amp;")
             .replace(/</g, "&lt;")
@ -280,6 +82,9 @@ function decodeText(str) {
    return str;
 }
-module.exports.sanitizeHTML = sanitizeHTML;
+module.exports.sanitizeHTML = function (html) {
    // Delegate to sanitizeHTML, always passing SETTINGS as the options
    return sanitizeHTML(html, SETTINGS);
 };
 // sanitizeText and decodeText are exported as they are
 module.exports.sanitizeText = sanitizeText;
 module.exports.decodeText = decodeText;
--- a/package.json
+++ b/package.json
@ -22,6 +22,7 @@
    "nodemailer": "^1.2.0",
    "oauth": "^0.9.12",
    "q": "^1.0.1",
    "sanitize-html": "^1.4.3",
    "serve-static": "^1.5.3",
    "socket.io": "^1.2.1",
    "yamljs": "^0.1.5"