Sanitizing is the process of removing any attributes, properties, and values that are not included in an allowlist or that are explicitly forbidden on a disallow list.
For example, if the rendered HTML from our HTML string looked like this:
<p><img src="x" onerror="alert('XSS Attack')"></p>
<p><a href="javascript:alert('Another XSS Attack')">View My Profile</a></p>
The sanitized version might look like this:
<p><img src="x"></p>
<p><a>View My Profile</a></p>
The parseFromString() method of a DOMParser() instance converts an HTML string into a real HTML document without rendering it in the actual DOM. As a result, any malicious code is not executed (and won’t be until those HTML elements are injected into the UI).
/*!
 * Sanitize an HTML string
 * (c) 2021 Chris Ferdinandi, MIT License, https://gomakethings.com
 * @param  {String}          str   The HTML string to sanitize
 * @param  {Boolean}         nodes If true, returns HTML nodes instead of a string
 * @return {String|NodeList}       The sanitized string or nodes
 */
function cleanHTML (str, nodes) {

	/**
	 * Convert the string to an HTML document
	 * @return {Node} The parsed document's <body>, or a new empty <body> if the parser produced none
	 */
	function stringToHTML () {
		const parser = new DOMParser();
		const doc = parser.parseFromString(str, 'text/html');
		return doc.body || document.createElement('body');
	}

	/**
	 * Remove <script> elements
	 * @param {Node} html The HTML
	 */
	function removeScripts (html) {
		const scripts = html.querySelectorAll('script');
		for (const script of scripts) {
			script.remove();
		}
	}

	/**
	 * Check if the attribute is potentially dangerous
	 *
	 * An attribute is flagged when its name starts with "on" (inline event
	 * handlers), or when it is one of src/href/xlink:href and its value contains
	 * "javascript:" or "data:" after whitespace is stripped and the value is
	 * lowercased.
	 *
	 * NOTE(review): only src, href and xlink:href are URL-checked; other
	 * URL-bearing attributes (e.g. action, formaction) pass through untouched.
	 * Confirm whether that is intended.
	 *
	 * @param  {String}  name  The attribute name
	 * @param  {String}  value The attribute value
	 * @return {Boolean}       If true, the attribute is potentially dangerous
	 */
	function isPossiblyDangerous (name, value) {
		if (name.startsWith('on')) return true;
		if (!['src', 'href', 'xlink:href'].includes(name)) return false;

		// Strip whitespace and normalise case so values such as
		// " JaVa\tScript:..." cannot slip past the check
		const val = value.replace(/\s+/g, '').toLowerCase();
		return val.includes('javascript:') || val.includes('data:');
	}

	/**
	 * Remove potentially dangerous attributes from an element
	 * @param {Node} elem The element
	 */
	function removeAttributes (elem) {
		// elem.attributes is a live collection: removing an attribute while
		// iterating it shifts the next one into the current slot, and that
		// attribute would be skipped. Iterate over a snapshot instead.
		const atts = Array.from(elem.attributes);
		for (const {name, value} of atts) {
			if (isPossiblyDangerous(name, value)) {
				elem.removeAttribute(name);
			}
		}
	}

	/**
	 * Remove dangerous stuff from the HTML document's nodes
	 * Walks every descendant element recursively.
	 * @param {Node} html The HTML document
	 */
	function clean (html) {
		for (const child of html.children) {
			removeAttributes(child);
			clean(child);
		}
	}

	// Convert the string to HTML
	const html = stringToHTML();

	// Sanitize it
	removeScripts(html);
	clean(html);

	// If the user wants HTML nodes back, return them
	// Otherwise, pass a sanitized string back
	return nodes ? html.childNodes : html.innerHTML;

}