bsw/jbe@1309: function util.html_is_safe(str) bsw/jbe@1309: bsw/jbe@1309: -- All (ASCII) control characters except \t\n\f\r are forbidden: bsw/jbe@1309: if string.find(str, "[\0-\8\11\14-\31\127]") then bsw/jbe@1309: return false, "Invalid ASCII control character" bsw/jbe@1309: end bsw/jbe@1309: bsw/jbe@1309: -- Memorize expected closing tags: bsw/jbe@1309: local stack = {} bsw/jbe@1309: bsw/jbe@1309: -- State during parsing: bsw/jbe@1309: local para = false --
 tag open
bsw/jbe@1309:   local bold    = false  -- <b> tag open
bsw/jbe@1309:   local italic  = false  -- <i> tag open
bsw/jbe@1309:   local supsub  = false  -- <sup> or <sub> tag open
bsw/jbe@1309:   local link    = false  -- <a> tag open
bsw/jbe@1309:   local heading = false  --  or 
 (but no corresponding 
 or 
 tag) open
bsw/jbe@1309: 
bsw/jbe@1309:   -- Function looped with tail-calls:
bsw/jbe@1309:   local function loop(str)
bsw/jbe@1309: 
bsw/jbe@1309:     -- NOTE: We do not allow non-escaped "<" or ">" in attributes,
bsw/jbe@1309:     --       even if HTML5 allows it.
bsw/jbe@1309: 
bsw/jbe@1309:     -- Find any "<" or ">" character and determine context, i.e.
bsw/jbe@1309:     -- pre = text before character, tag = text until closing ">", and rest:
bsw/jbe@1309:     local pre, tag, rest = string.match(str, "^(.-)([<>][^<>]*>?)(.*)")
bsw/jbe@1309: 
bsw/jbe@1309:     -- Disallow text content (except inter-element white-space) in 
 or 
bsw/jbe@1309:     -- when outside 
 tag as void tag:
bsw/jbe@1309:     if string.find(tag, "^<[Bb][Rr][\t\n\f\r ]*/?>$") then
bsw/jbe@1309:       return loop(rest)
bsw/jbe@1309:     end
bsw/jbe@1309: 
bsw/jbe@1309:     -- Parse opening tag:
bsw/jbe@1309:     local tagname, attrs = string.match(
bsw/jbe@1309:       tag,
bsw/jbe@1309:       "^<([^<>\0-\32]+)[\t\n\f\r ]*([^<>]-)[\t\n\f\r ]*>$"
bsw/jbe@1309:     )
bsw/jbe@1309: 
bsw/jbe@1309:     -- Return false if tag could not be parsed:
bsw/jbe@1309:     if not tagname then
bsw/jbe@1309:       return false, "Malformed tag"
bsw/jbe@1309:     end
bsw/jbe@1309: 
bsw/jbe@1309:     -- Make tagname lowercase:
bsw/jbe@1309:     tagname = string.lower(tagname)
bsw/jbe@1309: 
bsw/jbe@1309:     -- Append closing tag to list of expected closing tags:
bsw/jbe@1309:     stack[#stack+1] = tagname
bsw/jbe@1309: 
bsw/jbe@1309:     -- Allow  or 
 is open,
bsw/jbe@1309:     -- then return false:
bsw/jbe@1309:     if list then
bsw/jbe@1309:       return false
bsw/jbe@1309:     end
bsw/jbe@1309: 
bsw/jbe@1309:     -- Allow <b>, <i>, <sup>, <sub> unless already open:
bsw/jbe@1309:     if tagname == "b" and attrs == "" then
bsw/jbe@1309:       if bold then
bsw/jbe@1309:         return false, "Bold inside bold tag"
bsw/jbe@1309:       end
bsw/jbe@1309:       bold = true
bsw/jbe@1309:       return loop(rest)
bsw/jbe@1309:     end
bsw/jbe@1309:     if tagname == "i" and attrs == "" then
bsw/jbe@1309:       if italic then
bsw/jbe@1309:         return false, "Italic inside italic tag"
bsw/jbe@1309:       end
bsw/jbe@1309:       italic = true
bsw/jbe@1309:       return loop(rest)
bsw/jbe@1309:     end
bsw/jbe@1309:     if (tagname == "sup" or tagname == "sub") and attrs == "" then
bsw/jbe@1309:       if supsub then
bsw/jbe@1309:         return false, "Super/subscript inside super/subscript tag"
bsw/jbe@1309:       end
bsw/jbe@1309:       supsub = true
bsw/jbe@1309:       return loop(rest)
bsw/jbe@1309:     end
bsw/jbe@1309: 
bsw/jbe@1309:     -- Allow <a> tag unless already open or malformed:
bsw/jbe@1309:     if tagname == "a" then
bsw/jbe@1309:       if link then
bsw/jbe@1309:         return false, "Link inside link"
bsw/jbe@1309:       end
bsw/jbe@1309:       local url = string.match(attrs, '^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*"([^"]*)"$')
bsw/jbe@1309:       if not url then
bsw/jbe@1309:         url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*'([^']*)'$")
bsw/jbe@1309:       end
bsw/jbe@1309:       if not url then
bsw/jbe@1309:         url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*([^\0-\32\"'=<>`]+)$")
bsw/jbe@1309:       end
bsw/jbe@1309:       if not url then
bsw/jbe@1309:        return false, "Forbidden, missing, or malformed attributes in link tag"
bsw/jbe@1309:       end
bsw/jbe@1309:       if not string.find(url, "^[Hh][Tt][Tt][Pp][Ss]?://") then
bsw/jbe@1309:         return false, "Invalid link URL"
bsw/jbe@1309:       end
bsw/jbe@1309:       link = true
bsw/jbe@1309:       return loop(rest)
bsw/jbe@1309:     end
bsw/jbe@1309: 
bsw/jbe@1309:     -- Remaining tags require no open