| rev | line source | 
--- Check whether a string is an HTML fragment that only uses a small
-- whitelist of tags in a properly nested way.
--
-- Accepted tags (tag names are matched case-insensitively):
--   * <br> (void tag, optionally written as <br/>)
--   * <b>, <i>, <sup>, <sub> (no attributes, not nested within themselves;
--     <sup> and <sub> share one flag and thus exclude each other as well)
--   * <a href="..."> with exactly one href attribute whose value starts
--     with "http://" or "https://" (not nested within another link)
--   * <p>, <h1>..<h6>, <ul>, <ol> (no attributes; only allowed while no
--     <p>, <b>, <i>, <sup>, <sub>, <a>, or heading tag is open)
--   * <li> (no attributes; only directly inside <ul> or <ol>)
--
-- Any "<" or ">" that is not part of such a tag causes a rejection.
--
-- NOTE: The patterns below contain "\0", which requires Lua 5.2 or later
-- (Lua 5.1 patterns cannot contain embedded zeros).
--
-- @param str  the string to be checked
-- @return true if the fragment is accepted; otherwise false followed by
--         a message describing the first problem that was found
function util.html_is_safe(str)

  -- All (ASCII) control characters except \t\n\f\r are forbidden:
  if string.find(str, "[\0-\8\11\14-\31\127]") then
    return false, "Invalid ASCII control character"
  end

  -- Memorize expected closing tags (lowercase tag names, innermost last):
  local stack = {}

  -- State during parsing:
  local para    = false  -- <p> tag open
  local bold    = false  -- <b> tag open
  local italic  = false  -- <i> tag open
  local supsub  = false  -- <sup> or <sub> tag open
  local link    = false  -- <a href="..."> tag open
  local heading = false  -- <h1-6> tag open
  local list    = false  -- <ol> or <ul> (but no corresponding <li>) tag open

  -- Function looped with tail-calls; "input" is the not yet processed
  -- remainder of the string:
  local function loop(input)

    -- NOTE: We do not allow non-escaped "<" or ">" in attributes,
    --       even if HTML5 allows it.

    -- Find any "<" or ">" character and determine context, i.e.
    -- pre = text before character, tag = text until closing ">", and rest:
    local pre, tag, rest = string.match(input, "^(.-)([<>][^<>]*>?)(.*)")

    -- If nothing matched, all three captures are nil and the whole
    -- remaining input is plain text. Treat it as "pre" so that the list
    -- check below never passes nil to string.find:
    if not tag then
      pre = input
    end

    -- Disallow text content (except inter-element white-space) in <ol> or <ul>
    -- when outside <li>:
    if list and string.find(pre, "[^\t\n\f\r ]") then
      return false, "Text content in list but outside list element"
    end

    -- If no more "<" or ">" characters are found,
    -- then return true if all tags have been closed:
    if not tag then
      if #stack == 0 then
        return true
      else
        return false, "Not all tags have been closed"
      end
    end

    -- Handle (expected) closing tags:
    local closed_tagname = string.match(tag, "^</(.-)[\t\n\f\r ]*>$")
    if closed_tagname then
      closed_tagname = string.lower(closed_tagname)
      -- Only the innermost open tag may be closed (stack[#stack] is nil
      -- for an empty stack, which never equals a string):
      if closed_tagname ~= stack[#stack] then
        return false, "Wrong closing tag"
      end
      if closed_tagname == "p" then
        para = false
      elseif closed_tagname == "b" then
        bold = false
      elseif closed_tagname == "i" then
        italic = false
      elseif closed_tagname == "sup" or closed_tagname == "sub" then
        supsub = false
      elseif closed_tagname == "a" then
        link = false
      elseif string.find(closed_tagname, "^h[1-6]$") then
        heading = false
      elseif closed_tagname == "ul" or closed_tagname == "ol" then
        -- A list is either top-level or nested inside an <li>; in both
        -- cases we are no longer directly inside <ol>/<ul>:
        list = false
      elseif closed_tagname == "li" then
        -- An <li> is only accepted directly inside a list, so closing it
        -- puts us back directly inside <ol>/<ul>:
        list = true
      end
      stack[#stack] = nil
      return loop(rest)
    end

    -- Allow <br> tag as void tag:
    if string.find(tag, "^<[Bb][Rr][\t\n\f\r ]*/?>$") then
      return loop(rest)
    end

    -- Parse opening tag:
    local tagname, attrs = string.match(
      tag,
      "^<([^<>\0-\32]+)[\t\n\f\r ]*([^<>]-)[\t\n\f\r ]*>$"
    )

    -- Return false if tag could not be parsed
    -- (this includes a stray ">" and a "<" without closing ">"):
    if not tagname then
      return false, "Malformed tag"
    end

    -- Make tagname lowercase:
    tagname = string.lower(tagname)

    -- Append closing tag to list of expected closing tags:
    stack[#stack+1] = tagname

    -- Allow <li> tag in proper context:
    if tagname == "li" and attrs == "" then
      if not list then
        return false, "List element outside list"
      end
      list = false
      return loop(rest)
    end

    -- If there was no valid <li> tag but <ol> or <ul> is open,
    -- then return false:
    if list then
      return false, "Forbidden tag in list but outside list element"
    end

    -- Allow <b>, <i>, <sup>, <sub> unless already open:
    if tagname == "b" and attrs == "" then
      if bold then
        return false, "Bold inside bold tag"
      end
      bold = true
      return loop(rest)
    end
    if tagname == "i" and attrs == "" then
      if italic then
        return false, "Italic inside italic tag"
      end
      italic = true
      return loop(rest)
    end
    if (tagname == "sup" or tagname == "sub") and attrs == "" then
      if supsub then
        return false, "Super/subscript inside super/subscript tag"
      end
      supsub = true
      return loop(rest)
    end

    -- Allow <a href="..."> tag unless already open or malformed:
    if tagname == "a" then
      if link then
        return false, "Link inside link"
      end
      -- The href value may be double-quoted, single-quoted, or unquoted;
      -- no other attribute is accepted:
      local url = string.match(attrs, '^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*"([^"]*)"$')
      if not url then
        url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*'([^']*)'$")
      end
      if not url then
        url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*([^\0-\32\"'=<>`]+)$")
      end
      if not url then
        return false, "Forbidden, missing, or malformed attributes in link tag"
      end
      -- Only http:// and https:// URLs are accepted:
      if not string.find(url, "^[Hh][Tt][Tt][Pp][Ss]?://") then
        return false, "Invalid link URL"
      end
      link = true
      return loop(rest)
    end

    -- Remaining tags require no open <p>, <b>, <i>, <sup>, <sub>,
    -- <a href="...">, or <h1>..</h6> tag:
    if para or bold or italic or supsub or link or heading then
      return false, "Forbidden child tag within paragraph, bold, italic, super/subscript, link, or heading tag"
    end

    -- Allow <p>:
    if tagname == "p" and attrs == "" then
      para = true
      return loop(rest)
    end

    -- Allow <h1>..<h6>:
    if string.find(tagname, "^h[1-6]$") and attrs == "" then
      heading = true
      return loop(rest)
    end

    -- Allow <ul> and <ol>:
    if (tagname == "ul" or tagname == "ol") and attrs == "" then
      list = true
      return loop(rest)
    end

    -- Disallow all others (including unexpected closing tags):
    return false, "Forbidden tag or forbidden attributes"

  end

  -- Invoke tail-call loop:
  return loop(str)

end