liquid_feedback_frontend
view env/util/html_is_safe.lua @ 1852:e593570a23c5
More efficient algorithm
| author | bsw | 
|---|---|
| date | Tue Mar 22 10:35:44 2022 +0100 (2022-03-22) | 
| parents | 27d2a7609cc1 | 
| children | 
 line source
     1 function util.html_is_safe(str)
     3   -- All (ASCII) control characters except \t\n\f\r are forbidden:
     4   if string.find(str, "[\0-\8\11\14-\31\127]") then
     5     return false, "Invalid ASCII control character"
     6   end
     8   -- Memorize expected closing tags:
     9   local stack = {}
    11   -- State during parsing:
    12   local para    = false  -- <p> tag open
    13   local bold    = false  -- <b> tag open
    14   local italic  = false  -- <i> tag open
    15   local supsub  = false  -- <sup> or <sub> tag open
    16   local link    = false  -- <a href="..."> tag open
    17   local heading = false  -- <h1-6> tag open
    18   local list    = false  -- <ol> or <ul> (but no corresponding <li>) tag open
    19   local listelm = false  -- <li> tag (but no further <ol> or <ul> tag) open
    20   local pre     = false  -- <pre> tag open
    22   -- Function looped with tail-calls:
    23   local function loop(str)
    25     -- NOTE: We do not allow non-escaped "<" or ">" in attributes,
    26     --       even if HTML5 allows it.
    28     -- Find any "<" or ">" character and determine context, i.e.
    29     -- prefix = text before character, tag = text until closing ">", and rest:
    30     local prefix, tag, rest = string.match(str, "^(.-)([<>][^<>]*>?)(.*)")
    32     -- If no more "<" or ">" characters are found,
    33     -- then return true if all tags have been closed:
    34     if not tag then
    35       if #stack == 0 then
    36         return true
    37       else
    38         return false, "Not all tags have been closed"
    39       end
    40     end
    42     -- Disallow text content (except inter-element white-space) in <ol> or <ul>
    43     -- when outside <li>:
    44     if list and string.find(prefix, "[^\t\n\f\r ]") then
    45       return false, "Text content in list but outside list element"
    46     end
    48     -- Handle (expected) closing tags:
    49     local closed_tagname = string.match(tag, "^</(.-)[\t\n\f\r ]*>$")
    50     if closed_tagname then
    51       closed_tagname = string.lower(closed_tagname)
    52       if closed_tagname ~= stack[#stack] then
    53         return false, "Wrong closing tag"
    54       end
    55       if closed_tagname == "p" then
    56         para = false
    57       elseif closed_tagname == "b" then
    58         bold = false
    59       elseif closed_tagname == "i" then
    60         italic = false
    61       elseif closed_tagname == "sup" or closed_tagname == "sub" then
    62         supsub = false
    63       elseif closed_tagname == "a" then
    64         link = false
    65       elseif string.find(closed_tagname, "^h[1-6]$") then
    66         heading = false
    67       elseif closed_tagname == "ul" or closed_tagname == "ol" then
    68         list = false
    69       elseif closed_tagname == "li" then
    70         listelm = false
    71         list = true
    72       elseif closed_tagname == "pre" then
    73         pre = false
    74       end
    75       stack[#stack] = nil
    76       return loop(rest)
    77     end
    79     -- Allow <br> tag as void tag:
    80     if string.find(tag, "^<[Bb][Rr][\t\n\f\r ]*/?>$") then
    81       return loop(rest)
    82     end
    84     -- Parse opening tag:
    85     local tagname, attrs = string.match(
    86       tag,
    87       "^<([^<>\0-\32]+)[\t\n\f\r ]*([^<>]-)[\t\n\f\r ]*>$"
    88     )
    90     -- Return false if tag could not be parsed:
    91     if not tagname then
    92       return false, "Malformed tag"
    93     end
    95     -- Make tagname lowercase:
    96     tagname = string.lower(tagname)
    98     -- Append closing tag to list of expected closing tags:
    99     stack[#stack+1] = tagname
   101     -- Allow <li> tag in proper context:
   102     if tagname == "li" and attrs == "" then
   103       if not list then
   104         return false, "List element outside list"
   105       end
   106       list = false
   107       listelm = true
   108       return loop(rest)
   109     end
   111     -- If there was no valid <li> tag but <ol> or <ul> is open,
   112     -- then return false:
   113     if list then
   114       return false
   115     end
   117     -- Allow <b>, <i>, <sup>, <sub> unless already open:
   118     if tagname == "b" and attrs == "" then
   119       if bold then
   120         return false, "Bold inside bold tag"
   121       end
   122       bold = true
   123       return loop(rest)
   124     end
   125     if tagname == "i" and attrs == "" then
   126       if italic then
   127         return false, "Italic inside italic tag"
   128       end
   129       italic = true
   130       return loop(rest)
   131     end
   132     if (tagname == "sup" or tagname == "sub") and attrs == "" then
   133       if supsub then
   134         return false, "Super/subscript inside super/subscript tag"
   135       end
   136       supsub = true
   137       return loop(rest)
   138     end
   140     -- Allow <a href="..."> tag unless already open or malformed:
   141     if tagname == "a" then
   142       if link then
   143         return false, "Link inside link"
   144       end
   145       local url = string.match(attrs, '^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*"([^"]*)"$')
   146       if not url then
   147         url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*'([^']*)'$")
   148       end
   149       if not url then
   150         url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*([^\0-\32\"'=<>`]+)$")
   151       end
   152       if not url then
   153        return false, "Forbidden, missing, or malformed attributes in link tag"
   154       end
   155       if not string.find(url, "^[Hh][Tt][Tt][Pp][Ss]?://") then
   156         return false, "Invalid link URL"
   157       end
   158       link = true
   159       return loop(rest)
   160     end
   162     -- Always allow <pre>
   163     if tagname == "pre" then
   164         pre = true
   165         return loop(rest)
   166     end
   168     -- Remaining tags require no open <p>, <b>, <i>, <sup>, <sub>,
   169     -- <a href="...">, or <h1>..</h6> tag:
   170     -- TODO: HTML also requires that no <pre> tag is open, but check not done
   171     -- here due to used WYSIWYG editor
   172     if para or bold or italic or supsub or link or heading then
   173       return false, "Forbidden child tag within paragraph, bold, italic, super/subscript, link, or heading tag"
   174     end
   176     -- Allow <p>:
   177     if tagname == "p" and attrs == "" then
   178       para = true
   179       return loop(rest)
   180     end
   182     -- Allow <h1>..<h6>:
   183     if string.find(tagname, "^h[1-6]$") and attrs == "" then
   184       heading = true
   185       return loop(rest)
   186     end
   188     -- Allow <ul> and <ol>:
   189     if (tagname == "ul" or tagname == "ol") and attrs == "" then
   190       list = true
   191       return loop(rest)
   192     end
   194     -- Disallow all others (including unexpected closing tags):
   195     return false, "Forbidden tag or forbidden attributes"
   197   end
   199   -- Invoke tail-call loop:
   200   return loop(str)
   202 end
