-- Checks whether a string is a well-formed fragment of a restricted,
-- "safe" subset of HTML.
--
-- Accepted markup (tag names are matched case-insensitively):
--   * <br> as a void tag (optionally written as <br/>)
--   * <b>, <i>, <sup>, <sub> without attributes; <b> may not be nested in
--     <b>, <i> not in <i>, and <sup>/<sub> not in <sup>/<sub>
--   * <a> with exactly one attribute, href (double-quoted, single-quoted or
--     unquoted), whose value starts with "http://" or "https://"; links may
--     not be nested
--   * <p>, <h1>..<h6>, <ul>, <ol> without attributes, but only while no
--     <p>, <b>, <i>, <sup>, <sub>, <a> or heading tag is open
--   * <li> without attributes, only directly inside <ul> or <ol>; directly
--     inside <ul>/<ol> (outside <li>) only white-space text is allowed
--
-- Every non-void tag must be closed in proper nesting order. ASCII control
-- characters other than \t \n \f \r are rejected, as is any "<" or ">"
-- that is not part of one of the tags above.
--
-- Parameters:
--   str  the HTML fragment to check (string)
--
-- Returns:
--   true                if the fragment is accepted
--   false, message      otherwise, where message is a short English
--                       description of the first problem found
--
-- NOTE: The patterns below contain embedded "\0" bytes, which string
--       patterns only support since Lua 5.2.
function util.html_is_safe(str)

  -- All (ASCII) control characters except \t\n\f\r are forbidden:
  if string.find(str, "[\0-\8\11\14-\31\127]") then
    return false, "Invalid ASCII control character"
  end

  -- Memorize expected closing tags (innermost open tag is last):
  local stack = {}

  -- State during parsing:
  local para    = false  -- <p> tag open
  local bold    = false  -- <b> tag open
  local italic  = false  -- <i> tag open
  local supsub  = false  -- <sup> or <sub> tag open
  local link    = false  -- <a href="..."> tag open
  local heading = false  -- <h1-6> tag open
  local list    = false  -- <ol> or <ul> (but no corresponding <li>) tag open

  -- Function looped with tail-calls; each call consumes the text up to and
  -- including the next tag and continues with the remainder:
  local function loop(str)

    -- NOTE: We do not allow non-escaped "<" or ">" in attributes,
    --       even if HTML5 allows it.

    -- Find any "<" or ">" character and determine context, i.e.
    -- pre = text before character, tag = text until closing ">", and rest:
    local pre, tag, rest = string.match(str, "^(.-)([<>][^<>]*>?)(.*)")

    -- If there is no further "<" or ">", the match fails and all captures
    -- are nil; the whole remaining string is then plain text content:
    if not tag then
      pre = str
    end

    -- Disallow text content (except inter-element white-space) in <ol> or <ul>
    -- when outside <li>:
    if list and string.find(pre, "[^\t\n\f\r ]") then
      return false, "Text content in list but outside list element"
    end

    -- If no more "<" or ">" characters are found,
    -- then return true if all tags have been closed:
    if not tag then
      if #stack == 0 then
        return true
      else
        return false, "Not all tags have been closed"
      end
    end

    -- Handle (expected) closing tags:
    local closed_tagname = string.match(tag, "^</(.-)[\t\n\f\r ]*>$")
    if closed_tagname then
      closed_tagname = string.lower(closed_tagname)
      if closed_tagname ~= stack[#stack] then
        return false, "Wrong closing tag"
      end
      if closed_tagname == "p" then
        para = false
      elseif closed_tagname == "b" then
        bold = false
      elseif closed_tagname == "i" then
        italic = false
      elseif closed_tagname == "sup" or closed_tagname == "sub" then
        supsub = false
      elseif closed_tagname == "a" then
        link = false
      elseif string.find(closed_tagname, "^h[1-6]$") then
        heading = false
      elseif closed_tagname == "ul" or closed_tagname == "ol" then
        -- A list can only have been opened outside of list context or
        -- inside an <li>, so we are not directly inside a list anymore:
        list = false
      elseif closed_tagname == "li" then
        -- Leaving a list element puts us directly inside its list again:
        list = true
      end
      stack[#stack] = nil
      return loop(rest)
    end

    -- Allow <br> tag as void tag:
    -- NOTE: This check precedes the list check below, so <br> is also
    --       accepted directly inside <ul>/<ol> (outside <li>).
    if string.find(tag, "^<[Bb][Rr][\t\n\f\r ]*/?>$") then
      return loop(rest)
    end

    -- Parse opening tag:
    local tagname, attrs = string.match(
      tag,
      "^<([^<>\0-\32]+)[\t\n\f\r ]*([^<>]-)[\t\n\f\r ]*>$"
    )

    -- Return false if tag could not be parsed (this includes a stray ">"
    -- and a "<" that is never terminated by ">"):
    if not tagname then
      return false, "Malformed tag"
    end

    -- Make tagname lowercase:
    tagname = string.lower(tagname)

    -- Append closing tag to list of expected closing tags:
    stack[#stack+1] = tagname

    -- Allow <li> tag in proper context:
    if tagname == "li" and attrs == "" then
      if not list then
        return false, "List element outside list"
      end
      list = false
      return loop(rest)
    end

    -- If there was no valid <li> tag but <ol> or <ul> is open,
    -- then return false:
    if list then
      return false, "Forbidden child tag within list (only list elements allowed)"
    end

    -- Allow <b>, <i>, <sup>, <sub> unless already open:
    if tagname == "b" and attrs == "" then
      if bold then
        return false, "Bold inside bold tag"
      end
      bold = true
      return loop(rest)
    end
    if tagname == "i" and attrs == "" then
      if italic then
        return false, "Italic inside italic tag"
      end
      italic = true
      return loop(rest)
    end
    if (tagname == "sup" or tagname == "sub") and attrs == "" then
      if supsub then
        return false, "Super/subscript inside super/subscript tag"
      end
      supsub = true
      return loop(rest)
    end

    -- Allow <a href="..."> tag unless already open or malformed:
    if tagname == "a" then
      if link then
        return false, "Link inside link"
      end
      -- The href value may be double-quoted, single-quoted, or unquoted;
      -- each pattern is anchored so that no other attribute can be present:
      local url = string.match(attrs, '^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*"([^"]*)"$')
      if not url then
        url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*'([^']*)'$")
      end
      if not url then
        url = string.match(attrs, "^[Hh][Rr][Ee][Ff][\t\n\f\r ]*=[\t\n\f\r ]*([^\0-\32\"'=<>`]+)$")
      end
      if not url then
        return false, "Forbidden, missing, or malformed attributes in link tag"
      end
      -- Only absolute http(s) URLs are accepted:
      if not string.find(url, "^[Hh][Tt][Tt][Pp][Ss]?://") then
        return false, "Invalid link URL"
      end
      link = true
      return loop(rest)
    end

    -- Remaining tags require no open <p>, <b>, <i>, <sup>, <sub>,
    -- <a href="...">, or <h1>..</h6> tag:
    if para or bold or italic or supsub or link or heading then
      return false, "Forbidden child tag within paragraph, bold, italic, super/subscript, link, or heading tag"
    end

    -- Allow <p>:
    if tagname == "p" and attrs == "" then
      para = true
      return loop(rest)
    end

    -- Allow <h1>..<h6>:
    if string.find(tagname, "^h[1-6]$") and attrs == "" then
      heading = true
      return loop(rest)
    end

    -- Allow <ul> and <ol>:
    if (tagname == "ul" or tagname == "ol") and attrs == "" then
      list = true
      return loop(rest)
    end

    -- Disallow all others (including unexpected closing tags):
    return false, "Forbidden tag or forbidden attributes"

  end

  -- Invoke tail-call loop:
  return loop(str)

end