| rev | 
   line source | 
| 
bsw/jbe@1309
 | 
     1 function util.html_to_text(str)
 | 
| 
bsw/jbe@1309
 | 
     2   str = string.gsub(str, "[\0-\32]", " ")
 | 
| 
bsw/jbe@1309
 | 
     3   str = string.gsub(str, "<[Bb][Rr] */?>", "\n")
 | 
| 
bsw/jbe@1309
 | 
     4   str = string.gsub(str, "</?[Pp] *>", "\n\n")
 | 
| 
bsw/jbe@1309
 | 
     5   str = string.gsub(str, "</?[Bb] *>", "**")
 | 
| 
bsw/jbe@1309
 | 
     6   str = string.gsub(str, "</?[Ii] *>", "//")
 | 
| 
bsw/jbe@1309
 | 
     7   str = string.gsub(str, "</?[Ss][Uu][Bb] *>", "__")
 | 
| 
bsw/jbe@1309
 | 
     8   str = string.gsub(str, "</?[Ss][Uu][Pp] *>", "^^")
 | 
| 
bsw/jbe@1309
 | 
     9   str = string.gsub(str, '<[Aa] *[Hh][Rr][Ee][Ff] *= *"([^"]*)" *>', "[[%1 ")
 | 
| 
bsw/jbe@1309
 | 
    10   str = string.gsub(str, "<[Aa] *[Hh][Rr][Ee][Ff] *= *'([^']*)' *>", "[[%1 ")
 | 
| 
bsw/jbe@1309
 | 
    11   str = string.gsub(str, "<[Aa] *[Hh][Rr][Ee][Ff] *= *([^ <>\"']*) *>", "[[%1 ")
 | 
| 
bsw/jbe@1309
 | 
    12   str = string.gsub(str, "</[Aa] *>", "]]")
 | 
| 
bsw/jbe@1309
 | 
    13   str = string.gsub(str, "<[Hh]1 *>", "\n\n###### ")
 | 
| 
bsw/jbe@1309
 | 
    14   str = string.gsub(str, "<[Hh]2 *>", "\n\n##### ")
 | 
| 
bsw/jbe@1309
 | 
    15   str = string.gsub(str, "<[Hh]3 *>", "\n\n#### ")
 | 
| 
bsw/jbe@1309
 | 
    16   str = string.gsub(str, "<[Hh]4 *>", "\n\n### ")
 | 
| 
bsw/jbe@1309
 | 
    17   str = string.gsub(str, "<[Hh]5 *>", "\n\n## ")
 | 
| 
bsw/jbe@1309
 | 
    18   str = string.gsub(str, "<[Hh]6 *>", "\n\n# ")
 | 
| 
bsw/jbe@1309
 | 
    19   str = string.gsub(str, "</[Hh]1 *>", " ######\n\n")
 | 
| 
bsw/jbe@1309
 | 
    20   str = string.gsub(str, "</[Hh]2 *>", " #####\n\n")
 | 
| 
bsw/jbe@1309
 | 
    21   str = string.gsub(str, "</[Hh]3 *>", " ####\n\n")
 | 
| 
bsw/jbe@1309
 | 
    22   str = string.gsub(str, "</[Hh]4 *>", " ###\n\n")
 | 
| 
bsw/jbe@1309
 | 
    23   str = string.gsub(str, "</[Hh]5 *>", " ##\n\n")
 | 
| 
bsw/jbe@1309
 | 
    24   str = string.gsub(str, "</[Hh]6 *>", " #\n\n")
 | 
| 
bsw/jbe@1309
 | 
    25   local li_info = {}
 | 
| 
bsw/jbe@1309
 | 
    26   local pos = 1
 | 
| 
bsw/jbe@1309
 | 
    27   local counters = {}
 | 
| 
bsw/jbe@1309
 | 
    28   while true do
 | 
| 
bsw/jbe@1309
 | 
    29     local list_start, list_stop, list_tagname = string.find(str, "<(/?[OoUu]l) *>", pos)
 | 
| 
bsw/jbe@1309
 | 
    30     if list_tagname then
 | 
| 
bsw/jbe@1309
 | 
    31       list_tagname = string.lower(list_tagname)
 | 
| 
bsw/jbe@1309
 | 
    32     end
 | 
| 
bsw/jbe@1309
 | 
    33     local elem_start, elem_stop = string.find(str, "<[Ll][Ii] *>", pos)
 | 
| 
bsw/jbe@1309
 | 
    34     if list_start and not elem_start then
 | 
| 
bsw/jbe@1309
 | 
    35       pos = list_stop
 | 
| 
bsw/jbe@1309
 | 
    36     elseif elem_start and not list_start then
 | 
| 
bsw/jbe@1309
 | 
    37       pos = elem_stop
 | 
| 
bsw/jbe@1309
 | 
    38     elseif list_start and elem_start then
 | 
| 
bsw/jbe@1309
 | 
    39       if list_start < elem_start then
 | 
| 
bsw/jbe@1309
 | 
    40         pos = list_stop
 | 
| 
bsw/jbe@1309
 | 
    41       else
 | 
| 
bsw/jbe@1309
 | 
    42         pos = elem_stop
 | 
| 
bsw/jbe@1309
 | 
    43         list_tagname = nil
 | 
| 
bsw/jbe@1309
 | 
    44       end
 | 
| 
bsw/jbe@1309
 | 
    45     else
 | 
| 
bsw/jbe@1309
 | 
    46       break
 | 
| 
bsw/jbe@1309
 | 
    47     end
 | 
| 
bsw/jbe@1309
 | 
    48     if list_tagname == "ol" then
 | 
| 
bsw/jbe@1309
 | 
    49       counters[#counters+1] = 0
 | 
| 
bsw/jbe@1309
 | 
    50     elseif list_tagname == "ul" then
 | 
| 
bsw/jbe@1309
 | 
    51       counters[#counters+1] = false
 | 
| 
bsw/jbe@1309
 | 
    52     elseif list_tagname then
 | 
| 
bsw/jbe@1309
 | 
    53       counters[#counters] = nil
 | 
| 
bsw/jbe@1309
 | 
    54     else
 | 
| 
bsw/jbe@1309
 | 
    55       if counters[#counters] then
 | 
| 
bsw/jbe@1309
 | 
    56         counters[#counters] = counters[#counters] + 1
 | 
| 
bsw/jbe@1309
 | 
    57       end
 | 
| 
bsw/jbe@1309
 | 
    58       local string_parts = {}
 | 
| 
bsw/jbe@1309
 | 
    59       for idx, counter in ipairs(counters) do
 | 
| 
bsw/jbe@1309
 | 
    60         if counter then
 | 
| 
bsw/jbe@1309
 | 
    61           string_parts[idx] = tostring(counter) .. ". "
 | 
| 
bsw/jbe@1309
 | 
    62         else
 | 
| 
bsw/jbe@1309
 | 
    63           string_parts[idx] = "* "
 | 
| 
bsw/jbe@1309
 | 
    64         end
 | 
| 
bsw/jbe@1309
 | 
    65       end
 | 
| 
bsw/jbe@1309
 | 
    66       li_info[#li_info+1] = table.concat(string_parts)
 | 
| 
bsw/jbe@1309
 | 
    67     end
 | 
| 
bsw/jbe@1309
 | 
    68   end
 | 
| 
bsw/jbe@1309
 | 
    69   str = string.gsub(str, "</?[OoUu]l *>", "\n\n")
 | 
| 
bsw/jbe@1309
 | 
    70   local li_index = 0
 | 
| 
bsw/jbe@1309
 | 
    71   str = string.gsub(str, "<[Ll][Ii] *>", function()
 | 
| 
bsw/jbe@1309
 | 
    72     li_index = li_index + 1
 | 
| 
bsw/jbe@1309
 | 
    73     return li_info[li_index]
 | 
| 
bsw/jbe@1309
 | 
    74   end)
 | 
| 
bsw/jbe@1309
 | 
    75   str = string.gsub(str, "</[Ll][Ii] *>", "\n")
 | 
| 
bsw/jbe@1309
 | 
    76   str = string.gsub(str, "<[^<>]*>", "")
 | 
| 
bsw/jbe@1309
 | 
    77   str = string.gsub(str, "<", "<")
 | 
| 
bsw/jbe@1309
 | 
    78   str = string.gsub(str, ">", ">")
 | 
| 
bsw/jbe@1309
 | 
    79   str = string.gsub(str, "  +", " ")
 | 
| 
bsw/jbe@1309
 | 
    80   str = string.gsub(str, "%f[^\0\n] ", "")
 | 
| 
bsw/jbe@1309
 | 
    81   str = string.gsub(str, " %f[\0\n]", "")
 | 
| 
bsw/jbe@1309
 | 
    82   str = string.gsub(str, "\n\n\n+", "\n\n")
 | 
| 
bsw/jbe@1309
 | 
    83   str = string.gsub(str, "^\n+", "")
 | 
| 
bsw/jbe@1309
 | 
    84   str = string.gsub(str, "\n*$", "\n")
 | 
| 
bsw/jbe@1309
 | 
    85   return str
 | 
| 
bsw/jbe@1309
 | 
    86 end
 |