-- Converts an HTML fragment into a plain-text, wiki-like representation.
--
-- Parameters:
--   str  (string) HTML source text
--
-- Returns:
--   (string) plain text terminated by exactly one newline
--
-- Conversions performed:
--   <br>, <br/>              -> newline
--   <p>, </p>                -> blank line
--   <b>, <i>, <sub>, <sup>   -> "**", "//", "__", "^^" (opening and closing)
--   <a href=URL>text</a>     -> "[[URL text]]"
--   <h1>..<h6>               -> "###### text ######" .. "# text #"
--                               (h1 gets six hash marks, h6 gets one)
--   <ol>/<ul>/<li>           -> one line per item, prefixed with "N. " for
--                               every enclosing ordered list and "* " for
--                               every enclosing unordered list
--   any other tag            -> removed
--   &lt; and &gt;            -> "<" and ">"
-- Afterwards whitespace is normalised (see the end of the function).
function util.html_to_text(str)
  -- Turn every control character and whitespace byte (0..32) into a plain
  -- space, so that all newlines in the result come from the markup only.
  str = string.gsub(str, "[\0-\32]", " ")

  -- Line breaks and paragraphs.
  str = string.gsub(str, "<[Bb][Rr] */?>", "\n")
  str = string.gsub(str, "</?[Pp] *>", "\n\n")

  -- Inline formatting; the same marker is used for opening and closing tags.
  str = string.gsub(str, "</?[Bb] *>", "**")
  str = string.gsub(str, "</?[Ii] *>", "//")
  str = string.gsub(str, "</?[Ss][Uu][Bb] *>", "__")
  str = string.gsub(str, "</?[Ss][Uu][Pp] *>", "^^")

  -- Links: double-quoted, single-quoted and unquoted href values.
  str = string.gsub(str, '<[Aa] *[Hh][Rr][Ee][Ff] *= *"([^"]*)" *>', "[[%1 ")
  str = string.gsub(str, "<[Aa] *[Hh][Rr][Ee][Ff] *= *'([^']*)' *>", "[[%1 ")
  str = string.gsub(str, "<[Aa] *[Hh][Rr][Ee][Ff] *= *([^ <>\"']*) *>", "[[%1 ")
  str = string.gsub(str, "</[Aa] *>", "]]")

  -- Headings: level 1 is rendered with six hash marks, level 6 with one.
  for level = 1, 6 do
    local marks = string.rep("#", 7 - level)
    str = string.gsub(str, "<[Hh]" .. level .. " *>", "\n\n" .. marks .. " ")
    str = string.gsub(str, "</[Hh]" .. level .. " *>", " " .. marks .. "\n\n")
  end

  -- Lists. "counters" is a stack with one entry per currently open list:
  -- a number (last item number used) for <ol>, or false for <ul>.
  local counters = {}
  str = string.gsub(str, "<(/?)([OoUuLl][LlIi]) *>", function(slash, tagname)
    tagname = string.lower(tagname)
    local closing = (slash == "/")
    if tagname == "ol" or tagname == "ul" then
      if closing then
        -- Pop the innermost list; a stray closing tag on an empty stack
        -- only clears index 0 and is therefore harmless.
        counters[#counters] = nil
      elseif tagname == "ol" then
        counters[#counters+1] = 0
      else
        counters[#counters+1] = false
      end
      return "\n\n"
    elseif tagname == "li" then
      if closing then
        return "\n"
      end
      -- Only ordered lists are numbered (false marks an unordered list).
      if counters[#counters] then
        counters[#counters] = counters[#counters] + 1
      end
      -- The item prefix shows the position in every enclosing list.
      local parts = {}
      for idx, counter in ipairs(counters) do
        if counter then
          parts[idx] = tostring(counter) .. ". "
        else
          parts[idx] = "* "
        end
      end
      return table.concat(parts)
    end
    -- The pattern also matches unrelated two-letter names such as "<ll>";
    -- returning nil keeps the original text, which is stripped below.
    return nil
  end)

  -- Remove all remaining tags, then decode the entities for angle brackets.
  -- Decoding must come after tag removal so decoded "<" is not mistaken
  -- for markup.
  str = string.gsub(str, "<[^<>]*>", "")
  str = string.gsub(str, "&lt;", "<")
  str = string.gsub(str, "&gt;", ">")

  -- Whitespace normalisation:
  str = string.gsub(str, " +", " ")            -- collapse runs of spaces
  str = string.gsub(str, "%f[^\0\n] ", "")     -- drop space at start of a line
  str = string.gsub(str, " %f[\0\n]", "")      -- drop space at end of a line
  str = string.gsub(str, "\n\n\n+", "\n\n")    -- at most one blank line in a row
  str = string.gsub(str, "^\n+", "")           -- no leading newlines
  -- Exactly one trailing newline. Stripping and re-appending avoids the
  -- extra empty match that "\n*$" can produce at the end of the subject on
  -- some Lua versions.
  str = string.gsub(str, "\n+$", "") .. "\n"
  return str
end