# HG changeset patch # User jbe # Date 1389645443 -3600 # Node ID 43986d8dacf3fb25466b78bb9fbbce6be160a794 # Parent 0c4841af07a528d6e1ccc559782e9ed709942c22 Better (and bugfixed) implementation of string truncating in format.string(...) diff -r 0c4841af07a5 -r 43986d8dacf3 framework/env/format/string.lua --- a/framework/env/format/string.lua Sun Jan 12 03:57:47 2014 +0100 +++ b/framework/env/format/string.lua Mon Jan 13 21:37:23 2014 +0100 @@ -1,14 +1,14 @@ --[[-- text = -- a string format.string( - value, -- any value where tostring(value) gives a reasonable result + value, -- any value where tostring(value) gives a reasonable result { - nil_as = nil_text, -- text to be returned for a nil value - truncate_mode = "codepoints", -- performe truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters") - -- (currently only "codepoints" are supported and this option may be omitted) - truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusterst) - truncate_if = truncate_if, -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters) - truncate_suffix = truncate_suffix -- string to append, if string was truncated + nil_as = nil_text, -- text to be returned for a nil value + truncate_mode = "codepoints", -- performe truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters") + -- (currently only "codepoints" are supported and this option may be omitted) + truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters) + truncate_suffix = truncate_suffix, -- string to append, if string was truncated (use boolean true for Unicode ellipsis) + truncate_count_suffix = truncate_count_suffix -- unless explicitly set to false, the total length (including suffix) may not exceed the given length } ) @@ -16,61 +16,96 @@ --]]-- -function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix) - local byte_pos = 0 - local codepoint_pos = 0 - local truncate_at_byte - truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint - truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint - while true do +local function codepoint_count(str) + return #string.gsub(str, '[\128-\255][\128-\191]?[\128-\191]?[\128-\191]?', 'x') +end + +local function codepoint_truncate(str, length) + local byte_pos = 1 + local count = 0 + while count < length do b1, b2, b3, b4 = string.byte(str, byte_pos, byte_pos+3) - if b1 then - if codepoint_pos > truncate_if_codepoint then - return string.sub(str, 1, truncate_at_byte or byte_pos) .. (suffix or "") - end - if codepoint_pos == truncate_at_codepoint then - truncate_at_byte = byte_pos - end - if b1 < 128 then - byte_pos = byte_pos + 1 - elseif b1 >= 192 and b1 < 248 then - if b2 and b2 >= 128 and b2 < 192 then - if b1 < 240 and b3 and b3 >= 128 and b3 < 192 then - if b1 < 224 and b4 and b4 >= 128 and b4 < 192 then - byte_pos = byte_pos + 4 - else - byte_pos = byte_pos + 3 - end - else - byte_pos = byte_pos + 2 - end + if not b2 then + break + end + b3 = b3 or 0 + b4 = b4 or 0 + if b1 >= 128 and b2 >= 128 and b2 <= 191 then + if b3 >= 128 and b3 <= 191 then + if b4 >= 128 and b4 <= 191 then + byte_pos = byte_pos + 4 + count = count + 1 + elseif count + 1 < length and b4 < 128 then + byte_pos = byte_pos + 4 + count = count + 2 else - byte_pos = byte_pos + 1 + byte_pos = byte_pos + 3 + count = count + 1 + end + elseif count + 1 < length and b3 < 128 then + if count + 2 < length and b4 < 128 then + byte_pos = byte_pos + 4 + count = count + 3 + else + byte_pos = byte_pos + 3 + count = count + 2 end else - byte_pos = byte_pos + 1 + byte_pos = byte_pos + 2 + count = count + 1 end - codepoint_pos = codepoint_pos + 1 + elseif count + 1 < length and b2 < 128 then + if count + 2 < length and b3 < 128 then + if count + 3 < length and b4 < 128 then + byte_pos = byte_pos + 4 + count = count + 4 + else + byte_pos = byte_pos + 3 + count = count + 3 + end + else + byte_pos = byte_pos + 2 + count = count + 2 + end else - break + byte_pos = byte_pos + 1 + count = count + 1 end end - return str + return string.sub(str, 1, byte_pos-1) end function format.string(str, options) local options = options or {} if str == nil then return options.nil_as or "" - elseif options.truncate_at or options.truncate_if then + elseif options.truncate_at then + str = tostring(str) -- TODO: Unicode grapheme cluster boundary detetion is not implemented -- (Unicode codepoints are used instead) - return truncate_codepoints( - tostring(str), - options.truncate_at, - options.truncate_if, - options.truncate_suffix - ) + local truncate_suffix = options.truncate_suffix + if truncate_suffix == true then + truncate_suffix = '\226\128\166' + elseif not truncate_suffix then + truncate_suffix = '' + end + if options.truncate_count_suffix ~= false and truncate_suffix then + local suffix_length = codepoint_count(truncate_suffix) + if codepoint_count(str) > options.truncate_at then + return ( + codepoint_truncate(str, options.truncate_at - suffix_length) .. + truncate_suffix + ) + else + return str + end + else + if codepoint_count(str) > options.truncate_at then + return codepoint_truncate(str, options.truncate_at) .. truncate_suffix + else + return str + end + end else return tostring(str) end