# HG changeset patch # User jbe # Date 1389495467 -3600 # Node ID 0c4841af07a528d6e1ccc559782e9ed709942c22 # Parent db7ad8e4f78bf3bfe0eaf62611a85a352d10ce2b String truncating by counting Unicode codepoints in format.string(...) (grapheme cluster boundary detection not implemented) diff -r db7ad8e4f78b -r 0c4841af07a5 framework/env/format/string.lua --- a/framework/env/format/string.lua Sun Nov 04 18:51:32 2012 +0100 +++ b/framework/env/format/string.lua Sun Jan 12 03:57:47 2014 +0100 @@ -1,20 +1,76 @@ --[[-- -text = -- a string +text = -- a string format.string( - value, -- any value where tostring(value) gives a reasonable result + value, -- any value where tostring(value) gives a reasonable result { - nil_as = nil_text -- text to be returned for a nil value + nil_as = nil_text, -- text to be returned for a nil value + truncate_mode = "codepoints", -- perform truncation by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphemeclusters") + -- (currently only "codepoints" is supported and this option may be omitted) + truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters) + truncate_if = truncate_if, -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters) + truncate_suffix = truncate_suffix -- string to append, if string was truncated } ) -Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option. +Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option. Using the 'truncate_*' parameters, it is possible to show only the beginning of a string. 
--]]--

-- Tells whether 'byte' is a UTF-8 continuation byte (bit pattern 10xxxxxx).
-- 'byte' may be nil (position past the end of the string), which yields false.
local function is_continuation_byte(byte)
  return byte ~= nil and byte >= 128 and byte < 192
end

-- Returns the number of bytes (1 to 4) occupied by the codepoint that starts
-- at byte position 'pos' (1-based; must lie within 'str').
-- Malformed input is tolerated: a stray continuation byte or an invalid lead
-- byte counts as a codepoint of its own, and a sequence that is cut short
-- consists of its lead byte plus the continuation bytes actually present.
local function codepoint_byte_length(str, pos)
  local lead, c1, c2, c3 = string.byte(str, pos, pos + 3)
  if lead < 192 or lead >= 248 or not is_continuation_byte(c1) then
    return 1  -- ASCII, stray continuation byte, invalid lead, or lone lead
  elseif lead < 224 or not is_continuation_byte(c2) then
    return 2  -- 110xxxxx lead (or longer sequence cut short)
  elseif lead < 240 or not is_continuation_byte(c3) then
    return 3  -- 1110xxxx lead (or 4-byte sequence cut short)
  else
    return 4  -- 11110xxx lead
  end
end

--[[--
text =                    -- the (possibly truncated) string
truncate_codepoints(
  str,                    -- string to be truncated (treated as UTF-8)
  truncate_at_codepoint,  -- number of codepoints to keep when truncating (defaults to truncate_if_codepoint)
  truncate_if_codepoint,  -- truncate only if the string has more codepoints than this (defaults to truncate_at_codepoint)
  suffix                  -- optional string to append, if string was truncated
)

Truncates a string by counting UTF-8 codepoints. If the string contains more than 'truncate_if_codepoint' codepoints, its first 'truncate_at_codepoint' codepoints followed by 'suffix' are returned; otherwise the string is returned unchanged. The cut is always made on a codepoint boundary, so multi-byte sequences are never split. If 'truncate_at_codepoint' is greater than 'truncate_if_codepoint' + 1, the cut is made directly after the first codepoint that exceeded 'truncate_if_codepoint'. If neither limit is given, the string is returned unchanged.

--]]--
function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
  if truncate_at_codepoint == nil then
    -- no limit given at all: nothing to compare against, nothing to truncate
    return str
  end
  local total_bytes = string.len(str)
  local bytes_done = 0       -- number of bytes consumed so far
  local codepoints_done = 0  -- number of codepoints consumed so far
  local cut_at_byte          -- byte length of the first 'truncate_at_codepoint' codepoints
  if truncate_at_codepoint == 0 then
    cut_at_byte = 0          -- keeping zero codepoints means keeping zero bytes
  end
  while bytes_done < total_bytes do
    -- Consume one whole codepoint; 'bytes_done' therefore always rests on a
    -- codepoint boundary and is safe to use as a cut position.
    bytes_done = bytes_done + codepoint_byte_length(str, bytes_done + 1)
    codepoints_done = codepoints_done + 1
    if codepoints_done > truncate_if_codepoint then
      -- The string is too long. Fall back to the current boundary if the
      -- requested cut position has not been reached yet.
      return string.sub(str, 1, cut_at_byte or bytes_done) .. (suffix or "")
    end
    if codepoints_done == truncate_at_codepoint then
      cut_at_byte = bytes_done
    end
  end
  return str
end

function format.string(str, options)
  local options = options or {}
  if str == nil then
    return options.nil_as or ""
  elseif options.truncate_at or options.truncate_if then
    -- TODO: Unicode grapheme cluster boundary detection is not implemented
    -- (Unicode codepoints are used instead)
    return truncate_codepoints(
      tostring(str),
      options.truncate_at,
      options.truncate_if,
      options.truncate_suffix
    )
  else
    return tostring(str)
  end
  -- NOTE(review): the visible patch hunk stops here; the closing "end" of format.string lies outside the visible source