webmcp
changeset 111:43986d8dacf3
Better (and bugfixed) implementation of string truncating in format.string(...)
| author | jbe | 
|---|---|
| date | Mon Jan 13 21:37:23 2014 +0100 (2014-01-13) | 
| parents | 0c4841af07a5 | 
| children | 407633fd0e84 | 
| files | framework/env/format/string.lua | 
   line diff
```
1.1   --- a/framework/env/format/string.lua Sun Jan 12 03:57:47 2014 +0100
1.2   +++ b/framework/env/format/string.lua Mon Jan 13 21:37:23 2014 +0100
1.3   @@ -1,14 +1,14 @@
1.4    --[[--
1.5    text = -- a string
1.6    format.string(
1.7   -  value, -- any value where tostring(value) gives a reasonable result
1.8   +  value, -- any value where tostring(value) gives a reasonable result
1.9      {
1.10  -    nil_as = nil_text, -- text to be returned for a nil value
1.11  -    truncate_mode = "codepoints", -- performe truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters")
1.12  -    -- (currently only "codepoints" are supported and this option may be omitted)
1.13  -    truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusterst)
1.14  -    truncate_if = truncate_if, -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters)
1.15  -    truncate_suffix = truncate_suffix -- string to append, if string was truncated
1.16  +    nil_as = nil_text, -- text to be returned for a nil value
1.17  +    truncate_mode = "codepoints", -- performe truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters")
1.18  +    -- (currently only "codepoints" are supported and this option may be omitted)
1.19  +    truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters)
1.20  +    truncate_suffix = truncate_suffix, -- string to append, if string was truncated (use boolean true for Unicode ellipsis)
1.21  +    truncate_count_suffix = truncate_count_suffix -- unless explicitly set to false, the total length (including suffix) may not exceed the given length
1.22     }
1.23   )
1.24
1.25  @@ -16,61 +16,96 @@
1.26
1.27   --]]--
1.28
1.29  -function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
1.30  -  local byte_pos = 0
1.31  -  local codepoint_pos = 0
1.32  -  local truncate_at_byte
1.33  -  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
1.34  -  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
1.35  -  while true do
1.36  +local function codepoint_count(str)
1.37  +  return #string.gsub(str, '[\128-\255][\128-\191]?[\128-\191]?[\128-\191]?', 'x')
1.38  +end
1.39  +
1.40  +local function codepoint_truncate(str, length)
1.41  +  local byte_pos = 1
1.42  +  local count = 0
1.43  +  while count < length do
1.44       b1, b2, b3, b4 = string.byte(str, byte_pos, byte_pos+3)
1.45  -    if b1 then
1.46  -      if codepoint_pos > truncate_if_codepoint then
1.47  -        return string.sub(str, 1, truncate_at_byte or byte_pos) .. (suffix or "")
1.48  -      end
1.49  -      if codepoint_pos == truncate_at_codepoint then
1.50  -        truncate_at_byte = byte_pos
1.51  -      end
1.52  -      if b1 < 128 then
1.53  -        byte_pos = byte_pos + 1
1.54  -      elseif b1 >= 192 and b1 < 248 then
1.55  -        if b2 and b2 >= 128 and b2 < 192 then
1.56  -          if b1 < 240 and b3 and b3 >= 128 and b3 < 192 then
1.57  -            if b1 < 224 and b4 and b4 >= 128 and b4 < 192 then
1.58  -              byte_pos = byte_pos + 4
1.59  -            else
1.60  -              byte_pos = byte_pos + 3
1.61  -            end
1.62  -          else
1.63  -            byte_pos = byte_pos + 2
1.64  -          end
1.65  +    if not b2 then
1.66  +      break
1.67  +    end
1.68  +    b3 = b3 or 0
1.69  +    b4 = b4 or 0
1.70  +    if b1 >= 128 and b2 >= 128 and b2 <= 191 then
1.71  +      if b3 >= 128 and b3 <= 191 then
1.72  +        if b4 >= 128 and b4 <= 191 then
1.73  +          byte_pos = byte_pos + 4
1.74  +          count = count + 1
1.75  +        elseif count + 1 < length and b4 < 128 then
1.76  +          byte_pos = byte_pos + 4
1.77  +          count = count + 2
1.78           else
1.79  -          byte_pos = byte_pos + 1
1.80  +          byte_pos = byte_pos + 3
1.81  +          count = count + 1
1.82  +        end
1.83  +      elseif count + 1 < length and b3 < 128 then
1.84  +        if count + 2 < length and b4 < 128 then
1.85  +          byte_pos = byte_pos + 4
1.86  +          count = count + 3
1.87  +        else
1.88  +          byte_pos = byte_pos + 3
1.89  +          count = count + 2
1.90           end
1.91         else
1.92  -        byte_pos = byte_pos + 1
1.93  +        byte_pos = byte_pos + 2
1.94  +        count = count + 1
1.95         end
1.96  -      codepoint_pos = codepoint_pos + 1
1.97  +    elseif count + 1 < length and b2 < 128 then
1.98  +      if count + 2 < length and b3 < 128 then
1.99  +        if count + 3 < length and b4 < 128 then
1.100 +          byte_pos = byte_pos + 4
1.101 +          count = count + 4
1.102 +        else
1.103 +          byte_pos = byte_pos + 3
1.104 +          count = count + 3
1.105 +        end
1.106 +      else
1.107 +        byte_pos = byte_pos + 2
1.108 +        count = count + 2
1.109 +      end
1.110      else
1.111 -      break
1.112 +      byte_pos = byte_pos + 1
1.113 +      count = count + 1
1.114      end
1.115    end
1.116 -  return str
1.117 +  return string.sub(str, 1, byte_pos-1)
1.118  end
1.119
1.120  function format.string(str, options)
1.121    local options = options or {}
1.122    if str == nil then
1.123      return options.nil_as or ""
1.124 -  elseif options.truncate_at or options.truncate_if then
1.125 +  elseif options.truncate_at then
1.126 +    str = tostring(str)
1.127      -- TODO: Unicode grapheme cluster boundary detetion is not implemented
1.128      -- (Unicode codepoints are used instead)
1.129 -    return truncate_codepoints(
1.130 -      tostring(str),
1.131 -      options.truncate_at,
1.132 -      options.truncate_if,
1.133 -      options.truncate_suffix
1.134 -    )
1.135 +    local truncate_suffix = options.truncate_suffix
1.136 +    if truncate_suffix == true then
1.137 +      truncate_suffix = '\226\128\166'
1.138 +    elseif not truncate_suffix then
1.139 +      truncate_suffix = ''
1.140 +    end
1.141 +    if options.truncate_count_suffix ~= false and truncate_suffix then
1.142 +      local suffix_length = codepoint_count(truncate_suffix)
1.143 +      if codepoint_count(str) > options.truncate_at then
1.144 +        return (
1.145 +          codepoint_truncate(str, options.truncate_at - suffix_length) ..
1.146 +          truncate_suffix
1.147 +        )
1.148 +      else
1.149 +        return str
1.150 +      end
1.151 +    else
1.152 +      if codepoint_count(str) > options.truncate_at then
1.153 +        return codepoint_truncate(str, options.truncate_at) .. truncate_suffix
1.154 +      else
1.155 +        return str
1.156 +      end
1.157 +    end
1.158    else
1.159      return tostring(str)
1.160    end
```