webmcp
diff framework/env/format/string.lua @ 110:0c4841af07a5
String truncating by counting Unicode codepoints in format.string(...)
(grapheme cluster boundary detection not implemented)
author | jbe |
---|---|
date | Sun Jan 12 03:57:47 2014 +0100 (2014-01-12) |
parents | 9fdfb27f8e67 |
children | 43986d8dacf3 |
line diff
--[[--
text =                                 -- a string
format.string(
  value,                               -- any value where tostring(value) gives a reasonable result
  {
    nil_as          = nil_text,        -- text to be returned for a nil value
    truncate_mode   = "codepoints",    -- perform truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphemeclusters")
                                       -- (currently only "codepoints" are supported and this option may be omitted)
    truncate_at     = truncate_at,     -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    truncate_if     = truncate_if,     -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    truncate_suffix = truncate_suffix  -- string to append, if string was truncated
  }
)

Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option. Using the 'truncate_*' parameters, it is possible to show only the beginning of a string.

--]]--

-- Returns the number of bytes (1 to 4) occupied by the UTF-8 sequence that
-- starts at the 1-based byte index 'pos' of 'str' ('pos' must be <= #str).
-- Malformed input never stalls the scan: a stray continuation byte or an
-- invalid lead byte counts as a sequence of length 1, and a lead byte with
-- too few continuation bytes covers only the continuation bytes present.
local function utf8_sequence_length(str, pos)
  local b1, b2, b3, b4 = string.byte(str, pos, pos + 3)
  local expected
  if b1 < 192 or b1 >= 248 then
    return 1      -- ASCII, stray continuation byte, or invalid lead byte
  elseif b1 < 224 then
    expected = 2  -- lead byte 110xxxxx
  elseif b1 < 240 then
    expected = 3  -- lead byte 1110xxxx
  else
    expected = 4  -- lead byte 11110xxx
  end
  local function is_continuation(b)
    return b ~= nil and b >= 128 and b < 192
  end
  local length = 1
  if is_continuation(b2) then
    length = 2
    if expected >= 3 and is_continuation(b3) then
      length = 3
      if expected >= 4 and is_continuation(b4) then
        length = 4
      end
    end
  end
  return length
end

-- Truncates 'str' (assumed to be UTF-8 encoded) on codepoint boundaries.
--   truncate_at_codepoint: number of codepoints to keep when truncating
--                          (defaults to truncate_if_codepoint)
--   truncate_if_codepoint: truncate only if the string contains more than
--                          this many codepoints
--                          (defaults to truncate_at_codepoint)
--   suffix:                string appended to a truncated result
--                          (defaults to "")
-- Returns 'str' unchanged if no limit is given, if it does not exceed
-- 'truncate_if_codepoint', or if it has no more than 'truncate_at_codepoint'
-- codepoints (so the suffix is only appended when something was cut off).
function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
  if truncate_at_codepoint == nil then
    return str  -- neither limit given: nothing to do
  end
  local length = #str
  local byte_pos = 1         -- 1-based index of the first byte of the next codepoint
  local codepoint_count = 0  -- number of codepoints consumed so far
  local truncate_at_byte     -- index of the last byte to keep, once known
  if truncate_at_codepoint <= 0 then
    truncate_at_byte = 0
  end
  while byte_pos <= length do
    -- At this point at least one more codepoint follows, i.e. the string
    -- is longer than 'codepoint_count' codepoints.
    if truncate_at_byte and codepoint_count >= truncate_if_codepoint then
      return string.sub(str, 1, truncate_at_byte) .. (suffix or "")
    end
    byte_pos = byte_pos + utf8_sequence_length(str, byte_pos)
    codepoint_count = codepoint_count + 1
    if codepoint_count == truncate_at_codepoint then
      -- cut after the last byte of this codepoint, never inside it
      truncate_at_byte = byte_pos - 1
    end
  end
  return str
end

function format.string(str, options)
  local options = options or {}
  if str == nil then
    return options.nil_as or ""
  elseif options.truncate_at or options.truncate_if then
    -- TODO: Unicode grapheme cluster boundary detection is not implemented
    -- (Unicode codepoints are used instead)
    return truncate_codepoints(
      tostring(str),
      options.truncate_at,
      options.truncate_if,
      options.truncate_suffix
    )
  else
    return tostring(str)
  end
  -- (the closing "end" of format.string lies beyond the visible diff hunk)