webmcp
annotate framework/env/format/string.lua @ 110:0c4841af07a5
String truncating by counting Unicode codepoints in format.string(...)
(grapheme cluster boundary detection not implemented)
(grapheme cluster boundary detection not implemented)
author | jbe |
---|---|
date | Sun Jan 12 03:57:47 2014 +0100 (2014-01-12) |
parents | 9fdfb27f8e67 |
children | 43986d8dacf3 |
rev | line source |
---|---|
jbe/bsw@0 | 1 --[[-- |
jbe@110 | 2 text = -- a string |
jbe/bsw@0 | 3 format.string( |
jbe@110 | 4 value, -- any value where tostring(value) gives a reasonable result |
jbe/bsw@0 | 5 { |
jbe@110 | 6 nil_as = nil_text, -- text to be returned for a nil value |
jbe@110 | 7 truncate_mode = "codepoints", -- perform truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters") |
jbe@110 | 8 -- (currently only "codepoints" are supported and this option may be omitted) |
jbe@110 | 9 truncate_at = truncate_at, -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters) |
jbe@110 | 10 truncate_if = truncate_if, -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters) |
jbe@110 | 11 truncate_suffix = truncate_suffix -- string to append, if string was truncated |
jbe/bsw@0 | 12 } |
jbe/bsw@0 | 13 ) |
jbe/bsw@0 | 14 |
jbe@110 | 15 Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option. Using the 'truncate_*' parameters, it is possible to show only the beginning of a string. |
jbe/bsw@0 | 16 |
jbe/bsw@0 | 17 --]]-- |
jbe/bsw@0 | 18 |
--[[--
text =                       -- str itself, or its truncated prefix followed by the suffix
truncate_codepoints(
  str,                       -- UTF-8 encoded string
  truncate_at_codepoint,     -- number of codepoints to keep when truncating (defaults to truncate_if_codepoint)
  truncate_if_codepoint,     -- truncate only if the string has more codepoints than this (defaults to truncate_at_codepoint)
  suffix                     -- string appended to a truncated result (defaults to "")
)

Truncates a UTF-8 string at a codepoint boundary. A byte that does not start a
well-formed UTF-8 sequence (stray continuation byte, invalid lead byte, or a
lead byte whose continuation bytes are missing) is counted as one codepoint of
its own, so malformed input never causes an error and is never skipped.

The string is returned unchanged unless it has more codepoints than BOTH
limits; a truncated result never cuts through a multi-byte sequence. If neither
limit is given, the string is returned unchanged.

--]]--

function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
  if truncate_at_codepoint == nil then
    -- no limit given at all: nothing to truncate
    return str
  end

  -- true if b is a UTF-8 continuation byte (10xxxxxx)
  local function is_continuation(b)
    return b ~= nil and b >= 128 and b < 192
  end

  local length = #str
  local byte_pos = 1         -- index of the first byte of the next codepoint
  local codepoint_count = 0  -- number of codepoints located before byte_pos
  local truncate_at_byte     -- index of the last byte to keep when truncating

  while byte_pos <= length do
    -- Here, another codepoint starts at byte_pos, i.e. the string has more
    -- than codepoint_count codepoints.
    if truncate_at_byte == nil and codepoint_count >= truncate_at_codepoint then
      truncate_at_byte = byte_pos - 1
    end
    if truncate_at_byte ~= nil and codepoint_count >= truncate_if_codepoint then
      return string.sub(str, 1, truncate_at_byte) .. (suffix or "")
    end

    -- b1..b4 must be local; otherwise they leak into the global environment
    local b1, b2, b3, b4 = string.byte(str, byte_pos, byte_pos + 3)

    -- sequence length announced by the lead byte
    local width
    if b1 < 192 or b1 >= 248 then
      width = 1  -- ASCII, stray continuation byte, or invalid lead byte
    elseif b1 < 224 then
      width = 2  -- 110xxxxx
    elseif b1 < 240 then
      width = 3  -- 1110xxxx
    else
      width = 4  -- 11110xxx
    end

    -- fall back to a single byte if the required continuation bytes are missing
    if
      (width >= 2 and not is_continuation(b2)) or
      (width >= 3 and not is_continuation(b3)) or
      (width >= 4 and not is_continuation(b4))
    then
      width = 1
    end

    byte_pos = byte_pos + width
    codepoint_count = codepoint_count + 1
  end

  return str
end
jbe@110 | 60 |
-- Converts a value to text via tostring(...). A nil value yields
-- options.nil_as (or the empty string). If options.truncate_at or
-- options.truncate_if is set, the text is passed through truncate_codepoints.
function format.string(str, options)
  local opts = options or {}

  -- nil has its own replacement text and is never truncated
  if str == nil then
    return opts.nil_as or ""
  end

  -- without any truncate limit the text is returned in full
  if not (opts.truncate_at or opts.truncate_if) then
    return tostring(str)
  end

  -- TODO: Unicode grapheme cluster boundary detection is not implemented
  -- (Unicode codepoints are used instead)
  return truncate_codepoints(
    tostring(str),
    opts.truncate_at,
    opts.truncate_if,
    opts.truncate_suffix
  )
end