webmcp

changeset 110:0c4841af07a5

String truncating by counting Unicode codepoints in format.string(...)
(grapheme cluster boundary detection not implemented)
author jbe
date Sun Jan 12 03:57:47 2014 +0100 (2014-01-12)
parents db7ad8e4f78b
children 43986d8dacf3
files framework/env/format/string.lua
line diff
     1.1 --- a/framework/env/format/string.lua	Sun Nov 04 18:51:32 2012 +0100
     1.2 +++ b/framework/env/format/string.lua	Sun Jan 12 03:57:47 2014 +0100
     1.3 @@ -1,20 +1,76 @@
     1.4  --[[--
     1.5 -text =                 -- a string
     1.6 +text =                                          -- a string
     1.7  format.string(
     1.8 -  value,               -- any value where tostring(value) gives a reasonable result
     1.9 +  value,                               -- any value where tostring(value) gives a reasonable result
    1.10    {
    1.11 -    nil_as = nil_text  -- text to be returned for a nil value
    1.12 +    nil_as          = nil_text,        -- text to be returned for a nil value
    1.13 +    truncate_mode   = "codepoints",    -- perform truncation by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters")
    1.14 +                                       -- (currently only "codepoints" are supported and this option may be omitted)
    1.15 +    truncate_at     = truncate_at,     -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    1.16 +    truncate_if     = truncate_if,     -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    1.17 +    truncate_suffix = truncate_suffix  -- string to append, if string was truncated
    1.18    }
    1.19  )
    1.20  
    1.21 -Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option.
    1.22 +Formats a value as a text by calling tostring(...), unless the value is nil, in which case the text returned is chosen by the 'nil_as' option. Using the 'truncate_*' parameters, it is possible to show only the beginning of a string.
    1.23  
    1.24  --]]--
    1.25  
    1.26 +function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
    1.27 +  local byte_pos = 0
    1.28 +  local codepoint_pos = 0
    1.29 +  local truncate_at_byte
    1.30 +  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
    1.31 +  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
    1.32 +  while true do
    1.33 +    b1, b2, b3, b4 = string.byte(str, byte_pos, byte_pos+3)
    1.34 +    if b1 then
    1.35 +      if codepoint_pos > truncate_if_codepoint then
    1.36 +        return string.sub(str, 1, truncate_at_byte or byte_pos) .. (suffix or "")
    1.37 +      end
    1.38 +      if codepoint_pos == truncate_at_codepoint then
    1.39 +        truncate_at_byte = byte_pos
    1.40 +      end
    1.41 +      if b1 < 128 then
    1.42 +        byte_pos = byte_pos + 1
    1.43 +      elseif b1 >= 192 and b1 < 248 then
    1.44 +        if b2 and b2 >= 128 and b2 < 192 then
    1.45 +          if b1 < 240 and b3 and b3 >= 128 and b3 < 192 then
    1.46 +            if b1 < 224 and b4 and b4 >= 128 and b4 < 192 then
    1.47 +              byte_pos = byte_pos + 4
    1.48 +            else
    1.49 +              byte_pos = byte_pos + 3
    1.50 +            end
    1.51 +          else
    1.52 +            byte_pos = byte_pos + 2
    1.53 +          end
    1.54 +        else
    1.55 +          byte_pos = byte_pos + 1
    1.56 +        end
    1.57 +      else
    1.58 +        byte_pos = byte_pos + 1
    1.59 +      end
    1.60 +      codepoint_pos = codepoint_pos + 1
    1.61 +    else
    1.62 +      break
    1.63 +    end
    1.64 +  end
    1.65 +  return str
    1.66 +end
    1.67 +
    1.68  function format.string(str, options)
    1.69    local options = options or {}
    1.70    if str == nil then
    1.71      return options.nil_as or ""
    1.72 +  elseif options.truncate_at or options.truncate_if then
    1.73 +    -- TODO: Unicode grapheme cluster boundary detection is not implemented
    1.74 +    -- (Unicode codepoints are used instead)
    1.75 +    return truncate_codepoints(
    1.76 +      tostring(str),
    1.77 +      options.truncate_at,
    1.78 +      options.truncate_if,
    1.79 +      options.truncate_suffix
    1.80 +    )
    1.81    else
    1.82      return tostring(str)
    1.83    end

Impressum / About Us