webmcp

changeset 111:43986d8dacf3

Better (and bugfixed) implementation of string truncating in format.string(...)
author jbe
date Mon Jan 13 21:37:23 2014 +0100 (2014-01-13)
parents 0c4841af07a5
children 407633fd0e84
files framework/env/format/string.lua
line diff
     1.1 --- a/framework/env/format/string.lua	Sun Jan 12 03:57:47 2014 +0100
     1.2 +++ b/framework/env/format/string.lua	Mon Jan 13 21:37:23 2014 +0100
     1.3 @@ -1,14 +1,14 @@
     1.4  --[[--
     1.5  text =                                          -- a string
     1.6  format.string(
     1.7 -  value,                               -- any value where tostring(value) gives a reasonable result
     1.8 +  value,                                           -- any value where tostring(value) gives a reasonable result
     1.9    {
    1.10 -    nil_as          = nil_text,        -- text to be returned for a nil value
    1.11 -    truncate_mode   = "codepoints",    -- performe truncating by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters")
    1.12 -                                       -- (currently only "codepoints" are supported and this option may be omitted)
    1.13 -    truncate_at     = truncate_at,     -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusterst)
    1.14 -    truncate_if     = truncate_if,     -- truncate only if length exceeds the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    1.15 -    truncate_suffix = truncate_suffix  -- string to append, if string was truncated
    1.16 +    nil_as                = nil_text,              -- text to be returned for a nil value
    1.17 +    truncate_mode         = "codepoints",          -- perform truncation by counting UTF-8 codepoints ("codepoints") or Unicode grapheme clusters ("graphmeclusters")
    1.18 +                                                   -- (currently only "codepoints" are supported and this option may be omitted)
    1.19 +    truncate_at           = truncate_at,           -- truncate string after the given number of UTF-8 codepoints (or Unicode grapheme clusters)
    1.20 +    truncate_suffix       = truncate_suffix,       -- string to append, if string was truncated (use boolean true for Unicode ellipsis)
    1.21 +    truncate_count_suffix = truncate_count_suffix  -- unless explicitly set to false, the total length (including suffix) may not exceed the given length
    1.22    }
    1.23  )
    1.24  
    1.25 @@ -16,61 +16,96 @@
    1.26  
    1.27  --]]--
    1.28  
    1.29 -function truncate_codepoints(str, truncate_at_codepoint, truncate_if_codepoint, suffix)
    1.30 -  local byte_pos = 0
    1.31 -  local codepoint_pos = 0
    1.32 -  local truncate_at_byte
    1.33 -  truncate_at_codepoint = truncate_at_codepoint or truncate_if_codepoint
    1.34 -  truncate_if_codepoint = truncate_if_codepoint or truncate_at_codepoint
    1.35 -  while true do
    1.36 +local function codepoint_count(str)
    1.37 +  return #string.gsub(str, '[\128-\255][\128-\191]?[\128-\191]?[\128-\191]?', 'x')
    1.38 +end
    1.39 +
    1.40 +local function codepoint_truncate(str, length)
    1.41 +  local byte_pos = 1
    1.42 +  local count = 0
    1.43 +  while count < length do
    1.44      b1, b2, b3, b4 = string.byte(str, byte_pos, byte_pos+3)
    1.45 -    if b1 then
    1.46 -      if codepoint_pos > truncate_if_codepoint then
    1.47 -        return string.sub(str, 1, truncate_at_byte or byte_pos) .. (suffix or "")
    1.48 -      end
    1.49 -      if codepoint_pos == truncate_at_codepoint then
    1.50 -        truncate_at_byte = byte_pos
    1.51 -      end
    1.52 -      if b1 < 128 then
    1.53 -        byte_pos = byte_pos + 1
    1.54 -      elseif b1 >= 192 and b1 < 248 then
    1.55 -        if b2 and b2 >= 128 and b2 < 192 then
    1.56 -          if b1 < 240 and b3 and b3 >= 128 and b3 < 192 then
    1.57 -            if b1 < 224 and b4 and b4 >= 128 and b4 < 192 then
    1.58 -              byte_pos = byte_pos + 4
    1.59 -            else
    1.60 -              byte_pos = byte_pos + 3
    1.61 -            end
    1.62 -          else
    1.63 -            byte_pos = byte_pos + 2
    1.64 -          end
    1.65 +    if not b2 then
    1.66 +      break
    1.67 +    end
    1.68 +    b3 = b3 or 0
    1.69 +    b4 = b4 or 0
    1.70 +    if b1 >= 128 and b2 >= 128 and b2 <= 191 then
    1.71 +      if b3 >= 128 and b3 <= 191 then
    1.72 +        if b4 >= 128 and b4 <= 191 then
    1.73 +          byte_pos = byte_pos + 4
    1.74 +          count = count + 1
    1.75 +        elseif count + 1 < length and b4 < 128 then
    1.76 +          byte_pos = byte_pos + 4
    1.77 +          count = count + 2
    1.78          else
    1.79 -          byte_pos = byte_pos + 1
    1.80 +          byte_pos = byte_pos + 3
    1.81 +          count = count + 1
    1.82 +        end
    1.83 +      elseif count + 1 < length and b3 < 128 then
    1.84 +        if count + 2 < length and b4 < 128 then
    1.85 +          byte_pos = byte_pos + 4
    1.86 +          count = count + 3
    1.87 +        else
    1.88 +          byte_pos = byte_pos + 3
    1.89 +          count = count + 2
    1.90          end
    1.91        else
    1.92 -        byte_pos = byte_pos + 1
    1.93 +        byte_pos = byte_pos + 2
    1.94 +        count = count + 1
    1.95        end
    1.96 -      codepoint_pos = codepoint_pos + 1
    1.97 +    elseif count + 1 < length and b2 < 128 then
    1.98 +      if count + 2 < length and b3 < 128 then
    1.99 +        if count + 3 < length and b4 < 128 then
   1.100 +          byte_pos = byte_pos + 4
   1.101 +          count = count + 4
   1.102 +        else
   1.103 +          byte_pos = byte_pos + 3
   1.104 +          count = count + 3
   1.105 +        end
   1.106 +      else
   1.107 +        byte_pos = byte_pos + 2
   1.108 +        count = count + 2
   1.109 +      end
   1.110      else
   1.111 -      break
   1.112 +      byte_pos = byte_pos + 1
   1.113 +      count = count + 1
   1.114      end
   1.115    end
   1.116 -  return str
   1.117 +  return string.sub(str, 1, byte_pos-1)
   1.118  end
   1.119  
   1.120  function format.string(str, options)
   1.121    local options = options or {}
   1.122    if str == nil then
   1.123      return options.nil_as or ""
   1.124 -  elseif options.truncate_at or options.truncate_if then
   1.125 +  elseif options.truncate_at then
   1.126 +    str = tostring(str)
   1.127      -- TODO: Unicode grapheme cluster boundary detection is not implemented
   1.128      -- (Unicode codepoints are used instead)
   1.129 -    return truncate_codepoints(
   1.130 -      tostring(str),
   1.131 -      options.truncate_at,
   1.132 -      options.truncate_if,
   1.133 -      options.truncate_suffix
   1.134 -    )
   1.135 +    local truncate_suffix = options.truncate_suffix
   1.136 +    if truncate_suffix == true then
   1.137 +      truncate_suffix = '\226\128\166'
   1.138 +    elseif not truncate_suffix then
   1.139 +      truncate_suffix = ''
   1.140 +    end
   1.141 +    if options.truncate_count_suffix ~= false and truncate_suffix then
   1.142 +      local suffix_length = codepoint_count(truncate_suffix)
   1.143 +      if codepoint_count(str) > options.truncate_at then
   1.144 +        return (
   1.145 +          codepoint_truncate(str, options.truncate_at - suffix_length) ..
   1.146 +          truncate_suffix
   1.147 +        )
   1.148 +      else
   1.149 +        return str
   1.150 +      end
   1.151 +    else
   1.152 +      if codepoint_count(str) > options.truncate_at then
   1.153 +        return codepoint_truncate(str, options.truncate_at) .. truncate_suffix
   1.154 +      else
   1.155 +        return str
   1.156 +      end
   1.157 +    end
   1.158    else
   1.159      return tostring(str)
   1.160    end

Impressum / About Us