Skip to content

Commit 57ae006

Browse files
committed
Updates to string_formatter
- handles utf8 strings when the lpeg module is present
- handles unlisted escapes for raw binary strings
- does not add an extra line at the end of multiline strings
1 parent c85865a commit 57ae006

File tree

1 file changed

+132
-23
lines changed

1 file changed

+132
-23
lines changed

pprint.lua

+132-23
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ pprint.defaults = {
2727
level_width = 80, -- max width per indent level
2828
wrap_string = true, -- wrap string when it's longer than level_width
2929
wrap_array = false, -- wrap every array elements
30+
string_is_utf8 = true, -- treat string as utf8, and count utf8 char when wrapping, if possible
3031
sort_keys = true, -- sort table keys
3132
}
3233

@@ -42,16 +43,62 @@ local ESCAPE_MAP = {
4243
}
4344

4445
-- generic utilities
45-
local function escape(s)
46-
s = s:gsub('([%c\\])', ESCAPE_MAP)
47-
local dq = s:find('"')
48-
local sq = s:find("'")
49-
if dq and sq then
50-
return s:gsub('"', '\\"'), '"'
51-
elseif sq then
52-
return s, '"'
53-
else
54-
return s, "'"
46+
-- Splits `s` into one token per byte.
-- Each token is a table { char = <text to print>, len = <width of that text> }:
--   * bytes that have an ESCAPE_MAP entry print as that entry,
--   * other bytes in the range 0x20..0x7f print as themselves,
--   * every remaining byte prints as a 4-character `\xNN` escape.
-- The returned array also carries `has_double_quote` / `has_single_quote`,
-- set to true when the raw input contains that quote character (left unset
-- otherwise).
local tokenize_string = function(s)
  local tokens = {}
  for index = 1, #s do
    local ch = s:sub(index, index)
    local code = ch:byte()
    local text = ESCAPE_MAP[ch]
    local width
    if text then
      -- a listed escape always wins, whatever the byte value is
      width = #text
    elseif code >= 0x20 and code < 0x80 then
      text, width = ch, #ch
    else
      text, width = string.format('\\x%02x', code), 4
    end
    tokens[index] = { char = text, len = width }

    -- quote flags are based on the raw byte, not on the printed text
    if ch == '"' then
      tokens.has_double_quote = true
    elseif ch == "'" then
      tokens.has_single_quote = true
    end
  end
  return tokens
end
66+
local tokenize_utf8_string = tokenize_string
67+
68+
local has_lpeg, lpeg = pcall(require, 'lpeg')
69+
70+
if has_lpeg then
  -- Token for one well-formed multi-byte UTF-8 sequence: printed verbatim
  -- and counted as a single character for wrapping purposes.
  local function utf8_valid_char(c)
    return { char = c, len = 1 }
  end

  -- Token for a single byte (plain ASCII, a control byte, or a stray byte
  -- that is not part of a well-formed multi-byte sequence).
  -- Uses the same rule as tokenize_string: an ESCAPE_MAP entry wins, bytes
  -- in 0x20..0x7f print as themselves, anything else becomes `\xNN`.
  local function single_byte_char(c)
    local b = c:byte()
    local e = ESCAPE_MAP[c]
    if (b >= 0x20 and b < 0x80) or e then
      local text = e or c
      return { char = text, len = #text }
    else
      return { char = string.format('\\x%02x', b), len = 4 }
    end
  end

  local cont = lpeg.R('\x80\xbf')
  -- Only multi-byte sequences are matched here. ASCII bytes deliberately
  -- fall through to single_byte_char so that ESCAPE_MAP entries inside the
  -- printable range (e.g. the backslash) are still escaped.
  local utf8_multibyte =
    lpeg.R('\xc0\xdf') * cont +
    lpeg.R('\xe0\xef') * cont * cont +
    lpeg.R('\xf0\xf7') * cont * cont * cont

  -- lpeg.Ct gathers every token into one table. Without it, a match that
  -- produces no captures (the empty string) returns the match position (a
  -- number) instead of tokens, and a long string returns one Lua value per
  -- character. lpeg.P(1) accepts any byte, so the pattern always matches
  -- and the result is always a table.
  local utf8_capture = lpeg.Ct(
    ((utf8_multibyte / utf8_valid_char) + (lpeg.P(1) / single_byte_char)) ^ 0
  ) * -1

  -- UTF-8 aware replacement for tokenize_string.
  -- Returns an array of { char = ..., len = ... } tokens plus the boolean
  -- fields `has_double_quote` and `has_single_quote` describing the raw input.
  tokenize_utf8_string = function(s)
    local t = utf8_capture:match(s)
    t.has_double_quote = not not s:find('"')
    t.has_single_quote = not not s:find("'")
    return t
  end
end
57104

@@ -201,9 +248,11 @@ function pprint.pformat(obj, option, printer)
201248
local status = {
202249
indent = '', -- current indent
203250
len = 0, -- current line length
251+
printed_something = false, -- used to remove leading new lines
204252
}
205253

206254
local wrapped_printer = function(s)
255+
status.printed_something = true
207256
printer(last)
208257
last = s
209258
end
@@ -213,6 +262,7 @@ function pprint.pformat(obj, option, printer)
213262
end
214263

215264
local function _n(d)
265+
if not status.printed_something then return end
216266
wrapped_printer('\n')
217267
wrapped_printer(status.indent)
218268
if d then
@@ -269,26 +319,85 @@ function pprint.pformat(obj, option, printer)
269319
end
270320

271321
local function string_formatter(s, force_long_quote)
272-
local s, quote = escape(s)
273-
local quote_len = force_long_quote and 4 or 2
274-
if quote_len + #s + status.len > option.level_width then
322+
local tokens = option.string_is_utf8 and tokenize_utf8_string(s) or tokenize_string(s)
323+
local string_len = 0
324+
local escape_quotes = tokens.has_double_quote and tokens.has_single_quote
325+
for _, token in ipairs(tokens) do
326+
if escape_quotes and token.char == '"' then
327+
string_len = string_len + 2
328+
else
329+
string_len = string_len + token.len
330+
end
331+
end
332+
local quote_len = 2
333+
local long_quote_dashes = 0
334+
local function compute_long_quote_dashes()
335+
local keep_looking = true
336+
while keep_looking do
337+
if s:find('%]' .. string.rep('=', long_quote_dashes) .. '%]') then
338+
long_quote_dashes = long_quote_dashes + 1
339+
else
340+
keep_looking = false
341+
end
342+
end
343+
end
344+
if force_long_quote then
345+
compute_long_quote_dashes()
346+
quote_len = 2 + long_quote_dashes
347+
end
348+
if quote_len + string_len + status.len > option.level_width then
275349
_n()
276350
-- only wrap string when is longer than level_width
277-
if option.wrap_string and #s + quote_len > option.level_width then
351+
if option.wrap_string and string_len + quote_len > option.level_width then
352+
if not force_long_quote then
353+
compute_long_quote_dashes()
354+
quote_len = 2 + long_quote_dashes
355+
end
278356
-- keep the quotes together
279-
_p('[[')
280-
while #s + status.len >= option.level_width do
281-
local seg = option.level_width - status.len
282-
_p(string.sub(s, 1, seg), true)
283-
_n()
284-
s = string.sub(s, seg+1)
357+
local dashes = string.rep('=', long_quote_dashes)
358+
_p('[' .. dashes .. '[', true)
359+
local status_len = status.len
360+
local line_len = 0
361+
local line = ''
362+
for _, token in ipairs(tokens) do
363+
if line_len + token.len + status_len > option.level_width then
364+
_n()
365+
_p(line, true)
366+
line_len = token.len
367+
line = token.char
368+
else
369+
line_len = line_len + token.len
370+
line = line .. token.char
371+
end
285372
end
286-
_p(s) -- print the remaining parts
287-
return ']]'
373+
374+
return line .. ']' .. dashes .. ']'
288375
end
289376
end
290377

291-
return force_long_quote and '[['..s..']]' or quote..s..quote
378+
if tokens.has_double_quote and tokens.has_single_quote and not force_long_quote then
379+
for i, token in ipairs(tokens) do
380+
if token.char == '"' then
381+
tokens[i].char = '\\"'
382+
end
383+
end
384+
end
385+
local flat_table = {}
386+
for _, token in ipairs(tokens) do
387+
table.insert(flat_table, token.char)
388+
end
389+
local concat = table.concat(flat_table)
390+
391+
if force_long_quote then
392+
local dashes = string.rep('=', long_quote_dashes)
393+
return '[' .. dashes .. '[' .. concat .. ']' .. dashes .. ']'
394+
elseif tokens.has_single_quote then
395+
-- use double quote
396+
return '"' .. concat .. '"'
397+
else
398+
-- use single quote
399+
return "'" .. concat .. "'"
400+
end
292401
end
293402

294403
local function table_formatter(t)

0 commit comments

Comments
 (0)