[BACK]Return to eurobsdcon2016-utf8.roff CVS log [TXT][DIR] Up to [local] / www / papers

File: [local] / www / papers / eurobsdcon2016-utf8.roff (download)

Revision 1.1, Mon Sep 26 08:36:41 2016 UTC (7 years, 8 months ago) by schwarze
Branch: MAIN
CVS Tags: HEAD

my Beograd UTF-8 talk

.\"
.\" The following Copyright notice, license and disclaimer
.\" applies to all files of this presentation except the images
.\" and is not repeated in each individual file.
.\"
.\" Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
.\"
.\" Permission to use, copy, modify, and distribute this presentation for any
.\" purpose with or without fee is hereby granted, provided that the above
.\" copyright notice and this permission notice appear in all copies.
.\"
.\" THE PRESENTATION IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
.\" WARRANTIES WITH REGARD TO THIS PRESENTATION INCLUDING ALL IMPLIED
.\" WARRANTIES OF MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL THE
.\" AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
.\" DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
.\" OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
.\" TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
.\" PERFORMANCE OF THIS PRESENTATION.
.\"
.\" --------------------------------------------------------------------
.\"
.\" These slides use the mm and gpresent groff macros.
.\" For example, on OpenBSD, install these ports:
.\" groff, gpresent, ghostscript.
.\" Then run:
.\" groff -P-p7.74i,11.5i -mm -mpresent \
.\"   eurobsdcon2016-utf8.roff > eurobsdcon2016-utf8.ps
.\" ps2pdf eurobsdcon2016-utf8.ps
.\"
.\" --------------------------------------------------------------------
.\"
.\" --- global mm configuration settings -------------------------------
.\" Pi is the mm number register for the paragraph indentation.
.nr Pi 3
.\" NOTE(review): presumably sets the page margin to zero -- confirm
.\" against the definition of MARGIN in the macro packages in use.
.MARGIN 0i
.\" --- global gpresent configuration settings -------------------------
.\" Two named colours: Kea1 is used for titles, Kea2 for subtitles
.\" and for the "next slide" teaser in the page footer.
.\" NOTE(review): the three numbers look like RGB components in the
.\" range 0..1 -- confirm in the gpresent documentation.
.DEFCOLOR Kea1 0 0.8 0.48
.DEFCOLOR Kea2 0 0.5 0.3
.TITLECOLOR Kea1
.SUBTITLEFORMAT C
.SUBTITLECOLOR Kea2
.FOOTERSIZE 2
.\" We don't want a header line for the title page,
.\" so we have to start it before setting up headers.
.SUBTITLE "Why and how you ought to"
.TITLE "Keep multibyte character support simple"
.\" === gpresent header setup ==========================================
.\" --- define gpresent extension registers ----------------------------
.\" gpe_page_tot: running page number across the whole talk.
.\" gpe_page_sec: page number within the current section; it is
.\" reset by GPE_SECTION and printed in upper case roman numerals.
.nr gpe_page_tot 1
.nr gpe_page_sec 0
.af gpe_page_sec I
.\" gpe_time_tsec: scheduled time of day in seconds, advanced by
.\" GPE_TIME.  roff evaluates numeric expressions strictly from left
.\" to right without operator precedence, so the expression below
.\" means ((14*60)+3)*60 = 50580 seconds, that is 14:03:00, matching
.\" the hour/min/sec registers that follow.  Do not rewrite it as
.\" 14*3600+3*60, which roff would evaluate as (14*3600+3)*60.
.nr gpe_time_tsec 14*60+3*60
.\" Hour, minute and second derived from gpe_time_tsec; minutes and
.\" seconds are printed with two digits (format 02).
.nr gpe_time_hour 14
.nr gpe_time_min 03
.af gpe_time_min 02
.nr gpe_time_sec 0
.af gpe_time_sec 02
.
.\" --- macro to start a new section -----------------------------------
.\" arg: section title, remembered in the string gpe_title_sec
.\" Starting a section also restarts the per-section page counter.
.de GPE_SECTION
.nr gpe_page_sec 0
.ds gpe_title_sec \\$1
..
.\" --- macro to prepare a new page ------------------------------------
.\" arg: teaser text for the upcoming slide; it is stored in the string
.\" gpe_next, which the FOOTER macro prints at the bottom right.
.\" The string is set before .SK is called.  NOTE(review): .SK
.\" presumably ends the current page and thereby triggers FOOTER,
.\" which is why the order matters -- confirm against the mm and
.\" gpresent page break handling.
.de GPE_NEXT
.ds gpe_next \\$1
.SK
..
.\" --- gpresent page header callback ----------------------------------
.\" Takes no arguments.  NOTE(review): gpresent is expected to call
.\" this at the top of each page -- confirm in the gpresent macros.
.\" The macro
.\"  - increments the total and the per-section page counters,
.\"  - builds the string gpe_middle as "page N: SECTION n", where n
.\"    is the per-section counter in its roman numeral format,
.\"  - prints a three-part title line: author and talk title on the
.\"    left, gpe_middle in the center (moved right by 9m), and the
.\"    venue and date on the right,
.\"  - draws a rule across the full line length and moves back to
.\"    the start of that rule.
.\" Doubled backslashes delay the interpolation of registers and
.\" strings until the macro is executed.
.de HEADER
.nr gpe_page_tot +1
.nr gpe_page_sec +1
.sp 0.5v
.ds gpe_middle page \\n[gpe_page_tot]: \\*[gpe_title_sec] \\n[gpe_page_sec]
.tl 'Ingo Schwarze: Keep multibyte character support simple'\
\h'9m'\\*[gpe_middle]'\
Beograd, September 25, 2016'
.sp -0.5v
.\" horizontal line below the page header
\l'\\n(.lu'\h'-\\n(.lu'
.br
..
.\" --- initialize the first section before completing the title page --
.GPE_SECTION INTRO
.\" === define some gpresent extension macros ==========================
.\" --- two-column mode (for images) -----------------------------------
.\" 1st arg: width of first column
.\" 2nd arg: move second column up by this amount (default 0.5v)
.\" switch column with normal .MULN, end with normal .MULE
.\" The vertical shift defaults to 0.5v and is overridden by the
.\" second argument when one is given.  The second column gets
.\" whatever remains of the line length after the first column
.\" and a gutter of 1n.
.de GPE_MULB
.ds gpe_vsp 0.5v
.if \\n[.$]>1 .ds gpe_vsp \\$2
.nr gpe_colwr \\n[.l]-\\$1-1n
.sp -\\*[gpe_vsp]
.MULB \\$1 1n \\n[gpe_colwr]u
.sp \\*[gpe_vsp]
..
.\" --- emphasis -------------------------------------------------------
.\" arg: text
.\" Switches the colour to red, emits the argument as a text line,
.\" and then calls ".COLOR P".  NOTE(review): "P" presumably selects
.\" the previous colour -- confirm in the gpresent macros.
.de GPE_EM
.COLOR red
\\$1
.COLOR P
..
.\" --- small text -----------------------------------------------------
.\" arg: text
.\" Prints the argument as one centred line, four points smaller than
.\" the current type size; ".S P" then returns to the previous size.
.\" The .ce request only centres the next text line, so it has to
.\" stay directly in front of the argument.
.de GPE_SM
.S -4
.ce
\\$1
.S P
..
.\" --- small text with one emphasised word ----------------------------
.\" 1st arg: text before emphasis
.\" 2nd arg: emphasised text
.\" 3rd arg: text after emphasis
.\" Joins the three arguments with single spaces, wraps the second one
.\" in \m[red] ... \m[] colour escapes, and hands the result to GPE_SM
.\" as a single quoted argument.
.de GPE_SMEM
.GPE_SM "\\$1 \m[red]\\$2\m[] \\$3"
..
.\" --- title page -----------------------------------------------------
.\" The main title line has already been printed.
.SUBTITLE "EuroBSDCon, Beograd, September 25, 2016"
.SUBTITLE "Ingo Schwarze <schwarze@openbsd.org>"
.MULB 6i 0i 4i
.PSPIC Images/CanyonCampground.eps
.GPE_SM "Canyon Campground below Lower Kananaskis Lake"
.GPE_SM "and the Opal Range (2900-3000m)"
.MULN
.PSPIC Images/KananaskisWelcome.eps
.GPE_SM "Alberta Highway 66 near"
.GPE_SM "Bragg Creek, Elbow Valley"
.MULE
.\" === gpresent footer setup ==========================================
.\" We don't want a footer line for the title page,
.\" so we have to set it up after completing the title page.
.SK
.\" --- macros to start a new page -------------------------------------
.\" arg: time for this page in seconds
.\" Advances the running clock gpe_time_tsec by the given number of
.\" seconds and recomputes the hour, minute and second registers
.\" from it.  The parentheses only spell out the left-to-right
.\" evaluation order that roff applies anyway.
.de GPE_TIME
.nr gpe_time_tsec +\\$1
.nr gpe_time_sec (\\n[gpe_time_tsec]%60)
.nr gpe_time_min ((\\n[gpe_time_tsec]%3600)/60)
.nr gpe_time_hour (\\n[gpe_time_tsec]/3600)
..
.\" --- gpresent page footer callback ----------------------------------
.\" Takes no arguments.  NOTE(review): gpresent is expected to call
.\" this at the bottom of each page -- confirm in the gpresent macros.
.\" The macro switches to 18 point type with 20 point line spacing,
.\" moves up two lines, draws a rule across the full line length,
.\" and prints a three-part title line:
.\"  - left: the gpe_time_* registers as hour:minute:second in
.\"    smaller type, with the seconds smaller still; the \s-6, \s-2
.\"    and \s+8 size changes add up to zero,
.\"  - center: empty,
.\"  - right: the teaser string gpe_next, two unpaddable spaces and
.\"    an arrow, all in the colour Kea2.
.\" The final .ps and .vs without arguments restore the previous
.\" type size and line spacing.
.de FOOTER
.ps 18
.vs 20
.sp -2v
\l'\\n(.lu'\h'-\\n(.lu'
.br
.tl '\s-6\\n[gpe_time_hour]:\\n[gpe_time_min]:\s-2\\n[gpe_time_sec]\s+8''\
\\m[Kea2]\\*[gpe_next]\ \ \(->\\m[]'
.ps
.vs
..
.\" The INTRO section was already started in header.roff.
.TITLE "Topic of this talk"
.SUBTITLE "Multibyte character support in the base system"
.BL
.LI
Multibyte character support in basic infrastructure
.br
like ksh(1), xterm(1), OpenSSH, man(1), libedit...
.LI
LC_CTYPE support in BSD and POSIX utility programs,
.br
in particular the simplest ones like ls(1), ps(1), cut(1), wc(1)
.LI
POSIX multibyte and wide character functions in the C library
.LI
Multibyte character support in the kernel terminal driver?
.LE
.PSPIC Images/NestorBuller16.eps
.GPE_SM "Mount Nestor (2975m), the southern pillar of the Goat Range"
.GPE_TIME 45
.GPE_NEXT "What won't be covered?"
.TITLE "Out of scope for this talk"
.GPE_MULB 7i
.BL
.LI
Internationalization in general
.br
That's a vast field and could only be covered in an overview talk.
.br
This talk explores specific technical details \(-> limited scope.
.LI
Not about general locale support \(em only about LC_CTYPE.
.LI
Not about character encoding conversions.
.br
To convert files from one encoding to another,
.br
simply install the GNU iconv package.
.LE
.MULN
.PSPIC Images/LowerKananaskisLake.eps
.GPE_SM "Lower Kananaskis Lake (1680m)"
.MULE
.BL
.LI
Not about typesetting.
.br
That cannot be done with C library utilities,
and even Unicode is utterly inadequate to express any
non-trivial arrangement of glyphs, for example for non-trivial
mathematical formulae or anything similar.  Typesetting requires
specialized software like TeX or groff, and in such contexts,
character set handling is one of the points to be considered,
but a relatively minor one, and in any case, for such software,
it is completely irrelevant whether or not the base system offers
any kind of multibyte character support or not.
.LE
.sp
.S -4
It is not my intention to dismiss any real-world tasks as irrelevant,
in particular not the handling of legacy non-UTF-8 multibyte
encodings, which is a legitimate concern; i am merely studying
what can reasonably be done with C library support in the base
system without compromising other goals like security, reliability,
and usability, and what may better be left to specialized add-on
software.
.br
.S P
.GPE_TIME 90
.GPE_NEXT "Can all be done?"
.TITLE "The myth of feasibility"
.BL
.LI
I suspect that many people think that as long as you implement
carefully and use all standard facilities and interfaces as designed,
complete multibyte character support can be done.
.LI
At least i thought so before i set out on the quest i'm talking about.
.LI
But it turned out that is not true.  If you build support for arbitrary
character set locales into the base system, there are several
aspects with respect to which making things secure, reliable,
and usable becomes outright impossible.
.LI
As a first step, i will show some of these unsolvable issues.
.LE
.PSPIC Images/CalgaryCenterStreet.eps
.GPE_SM "A thunderstorm approaching Calgary, Center Street Bridge"
.GPE_TIME 45
.GPE_NEXT "Table of contents"
\&
.GPE_MULB 5i
.TITLE "Table of contents"
.BL
.LI
Examples of problems
.br
unsolvable with arbitrary encodings
.LI
Benefits of supporting UTF-8 only
.LI
Implementation techniques
.BL
.LI
isu8cont() for simplicity
.LI
mblen(3) for validation
.LI
mbtowc(3) for property inspection
.LI
utf8.c for modularization
.LI
fgetwc(3) for unusually complex cases
.LI
Techniques to avoid, if possible
.LE
.LI
Examples of bugs in libraries and tools
.LI
Conclusions and outlook
.LE
.MULN
.PSPIC Images/KananaskisMap.eps
.GPE_SM "Kananaskis Country, Alberta, Canada"
.MULE
.GPE_TIME 90
.GPE_SECTION "UNSOLVABLE PROBLEMS"
.GPE_NEXT "Examples of problems"
.TITLE "An extreme example of breakage in the standard: write(1)"
.sp -1v
.SUBTITLE "POSIX requires:"
.B1
\(lqThe following environment variable shall affect the execution of write:
LC_CTYPE: Determine the locale for the interpretation of sequences
of bytes of text data as characters (for example, single-byte as
opposed to multi-byte characters in arguments and input files). If
the recipient's locale does not use an LC_CTYPE equivalent to the
sender's, the results are undefined.\(rq
.B2
.SUBTITLE "When the locales agree, POSIX requires:"
.B1
\(lqTyping characters from LC_CTYPE classifications \(oqprint\(cq
or \(oqspace\(cq shall cause those characters to be sent to the
recipient's terminal.\(rq
.B2
.GPE_MULB 7i
.P
Now as a matter of fact, for the sending program, there is no
way to find out the recipient's locale.  By its basic design,
the locale is part of the environment of each program, and it
is essential for system security that without elevated privileges,
no program can inspect the environment of other users' programs.
.P
So to satisfy the requirement for the case of matching locales,
the standard effectively requires the write(1) program to
unconditionally \m[red]write all printable characters using the sender's
locale\m[], no matter what the recipient's locale may be.
.MULN
.PSPIC Images/MountWarspite.eps
.GPE_SM "Mount Warspite (2850m)"
.GPE_SM "across the Kananaskis Lakes"
.MULE
.GPE_TIME 120
.GPE_NEXT "What can we do?"
.TITLE "write(1) implementation cannot be fixed"
.SUBTITLE "Locales mismatch \(-> print garbage"
.BL
.LI
Even if the senders are well-intentioned and only send
byte sequences they consider as printable characters
in their own locale,
.LI
the recipient's locale might interpret some of them
as terminal control sequences
.LI
may screw up the recipient's terminal state
.LI
may display wrong and misleading information
.LI
may even put the terminal into a state where it interprets user
input in a way different from what the recipient wants and reasonably
expects.
.LE
.GPE_MULB 6i
.SUBTITLE "Impossible to reduce functionality to make it safe"
For each and every byte sequence, there can be a locale in
which it might represent a potentially dangerous control
sequence, so \m[red]no byte or byte sequence at all is safe\m[] to print
to a terminal if you do not know the encoding.
.MULN
.PSPIC Images/MistLineham16.eps
.GPE_SM "Mist Mountain (3138m) from Lineham Creek"
.MULE
.GPE_TIME 60
.GPE_NEXT "Any way out?"
.MULB 7i 0i 3i
.TITLE "write(1) standard cannot be fixed"
.SUBTITLE "Standard effectively requires utterly unsafe behaviour"
On a system providing arbitrary locales,
there is no way the standard could be improved, short of
completely deleting the entire write(1) program.
.SUBTITLE "Who uses write(1) anyway?"
.BL
.LI
People moved on to WhatsApp, didn't they?
.LI
But wait: wall(1) has the same problem.
.LI
And \m[red]shutdown(8)\m[] uses wall(1)!
.LI
Worst time to screw up people's terminals:
.LI
Right before shutdown when they hurry to save their work...
.LE
.sp
That teaches us that even if many users use traditional low-level
tools much less nowadays, they may still be more relevant in
subtle ways than one might naively think.
.MULN
.PSPIC Images/CowboyTrail.eps
.GPE_SM "Highway 22 near Bragg Creek,"
.GPE_SM "Elbow River Valley"
.MULE
.GPE_TIME 50
.GPE_NEXT "Are important tools affected?"
.TITLE "Tough problems in basic tools: ssh(1)"
.SUBTITLE "An ssh(1) connection involves two locales"
.GPE_MULB 8i
.AL
.LI
The locale set in the original shell on the client machine (client locale)
.br
Determines what can safely be displayed and how it must be encoded.
.br
It is already defined before the ssh(1) client program is even started.
.LI
The locale set in the remote shell on the server machine (server locale)
.br
Only this can influence what may get printed on the client
.br
to the terminal in which ssh(1) is run.
.LE
.MULN
.PSPIC Images/GoatSouth.eps
.GPE_SM "Goat Range (2700-2800m)"
.GPE_SM "from Goat Pond"
.MULE
.sp -0.5v
.SUBTITLE "How is the server locale selected?"
Lots of competing mechanisms for setting environment variables:
.BL
.LI
Operating system defaults when starting new processes
.LI
Variables set or unset by sshd(8) on the server when forking
the login shell
.LI
SSH initialization files, for example ~/.ssh/environment
.LI
System wide and user specific shell initialization files, and so on
.LI
\&...
.LE
None of that host of possibilities depends on the locale used on
the client side, or can even inspect the locale on the client side.
.GPE_TIME 80
.GPE_NEXT "Any way out?"
.TITLE "ssh(1) cannot be fixed either"
.BL
.LI
So we end up with a problem similar to the write(1) case:
.LI
If the client side is using an arbitrary locale, the server
cannot safely send any string in any encoding, not even plain
US-ASCII.
.LI
OpenSSH provides no way for the client to communicate the required
locale to the server.
.LI
And even if it could, there is no guarantee
that that particular locale is available on the server.
.LI
And even if there were a locale of the same name on the server, there
is no guarantee that it is compatible with the client locale,
because neither locale names nor the semantics of any locale
except C and POSIX is standardized by POSIX.
.LE
.SUBTITLE "Generic problem for any kind of inter-process communication"
.sp -0.5v
.MULB 3i 0.5i 3i 0.5i 3i
.PSPIC Images/GoatPond1.eps
.MULN
.PSPIC Images/GoatPond2.eps
.MULN
.PSPIC Images/GoatPond3.eps
.MULE
.GPE_SM "At the Goat Pond dam (1670m), Spray Lakes area"
.GPE_TIME 50
.GPE_NEXT "Any way out?"
.TITLE "Partial mitigations for the ssh(1) problem"
.GPE_MULB 7i
.BL
.LI
If you happen to know the default locale of the remote account you
want to connect to and the same locale happens to be available on
your client system, you can start a terminal using that locale on
the client system before typing the ssh(1) command, and you are
safe.  But that's a special case.
.LI
Besides, let me ask a question
to the audience:  Who has done that at least once in the past?
Considering what the server locale was going to be, and starting
a matching terminal before typing the ssh command?
.LI
Note that opening the connection first, then setting LC_CTYPE in
the remote shell to whatever you need locally is not safe -
the remote system may already print to your local terminal
before you ever get to the shell prompt:
.BL
.LI
A banner even before authentication,
.LI
the motd(5),
.LI
and then the shell prompt itself...
.LE
.LI
All that might already screw up your terminal, and in the worst
case cause your terminal to misinterpret the input you type.
.LE
.MULN
.PSPIC Images/ChapmanBridge.eps
.GPE_SM "Chapman Bridge (ca. 1600m)"
.GPE_SM "Elbow River Campground"
.MULE
.GPE_TIME 90
.GPE_SECTION "THE OPENBSD WAY"
.GPE_NEXT "What can we do?"
.TITLE "The OpenBSD way"
.GPE_MULB 5i
.SUBTITLE "We made a drastic decision!"
The OpenBSD base system supports
.br
exactly two LC_CTYPE locales:
.AL
.LI
UTF-8
.LI
C = POSIX = US-ASCII
.LE
.P
We don't even support ISO-LATIN-1
.br
any longer in the base system.
.sp 2v
.SUBTITLE "Isn't that seriously inconvenient?"
.BL
.LI
Usability is not as bad as it may seem at first.
.LE
.MULN
.PSPIC Images/OldGoat.eps
.GPE_SM "Old Goat Mountain (3109m), Spray Lakes Reservoir"
.MULE
.BL
.LI
If you get text in different encodings, it is very easy
to install conversion tools from ports
.br
and simply convert the data once before using it.
.LI
Besides, Unicode and UTF-8 support all languages.
.LI
So even without relying on ports,
the base system is still able to support all languages.
.LE
.GPE_TIME 60
.GPE_NEXT "How does that help write(1)?"
.GPE_MULB 7i 0i
.TITLE "Benefits for write(1)"
.MULN
.PSPIC Images/WhitemansPond.eps
.GPE_SM "Looking across Whiteman's Pond"
.GPE_SM "to the Goat Range (ca. 2700m)"
.MULE
.sp -8v
.AL
.LI
ASCII printable characters always safe to print on OpenBSD
.br
(UTF-8 ASCII-compatible: both encode ASCII the same way)
.br
That allows partial write(1) functionality:
.br
Allow passing ASCII only no matter what the two locales are.
.LI
It allows filtering out ASCII control bytes (C0 characters).
.br
Important because the escape character is dangerous.
.br
Possible because no UTF-8 sequence contains such a byte.
.LI
If the sender's terminal is set to UTF-8 and non-ASCII
characters are actually typed, they can safely be filtered out
(just in case the receiver's terminal is set to US-ASCII, which
we cannot know).  That's possible because UTF-8 is stateless,
that is, codepoints can safely be deleted from the stream and
it still remains a valid stream of characters (which would not
be true for arbitrary locales).
.LI
If invalid bytes not forming UTF-8 occur in the input stream,
they can safely be filtered out, allowing recovery from
encoding errors.  That's possible because after an encoding
error, UTF-8 allows finding the beginning of the next character
by simply looking for the next byte not having the most significant
bit set or having the two most significant bits set.  Consequently,
the sending terminal can never become unusable, which might
well happen when allowing arbitrary encodings.
.LE
.GPE_TIME 90
.GPE_NEXT "Are there any downsides?"
.TITLE "Prices to pay in write(1)"
.SUBTITLE "OpenBSD write(1) now violates POSIX"
Yet it does implement the maximal safe and useful and reasonable part of POSIX:
.br
All printable ASCII characters, space characters, and the BEL are sent as typed.
.BL
.LI
Locales are ignored.
.LI
UTF-8 continuation bytes are silently ignored.
.LI
ASCII control characters and UTF-8 start bytes
are replaced with question marks.
.LI
Consequently, if the sender uses UTF-8 or control characters,
the recipient sees
.br
that something got lost and can ask the sender about the missing parts.
.LE
.SUBTITLE "The implementation is extremely simple and robust"
It doesn't even need <wchar.h> or <locale.h>, neither setlocale(3)
nor getwchar(3) nor mbtowc(3) nor anything like that.
It gets away with elementary single-byte character handling
functions like fgets(3), isprint(3), and putchar(3), which
makes code review and maintenance a lot easier.
.MULB 6i 0.2i 3.8i
.PSPIC -R Images/SpraySouth.eps
.MULN
.S -4
.sp 3v
Spray Mountains (2700-2900m)
.br
seen from the Kananaskis Lakes
.S P
.MULE
.GPE_TIME 80
.ig
usr.bin/write  : write.1 write.c
5 Feb 2016 12:00:40 -0700 (MST)
usr.bin/wall   : wall.c wall.1
8 May 2016 10:19:36 -0600 (MDT)
..
.GPE_NEXT "What about ssh(1)?"
.TITLE "Best practice for ssh(1)"
.BL
.LI
It is obvious that, if the server runs OpenBSD, which only supports
the C/POSIX and UTF-8 locales, connecting to it becomes safe if
you follow the simple rule to always use a UTF-8 enabled
terminal to run ssh(1).
.LI
Of course, that does not yet secure connections FROM OpenBSD
systems:  Connecting from OpenBSD to other operating systems
is still dangerous because they might send text in arbitrary
locales.
.LI
So, the best practice i recommend is to:
.AL
.LI
On all your servers that you may ever want to SSH into, no matter
on which operating system, make sure the \m[red]default system locale and
all login locales\m[] are set to either C/POSIX or to UTF-8.
.LI
Only ever run ssh(1) from \m[red]UTF-8\m[] enabled terminals.
.LE
.LE
.sp 2v
.GPE_MULB 7i 3.5v
While much of the stuff discussed here is subtle,
.br
this is one simple pair of rules
.br
i recommend that you remember.
.MULN
.PSPIC Images/ElpocaFlowers.eps
.GPE_SM "Elpoca Mountain (3029m)"
.GPE_SM "from the Smith-Dorrien Trail"
.MULE
.GPE_TIME 120
.GPE_NEXT "What about xterm(1)?"
.GPE_MULB 7i 0i
.TITLE "Benefits for xterm(1)"
.SUBTITLE "Upstream xterm(1) runs in ASCII mode by default"
.BL
.LI
Traditionally, that default was also used on OpenBSD.
.LI
Bad idea:
Some common UTF-8 characters are interpreted as control codes.
For example, a stray German \(oq\(ss\(cq will lock up your ASCII xterm(1).
.LE
.MULN
.PSPIC Images/MountainSheep.eps
.GPE_SM "Kananaskis Trail at Pocaterra Creek"
.MULE
.sp -1v
.SUBTITLE "OpenBSD 6.0 runs xterm(1) in UTF-8 mode by default"
If you use a C/POSIX locale, even if you don't intend to ever use UTF-8,
.br
that's OK because a UTF-8 terminal handles ASCII output just fine.
.P
In addition to that, the UTF-8 enabled terminal is obviously more
resilient to UTF-8 accidentally sneaking in, in particular, but
not only, for the case of running ssh(1) as explained above.
Actually, even when fed garbage or unsupported encodings, a UTF-8
xterm(1) is more robust than an ASCII xterm(1) because the UTF-8
xterm(1) honours *fewer* terminal escape codes than the ASCII
xterm(1). That may seem surprising at first because Unicode defines
*more* control characters than ASCII does. But as explained on
http://invisible-island.net/xterm/ctlseqs/ctlseqs.html
xterm(1) never treats decoded multibyte characters as terminal
control codes, so the ISO 6429 C1 control codes do not take effect
in UTF-8 mode; but they do take effect in ASCII mode, even though
they fall outside the scope of ASCII.
.GPE_TIME 90
.ig
app/xterm      : XTerm.ad
8 Mar 2016 10:26:30 -0700 (MST)
..
.GPE_NEXT "Any downsides?"
.TITLE "Caveats for xterm(1)"
Do not use this non-standard default
setting on any other system except OpenBSD.
.GPE_MULB 6i
.sp
.BL
.LI
It only works because
OpenBSD deliberately does not support any locales except UTF-8 and
C/POSIX/ASCII.
.LI
Terrible things will happen if you force the
default to UTF-8 in this way on a system where people can
opt into arbitrary locales that differ from UTF-8.
.LI
On other operating systems except OpenBSD, there is no way in
hell to make the interaction of locales with terminal controls
truly safe.
.LE
.MULN
.PSPIC Images/FairholmeRange.eps
.GPE_SM "Fairholme Range from Whiteman's Pond"
.MULE
.sp
The main goal of having UTF-8 xterms by default on OpenBSD
is improving robustness.
.br
But it also improves usability.
If you usually run your shells inside xterm(1) in C/POSIX mode,
there should be few visible changes for you.
.br
But if you ever stumble upon a directory containing UTF-8 filenames,
you can simply say
.VERBON 7 16
	$ LC_CTYPE=en_US.UTF-8 ls
.VERBOFF
which would have given you garbage output in the past, and which
just works in OpenBSD 6.0.
.GPE_TIME 80
.GPE_NEXT "Another example?"
.TITLE "Benefits for pod2man(1) manuals"
.BL
.LI
Many perl manuals contain UTF-8.
.LI
So do several ports manuals using perlpod(1) format.
.\" 36 on my system right now
.LI
A few ports manuals contain ISO-LATIN-1: latex2man(1), a2ping(1), ...
.br
OpenBSD man(1), which is the mandoc implementation,
silently converts that to UTF-8.
.LI
So we enabled UTF-8 by default for pod2man(1) in OpenBSD,
.br
improving output for both UTF-8 and C/POSIX/ASCII users.
.LI
Problem unsolvable on any system trying to support arbitrary locales,
.br
because man(1) must not print UTF-8
for users using a different locale.
.LE
.PSPIC Images/YellowBelliedMarmot.eps
.GPE_SM "Yellow Bellied Marmot near Lower Kananaskis Lake"
.GPE_TIME 90
.ig
In pod2man(1), enable UTF-8 output by default and provide a --no-utf8
command line option to disable it.  The new default improves the
formatting of Perl manuals using UTF-8 characters (for example
perlunicook(1)) with man(1) and mandoc(1) no matter which locale
the user has set.
Issue discovered by and fix OK by afresh1@.
Trying to push this change upstream would make no sense.  It's the
right thing to do only because we decided to not support any other
locales except ASCII and UTF-8.  A system trying to provide arbitrary
locales simply cannot handle manuals containing UTF-8 characters
at build time, so the change would produce wrong results.
  gnu/usr.bin/perl/cpan/podlators/scripts: pod2man.PL
  19 Apr 2016 02:06:52 -0600 (MDT)
..
.GPE_SECTION "IMPLEMENTATION TECHNIQUES"
.GPE_NEXT "What about implementations?"
.sp -0.5v
.TITLE "Overview of implementation techniques"
.sp -1v
.MULB 11c 0c 10.6c  \" total width is 21.6c
.SUBTITLE "for small base system utilities"
.VERBON 1 12
command deci parse  ins sani eval
        iddo cvitgw spw inau cwsp

rev     lc-- c----- --- ---- ----
ksh     .--- c----- --- ---- ----
tty(4)  ---- c----- --- ---- -c--
write   ---- c----- --- -?p? ----
ypldap  ---- c----- --- -cc? c---
cut -fd l-ld -v---- --- ---- --s-
cut -cn l-lc --i--- --- ---- c---
uniq -s l-lc --i--- --- ---- c---
uniq -f l-tf ---t-- s-- ---- ----
wc -m   le-c ---t-- s-- ---- c---
colrm   l-t- ---t-- -pw ---- -w--
fold    lctb c--t-- -pw ---- -w--
column  l-t- ---tG- spw ---- -wS-
fmt     l-t- ---t-- spw ?ppp -w--
ls      l-t- ---t-w -pw ??pp -w--
rs      le-- ---t-w -pw ??cc -w--
ps      l-t- ---t-w -pw vrpp -w--
ssh     l-t- ---t-w -pw vvcc -w--
ul      l-g- ----g- -pw sspp cw-p
man     l--l p----w --w ??pp -w-p
.VERBOFF
.SUBTITLE "utility functions"
.VERBON 1 10
ls   int mbsprint(const char *mbs, int print)
rs   int mbsavis(char** outp, const char *mbs)
ps   int mbswprint(const char *, maxw, trail)
ssh  int vasnmprintf(c **, size_t, int *, fmt, va)
man  int preconv_encode(...)
.VERBOFF
.MULN
.sp 0.5v
.nr VerbinSave \n[Verbin]
.nr Verbin 12c
.VERBON 16 9
decision making:
 i - initialization
      l - setlocale(3) called
      . - setlocale(3) called but essentially unused
 d - decision
      c - MB_CUR_MAX inspected in isu8cont()
      e - MB_CUR_MAX inspected before calling multibyte functions
      l - implicit in mblen(3)
      t - implicit in mbtowc(3)
      g - implicit in fgetwc(3)
 o - options deciding whether multibyte functions are used at all
      d - option to specify delimiter (may be UTF-8)
      b - option to count bytes (alternative is to count characters)
      c - option to count characters (alternative is to count bytes)
      f - option to count fields
      l - option to call setlocale(LC_CTYPE, "")
parsing:
 c - \m[red]direct inspection with isu8cont()\m[]
 p - dedicated UTF-8 parser
 v - validate multibyte character with mblen(3)
 i - iterate multibyte characters with mblen(3)
 t - \m[red]iterate multibyte characters with mbtowc(3)\m[]
 g - get wide characters with fgetwc(3)
 G - get command line argument with mbstowcs(3)
 w - wrapper to isolate UTF-8 handling from the main code
inspection:
 s - check for whitespace with iswspace(3) / iswblank(3)
 p - check printability with wcwidth(3)
 w - \m[red]get display width with wcwidth(3)\m[]
sanitation:
 classes:
  i - invalid byte
  n - non-printable character
  a - printable ASCII character
  u - printable Unicode character
 actions:
  s - skip
  ? - replace with question mark
  v - encode with vis(3)
  r - replace with Unicode replacement character
  c - copy character
  p - print character
evaluation:
 c - count characters
 w - count display width for columnation and/or tabulation
 s - split strings with strstr(3)
 S - split strings with wcschr(3)
 p - print with putwchar(3)
.VERBOFF
.nr Verbin \n[VerbinSave]
.MULE
.GPE_TIME 210
.GPE_NEXT "Why this table?"
.SUBTITLE "Why am i showing this table?"
.S -6
.BL
.LI
Each line provides information about one program;
some programs have very different modes, so some have two lines.
.LI
Each column represents one particular implementation technique.
.LI
My expectation, when showing this table, is not that anybody
might understand the whole table during this talk.  That is
impossible, it encodes a huge amount of information in an
extremely terse format.  But there are several reasons why
i do show the table anyway.
.LI
First reason:  It shows that the number of utilities we have
to deal with is surprisingly small.  You might expect that
almost everything might need multibyte character handling.
But this table only lists about three handfuls of utilities.
Admittedly, we didn't fix all utilities yet, but most are
done by now.
.LI
Second reason:  The table shows that the number of implementation
techniques is surprisingly large.  You might expect that there
might basically be just one technique: Read a stream, assemble
wide characters, process the characters, and write out the result.
But it turns out that would be a bad approach almost everywhere,
and different utilities have very different needs.
.LI
Third reason:  The table shows that so far, we did not find
a single pair of utilities that could be handled in exactly
the same way.  No line in the table agrees with any other line.
Well, with one exception:  cut -c and uniq -s can be implemented
using exactly identical techniques - but the main technique
used in that case appears literally nowhere else, and both
utilities need quite different techniques when called with other
options.  So, basically, everything is different and nothing can
be done schematically.
.LI
Fourth reason, and i'll come back to that after explaining
some of the techniques:  Those techniques that people would
probably have expected to be ubiquitous barely appear at all.
For example, fgetwc(3) and fputwc(3) appear in one out of 16 cases,
and fgetws(3), fputws(3), *wprintf(3), *wscanf(3), mbrtowc(3),
wcrtomb(3), and wmem*(3) don't occur at all.
.LI
At this point, some of you will probably wonder whether i
screwed up the title of my talk.  "Why and how you ought to keep
multibyte character support simple" - but now the guy is saying
there is a large number of different techniques and everything
differs from everything else?  That doesn't sound simple at all!
.LI
The point is:  While the number of techniques is indeed not all
that small, every single technique is very elementary, so the
code of every individual utility does remain very short and simple.
Much simpler, in fact, than it would become if you tried to apply
one and the same general-purpose coding scheme everywhere.
.LE
.S P
.GPE_TIME 5
.GPE_NEXT "The simplest approach"
.TITLE "Technique 1: pure isu8cont()"
.SUBTITLE "rev(1) - reverse characters in each line"
.VL 6m
.LI requirements:
no need for character properties, only need to know where chars start
.br
hence, no need for character decoding
.br
not even any need for character validation
.LI solution:
isu8cont() one-liner by Ted Unangst <tedu@openbsd.org>:
.br
\m[red]Does this byte continue a character?\m[]
.nr VerbinSave \n[Verbin]
.nr Verbin 6m
.VERBON 23 14
int isu8cont(unsigned char c)
{ return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80; }
.VERBOFF
.nr Verbin \n[VerbinSave]
setlocale(3) to set up MB_CUR_MAX
.LI "algorithm:"
read lines into a char[] buffer, skip newlines
.br
loop over characters by looping over bytes and skipping on isu8cont()
.br
then just copy the multibyte sequences without ever decoding them
.LE
.MULB 4.5i 0i 2.5i 0i 3i
.sp
This technique is very robust and never fails.
.br
Of course,
.br
it only works for UTF-8 only systems;
.br
no similar technique is possible
.br
for arbitrary encodings.
.MULN
.PSPIC Images/GoatCreek.eps
.MULN
.S -4
.sp 5v
Smith-Dorrien Trail (1650m)
.br
with East End of Rundle (2550m)
.S P
.MULE
.GPE_TIME 80
.ig
martijn@ rev.c 10 Apr 2016 11:06:52 -0600 (MDT) OK schwarze@
commit msg: Enable UTF-8 support in rev. Some minor cleanups while here.
..
.GPE_NEXT "What's the alternative?"
.TITLE "Technique 1: pure isu8cont()"
.SUBTITLE "The alternative: rev(1) on FreeBSD and NetBSD"
.GPE_MULB 6i
.BL
.LI
setlocale(3) used in the same way
.br
loop over lines with fgetwln(3)
.br
store wide character strings
.br
then print wide characters in reverse order
.LI
very fragile, \m[red]dies on the first encoding error\m[]
.LI
even worse: all known implementations of fgetwln(3) were buggy
and returned success on encoding errors, effectively converting
all encoding errors into newline characters \(->
silently gave wrong results
.LE
.MULN
.PSPIC Images/MountSparrowhawk.eps
.GPE_SM "Dust on the Smith-Dorrien Trail"
.GPE_SM "with Mount Sparrowhawk (3121m)"
.MULE
.SUBTITLE "Summary regarding isu8cont()"
.BL
.LI
pure isu8cont() is the right move
when you need \m[red]no validation and no character properties\m[]
.LI
another example of pure isu8cont() is write(1) mentioned above
.LE
.GPE_TIME 70
.ig
martijn@ rev.c 10 Apr 2016 11:06:52 -0600 (MDT) OK schwarze@
commit msg: Enable UTF-8 support in rev. Some minor cleanups while here.
..
.GPE_NEXT "Another application?"
.TITLE "Technique 1: pure isu8cont() for ksh(1)"
.SUBTITLE "It can also be used as a quick and dirty stopgap\
 for complex programs"
.SUBTITLE "for example OpenBSD ksh(1) emacs input mode"
.GPE_MULB 5i
.BL
.LI
Make sure that moving left and right
.br
can only move by whole characters,
.br
not into the middle of a character.
.LI
Make sure that deleting characters
.br
can only delete characters whole,
.br
not individual bytes out of characters.
.LI
Improve all functions involving words
.br
by allowing non-ASCII characters
.br
to be part of words.
.LI
Allow insertion of non-ASCII characters
.br
without screwing up the display,
.br
by backing up to the start byte
.br
after inserting a continuation byte,
.br
and starting to re-print there.
.LE
.MULN
.PSPIC Images/ElbowFalls.eps
.GPE_SM "Elbow Falls"
.MULE
.GPE_TIME 60
.ig
Based on parts of a patch by Frederic Nowak <fnwk at mailbox dot org>,
tweaked by me.
OK tedu@ semarie@ mpi@
bin/ksh        : emacs.c
10 Dec 2015 03:00:14 -0700 (MST)
..
.ig
3. Fix forward movement which i didn't get quite right in my previous
commit: Always advance to a start byte, never to a final continuation
byte, or the next insertion would split the character in the middle.
OK mpi@
bin/ksh        : emacs.c
8 Jan 2016 06:17:57 -0700 (MST)
..
.GPE_NEXT "On a yet lower level?"
.TITLE "Technique 1: pure isu8cont() in the kernel"
.SUBTITLE "A quick and dirty stopgap for the tty driver"
.GPE_MULB 6.5i
.BL
.LI
For OXTABS and sysctl(3) KERN_TTY_INFO,
.br
sys/kern/tty.c wants to calculate display widths.
.LI
We don't want wcwidth(3) tables in the kernel,
.br
so double- and zero-width characters will remain wrong.
.LI
But let's at least handle the common case
.br
of single-width multibyte characters correctly:
.LE
.sp
.VERBON 1 14
int
ttyoutput(int c, struct tty *tp)
{
	int col = 0;

	/* lots of code deleted */
	switch (CCLASS(c)) {
	case ORDINARY:
		if (!isu8cont(c))
			++col;
		break;
	}
}
.VERBOFF
.MULN
.PSPIC Images/BowTowerUp.eps
.GPE_SM "Calgary, Bow Tower"
.MULE
.GPE_TIME 60
.GPE_NEXT "Another technique?"
.TITLE "Technique 2: pure mblen(3)"
.SUBTITLE "cut -d \(em select delimited fields out of lines"
.GPE_MULB 5.5i
.VL 6m
.LI "requirements:"
no need for character properties,
.br
\m[red]but need validation\m[]
.LI "solution:"
.LE
.VERBON 7 14
case 'd':
	dlen = mblen(optarg, MB_CUR_MAX);
	if (dlen == -1)
		usage();
	memcpy(dchar, optarg, dlen);
	dchar[dlen] = '\0';
.VERBOFF
.MULN
.PSPIC Images/TsuuTina.eps
.GPE_SM "Tsuu T'ina land near Calgary"
.MULE
.SUBTITLE "Alternatives"
.VL 6m
.LI FreeBSD:
uses mbrtowc(3), which is also possible
.LI NetBSD:
no multibyte support
.LE
.GPE_TIME 90
.ig
schwarze@ cut.c 1 Dec 2015 17:56:46 -0700 (MST) OK tedu@ czarkoff@ zhuk@
commit msg: UTF-8 support: Implement -c and -n
and let -d accept a multibyte delimiter character.
While here, simplify the code by switching from fgetln(3) to getline(3)
and from hand-crafted string parsing to strstr(3) and strchr(3).
..
.GPE_NEXT "A less simple example"
.TITLE "Technique 3: iteration with pure mblen(3)"
.SUBTITLE "cut(1) -c \(em select by character count"
Again no need for character properties, but need validation:
.VERBON 7 14
while(*cp != '\0') {
	len = mblen(cp, MB_CUR_MAX);
	if (len == -1)
		/* Handle encoding error, at least set len. */;
	/* Do something with the character. */
	cp += len;
}
.VERBOFF
Decision in OpenBSD:
treat each invalid byte as one character and keep going.
.GPE_MULB 4i
.sp 2v
.SUBTITLE "Alternative"
FreeBSD and NetBSD:
.P
Use getwc(3)
.br
and error out of the file
.br
on the first encoding error.
.MULN
.PSPIC Images/LinehamRidge.eps
.GPE_SM "Lineham Ridge (2700m) from the Highwood Valley (1880m)"
.MULE
.GPE_TIME 90
.ig
OpenBSD: same for uniq(1) -s \(em
ignore a certain number of initial characters
..
.GPE_NEXT "Alternatives?"
.TITLE "cut(1) in FreeBSD"
.BL
.LI
FreeBSD: uses fgetln(3) and mbrlen(3)/mbrtowc(3).
.LI
That is slightly inconsistent.
Even though POSIX 2016-TC2 now requires that L'\en' be encoded
as 0x000a in wchar_t, that doesn't imply that an arbitrary locale
must encode it as the single byte 0x0a in a multibyte char * string,
or as any single byte at all.
.LI
In any case, an encoding error causes the rest of the file to be lost.
.LI
mbstate is never reset, encoding errors in earlier files
may compromise decoding of later files.
.LE
.SUBTITLE "Looks like a pack of well-fed dragons..."
.GPE_MULB 7i
.BL
.LI
This teaches that it's very easy to write code that looks
perfectly general on first sight, but turns out to actually
be full of subtle issues on closer inspection.
.LI
As OpenBSD prefers correctness, security, and usability
over featurism, we believe that it's advantageous to sacrifice
full generality up front and allow a simpler, more powerful,
and less fragile implementation for UTF-8 and ASCII only.
.LE
.MULN
.PSPIC Images/AlbertaHighway532.eps
.GPE_SM "Johnson Trail (Hwy. 532)"
.MULE
.GPE_TIME 140
.GPE_NEXT "What about character properties?"
.TITLE "Technique 4: iteration with mbtowc(3)"
.sp -1.5v
.GPE_MULB 7i 0i
.SUBTITLE "This one is most often needed in practice."
.VERBON
.COLOR red
char	*mbs;	/* Multibyte string (input). */
int	 len;	/* Encoded length in bytes. */
.sp 0.5v
wchar_t	 wc;	/* Wide character (decoded). */
.COLOR P
.COLOR blue
int	 width;	/* Display width in terminal columns. */
.sp 0.5v
.COLOR P
.COLOR red
for (mbs = INPUT; *mbs != '\e0'; mbs += len) {
.COLOR P
	HANDLE_SPECIFIC_BYTES(*mbs);  /* Optional. */
.COLOR red
	len = mbtowc(&wc, mbs, MB_CUR_MAX);
	if (len == -1) {
		/* Encoding error, reset state: */
		mbtowc(NULL, NULL, MB_CUR_MAX);
.COLOR P
.VERBOFF
.MULN
.PSPIC Images/SprayNorth.eps
.GPE_SM "Many summits: Spray Mountains"
.GPE_SM "from the Smith-Dorrien Trail"
.MULE
.VERBON
.sp -1v
.COLOR red
		/* After handling an invalid byte, retry with the next one. */
		len = 1;
.COLOR P
		HANDLE_INVALID_BYTE(*mbs);  /* Optional. */
		wc = L'?';	/* e.g. fmt, ls, rs, uniq */
		wc = L' ';	/* e.g. wc */
		width = 1;	/* e.g. column, colrm, fmt, ls, rs */
		width = -1;	/* e.g. ssh */
.COLOR blue
	} else {
		width = wcwidth(wc);
		if (width == -1) {
.COLOR P
			HANDLE_NONPRINTABLE_CHARACTER(wc);  /* Optional. */
			width = 1;  /* Usually. */
.COLOR blue
		}
.COLOR P
.COLOR red
	}
.COLOR P
	HANDLE_CHARACTER(wc, width);
.COLOR red
}
.COLOR P
.VERBOFF
.GPE_TIME 120
.ig
  - ls
    Support UTF-8: use wcwidth(3) for column adjustment and replace
    non-printable Unicode codepoints and invalid bytes with ASCII
    question marks.  No change for the SMALL version.
    Using ideas developed by tedu@, phessler@, bentley@ and feedback from many
    OK yasuoka@ czarkoff@ sthen@.
      bin/ls         : Makefile extern.h ls.1 ls.c print.c util.c
      bin/ls         : utf8.c
      1 Dec 2015 11:36:13 -0700 (MST)
    Fix a regression (and POSIX violation) introduced with UTF-8 support:
    When neither running on a terminal nor with -q, names must be passed
    through as they are, nothing must be replaced with question marks.
    Effectively, -q was always in effect.  SMALL was not affected.
      bin/ls         : utf8.c
      18 Jan 2016 12:06:37 -0700 (MST)
  - rs
    UTF-8 support: In a UTF-8 locale, properly align columns in the
    presence of zero-width and double-width characters and replace
    non-printable codepoints and invalid bytes with ASCII question
    marks.  No change in the C/POSIX locale.
    As a side effect, get rid of all pointer to pointer variables
    and simplify some of the code.
    Partially based on ideas from tedu@.
    Feedback and OK czarkoff@, OK tedu@.
      usr.bin/rs     : Makefile rs.c
      usr.bin/rs     : utf8.c
      3 Dec 2015 05:23:15 -0700 (MST)
  - wc
    UTF-8 support: implement -m for character counting
    and use iswspace(3) for word counting.
    Requires using getline(3) rather than read(2)
    to make sure that characters aren't chopped to pieces.
    Using feedback from millert@ on an earlier version.
    Feedback and OK tedu@.
      usr.bin/wc     : wc.1 wc.c
      7 Dec 2015 18:00:45 -0700 (MST)
  - fmt
    UTF-8 support; does not yet handle the -c option.
    No longer expand tabs up front in get_line(), their width depends on the
    width of characters earlier on the line.  Always NUL-terminate the input
    buffer for easier and safer handling.  Get rid of the hand-rolled output
    buffer, just let stdio do its work.
    OK tedu@
      usr.bin/fmt    : fmt.1 fmt.c
      15 Dec 2015 09:26:17 -0700 (MST)
    UTF-8 support for fmt -c.
    This implies two small changes in behaviour:
    1. Let fmt -c replace invalid bytes with ASCII question marks
    just like when called without -c.
    2. On lines to be centered, replace each tab with a single blank,
    simply because there is no useful way to define the meaning of a
    tab on such a line.  Having the width of a tab depend on what is
    to the right of it would be completely crazy (and complicate the
    code a lot), and otherwise, tabs on adjacent lines of different
    length wouldn't align anyway.
    OK millert@
      usr.bin/fmt    : fmt.c
      7 Jan 2016 11:02:43 -0700 (MST)
  - uniq
    UTF-8 support:
    Let -f recognize non-ASCII blank characters
    and let -s count characters rather than bytes.
    OK zhuk@ bentley@
      usr.bin/uniq   : uniq.1 uniq.c
      19 Dec 2015 03:21:01 -0700 (MST)
  - ps
    UTF-8 support:
    In a UTF-8 locale, columnate correctly and replace valid, but non-
    printable characters with the Unicode replacement character U+FFFD.
    No change in the C/POSIX locale, and no change for invalid bytes.
    Grand total, the code becomes shorter by almost 30 lines.
    Feedback from czarkoff@, OK millert@.
      bin/ps         : Makefile extern.h print.c ps.c
      bin/ps         : utf8.c
      10 Jan 2016 07:04:16 -0700 (MST)
  - colrm
    UTF-8 support:
    Cut by display columns rather than by character positions because
    the latter would be useless in the presence of combining zero-width
    characters.  Similarly, let tab advance to display columns rather
    than to character positions.
    For compatibility with nroff and man(1) output, let backspace back
    up one character rather than on display column.  But for compatibility
    with POSIX fold(1), *if* two backspaces follow a double-width character,
    ignore the second one.
    Fix some bugs while here:  Delete backspaces that immediately follow
    deleted characters.  Expand tabs intersecting deletions, such that
    part of the blanks can be removed.  Expand tabs following deletions,
    or they would no longer align with adjacent lines without tabs.
    OK jmc@ on a previous version of the manual.
    No opposition when shown on tech@.
      usr.bin/colrm  : colrm.1 colrm.c
      18 Jan 2016 13:31:36 -0700 (MST)
  - fold
    UTF-8 support.
    Using feedback about bugs in earlier versions from Matthew Martin
    <phy1729 at gmail dot com> and from tsg@ who tested it with afl(1).
    OK czarkoff@ tsg@
      usr.bin/fold   : fold.1 fold.c
      23 May 2016 04:31:42 -0600 (MDT)
..
.GPE_NEXT "Modularize?"
.TITLE "Technique 5: utf8.c utility files"
.BL
.LI
Advantage: isolate all multibyte = UTF-8 handling in one file
.LI
Avoid encumbering the main code
.LI
Not always possible, sometimes not even desirable,
in particular if the main code doesn't do much except character handling
in the first place, like in cut(1) or fmt(1)
.LI
Typical tasks: \m[red]parsing, validation, sanitation, output\m[]
.LI
Typically uses technique 4, iteration with mbtowc(3)
.LI
Sanitation concerns invalid byte sequences and non-printable characters
.LI
Sanitation options are passthrough, skip, replace with question marks
or UTF-8 replacement characters, or vis(3)
.LE
.GPE_MULB 5i
.BL
.LI
Examples: ls(1), ps(1), rs(1), OpenSSH
.LI
All have subtly different requirements, in particular regarding
sanitation, width measurement, width limitation, and output
disposition
.LI
Hence, it was not yet possible to design a set of standard
functions, but we still hope more experience might allow
us to do so in the future.
.LE
.MULN
.PSPIC Images/ThreeSistersDam.eps
.GPE_SM "Looking from the Three Sisters Dam (1710m)"
.GPE_SM "to the northern Goat Range (2730m)"
.MULE
.GPE_TIME 90
.GPE_NEXT "POSIX standard functions?"
.TITLE "Technique 6: iteration with fgetwc(3)"
.sp -1v
.GPE_MULB 7i
.BL
.LI
Only program so far too complex for these techniques: ul(1)
.LI
I implemented and tested a version using techniques 4 and 5,
.br
iteration with mbtowc(3) and wcwidth(3) in utf8.c,
.br
but it wasn't simple at all, so it was never committed.
.LI
The reason why it wasn't simple:  It does all kinds
.br
of string manipulation, almost like an editor:
.br
splitting, joining; deleting, inserting, transforming characters
.LI
What i did commit was a version doing the full
.br
char * to wchar_t * to char * double conversion.
.LI
In part inspired by the FreeBSD version which is in turn
.br
based on Bruno Haible's work in util-linux,
.br
but not sharing any UTF-8 code with either version.
.LE
.MULN
.PSPIC Images/StormPass.eps
.GPE_SM "Storm Mountain (3095m)"
.GPE_SM "from Highwood Pass (2206m)"
.MULE
.BL
.LE
.sp -0.5v
.SUBTITLE "Examples of problems with the FreeBSD version of ul(1)"
.BL
.LI
Errors out on the first encoding error \(em
can't be helped when supporting arbitrary encodings.
.LI
Backspace backs up one column position, but should back up one character.
.LI
Always treats _\eb_ as underlined, never as bold.
.LI
Fails to move the rest of the buffer right when _ later gets overlaid
with a double-width char.
.LE
.GPE_TIME 90
.ig
  As a bonus reimplement overstrike() and iattr() almost from scratch,
  getting rid of useless malloc(3)ed local buffers.
  Add lots of missing information to the manual.
  No opposition when shown on tech@.
    usr.bin/ul     : ul.1 ul.c
    18 Jan 2016 10:34:26 -0700 (MST)
..
.GPE_NEXT "What is not recommended?"
.TITLE "Techniques to avoid"
.BL
.LI
*r*() functions like mb\m[red]\s+8R\s-8\m[]towc(3)
.br
Don't use them unless you really need multithreading.
.br
They are considerably harder to use correctly than mbtowc(3).
.LI
fgetws(3) or fgetwln(3)
.br
Don't use them unless you must support arbitrary locales.
.br
getline(3) allows much better error handling
and isn't harder to use.
.br
.S -4
(And even for arbitrary locales, read(2) + mbtowc(3) is an option.)
.S P
.LI
*towcs() functions like mbstowcs(3)
.br
Iterating with mbtowc(3) allows much better error handling
.br
and needs only marginally more code (typically a dozen lines of code).
.LE
.GPE_MULB 6i
.sp 0.5v
Some people recommend
.br
to always use the most complicated functions
.br
because they work in all circumstances.
.SUBTITLE "I don't."
Where simpler functions suffice, they are easier to use and cause fewer bugs.
And the simpler functions themselves may be less buggy, too.
.MULN
.PSPIC Images/CrowchildTrail.eps
.GPE_SM "Biking the Crowchild Trail, Calgary"
.MULE
.GPE_TIME 120
.GPE_SECTION "BUGS' PARADISE"
.GPE_NEXT "What about library quality?"
.TITLE "Library quality"
.BL
.LI
In general, the BSD C libraries are of good quality:
.br
solid code that has been scoured for bugs for decades.
.LI
Multibyte and wide character code is no longer exactly young,
.br
but younger than much other code, and much more buggy.
.LI
In the following, i'm showing various examples from OpenBSD.
.br
Other BSD implementations sometimes differ in detail,
.br
but my impression is that quality is similar in all three systems.
.LI
All bugs found by chance, no complete audit yet!
.LE
.PSPIC Images/KingCreek.eps
.GPE_SM "Going from the Kananaskis Lakes toward Highwood Pass"
.GPE_TIME 50
.GPE_NEXT "What about the most basic functions?"
.TITLE "Examples of bugs in conversion functions"
.BL
.LI
mbtowc(3) neglected to set errno(2) to EILSEQ
.br
when given an incomplete character (fixed Feb 2016)
.LI
mbrtowc(3) accepted some invalid UTF-8 sequences
.br
and silently produced invalid code points above U+10FFFF (fixed Sep 2015)
.LI
wcrtomb(3) accepted code points above U+10FFFF
.br
and silently produced invalid multibyte sequences (fixed Sep 2015)
.LI
wcrtomb(3) accepted UTF-16 surrogates in UTF-8 mode
.br
and silently produced invalid multibyte sequences (fixed Oct 2015)
.LE
.PSPIC Images/GlasgowElbow.eps
.GPE_SM "Little Elbow River (ca. 1600m) washed out by the 2013 flood,\
 with Mount Glasgow (2935m)"
.GPE_TIME 60
.ig
 - mbrtowc(3)
    - accepted > U+10FFFF
      src/lib/libc/citrus/citrus_utf8.c#rev1.9
      2015/09/05 15:22:04;  author: semarie  OK stsp
    - If an incomplete character is passed to mbtowc(3), set errno to EILSEQ.
      lib/libc/locale: mbtowc.c 27 Feb 2016 07:02:13 -0700 (MST)
 - wcrtomb(3)
    - accepted > U+10FFFF
      src/lib/libc/citrus/citrus_utf8.c#rev1.11
      2015/09/26 14:22:40;  author: semarie  OK bentley stsp
    - accepted surrogates
      src/lib/libc/citrus/citrus_utf8.c#rev1.14
      2015/10/13 02:17:46;  author: bentley  OK stsp
..
.GPE_NEXT "What about standard I/O?"
.TITLE "Bugs in libraries: examples in standard I/O"
.BL
.LI
fgetwc(3) didn't set the error indicator for encoding errors (fixed Dec 2015)
.LI
fputwc(3) didn't set the error indicator for invalid characters (fixed Jan 2016)
.br
The Austin Group thinks that even the C standard itself is buggy here.
.LI
fgetws(3) discarded any characters read and reported bogus EOF
when errno happened to be EILSEQ upon entry and the file ended
without a terminating L'\en' character (fixed Jan 2016)
.LI
fgetwln(3) ignored most encoding errors and
.br
sometimes returned partial lines truncated at random places (fixed Aug 2016)
.LI
printf(3) %ls destroyed all file flags on encoding errors,
.br
making the file permanently unreadable and unwriteable (fixed Jan 2016)
.LI
printf(3) silently treated encoding errors in the format string
.br
as the end of the format string (fixed Jan 2016)
.LI
printf(3) accessed a NULL pointer when out of memory
or on encoding errors (fixed Jan 2016)
.LE
.MULB 5i 0.2i 4.8i
.PSPIC -R Images/ElpocaRock16.eps
.MULN
.S -4
.sp 4v
The scree of the Rock Glacier (ca. 2100m), Pocaterra Valley,
.br
and the Elpoca Mountain (3029m)
.S P
.MULE
.GPE_TIME 60
.ig
 - fgetwc(3)
    - set the error indicator when an encoding error occurs
      lib/libc/stdio : fgetwc.c 24 Dec 2015 12:55:39 -0700 (MST)
 - fputwc(3)
    - When encoding fails in fputwc(3), set the error indicator as required
      by POSIX and as FreeBSD, SunOS 10/11, and glibc also do it.  Note
      that an enquiry to the Austin Group led to the conclusion that this
      change probably violates the C standard: C and POSIX unintentionally
      conflict.  But the POSIX behaviour makes more sense (easier to write
      correct error handling code for it, and a lower risk that programs
      miss errors) and is much more widespread, and the Austin Group
      intends to approach the C committee in order to adjust the C standard.
      See:  http://austingroupbugs.net/view.php?id=1022
      While here, do not set errno a second time, wcrtomb(3) already did that,
      and it is required to do it by the standard.
      OK millert@ and tedu@, and jca@ no longer objects
      lib/libc/stdio : fputwc.c
      26 Jan 2016 06:57:02 -0700 (MST)
 - fgetws(3)
    - When errno happens to be EILSEQ upon entry to fgetws(3),
      and when the file ends without a terminating L'\en' character,
      fgetws(3) discarded any characters read and reported bogus EOF.
      Never inspect errno(2) unless right after an error occurred!
      OK millert@
      4 Jan 2016 09:14:19 -0700 (MST)
 - fgetwln(3)
    - ignores most encoding errors
 - printf(3)
    - Fix lots of bugs.
      1. When fprintf(fp, "...%ls...", ...) encounters an encoding error,
      do not destroy all the fp->_flags, which made the file permanently
      unreadable and unwriteable.
      2. Do not change fp->_flags at all in case of encoding errors.
      Neither the manual nor POSIX ask for it, no other conversions set the
      error indicator, and it isn't needed because the return value reports
      failure and must be checked anyway.
      3. Detect failure in mbrtowc(3), do not silently treat invalid bytes
      in the format string as the end of the format string.
      4. Detect failure of __find_arguments(), no matter whether due to
      out of memory conditions or encoding errors, and gracefully fail
      rather than accessing an invalid pointer.
      5. Remove the pointless and slightly dangerous errno = EILSEQ overrides
      after functions that already do that and are required by the standard
      to do so.
      OK jca@ on items 1, 2, and 5.
      OK millert@ on the complete diff.
      "Completely brutal mix of bugs."  deraadt@
      lib/libc/stdio : vfprintf.c
      4 Jan 2016 08:47:48 -0700 (MST)
..
.GPE_NEXT "More examples?"
.TITLE "Various other errors in the C library"
.BL
.LI
The character property tables contained no data whatsoever
.br
for characters in the range U+FF00 to U+10FFFF.  (fixed Oct 2015)
.LI
Due to a bug in mklocale(1), character type and width data was wrong
.br
for many exotic characters designating numbers.  (fixed May 2016)
.LI
The C library code parsing character property tables contained
.br
out of boundary memory access for corrupt input files.  (fixed Oct 2015)
.LE
.PSPIC Images/ElkSouth.eps
.GPE_SM "Repair of the Highwood River (1870m)\
 after the 2013 flood below the Elk Range (2750m)"
.GPE_TIME 45
.ig
 - properties
    - update all of en_US.UTF-8.src
      share/locale/ctype: en_US.UTF-8.src 31 Oct 2015 14:56:19 -0600 (MDT)
 - locale description file handling
    - Validate input files to prevent out of boundary accesses.
      lib/libc/locale: rune.c 6 Dec 2015 04:54:59 -0700 (MST)
 - mklocale(1)
    - Delete encoding code for the unused TODIGIT information.
      I'm not aware of plans to add any TODIGIT support,
      and when shown on tech@, people were more or less indifferent
      and showed confusion about what this code even did.
      But the encoding code was buggy, in particular lacking validity
      checks, and hence clobbered other important data, in particular
      character type and character width data, with consequences that
      are hard to judge.
      usr.bin/mklocale: mklocale.1 yacc.y
      8 May 2016 09:25:44 -0600 (MDT)
..
.GPE_NEXT "Another library?"
.TITLE "Examples of bugs in libedit"
.sp -0.5v
.BL
.LI
el_wgetc(3), el_wgets(3) etc. sometimes discarded valid bytes
after reading invalid bytes
.LI
el_getc(3) silently converted non-ASCII Unicode characters into bogus bytes
.LI
el_getc(3) didn't set errno(2) for out-of-range errors
.LI
el_getc(3) didn't set the return argument to the NUL byte on read errors
.LI
Several functions reading characters broken
on systems where wchar_t doesn't use UCS-4.
.LE
.P
All these bugs were found during a partial audit and fixed in March 2016.
.PSPIC Images/StormSouth.eps
.GPE_SM "Crumbling rock on the south face of Storm Mountain (3095m)"
.GPE_TIME 50
.ig
    - Fix the CHARSET_IS_UTF8 case in read_char().
      1. After reading an invalid byte sequence, do not throw away additional
      valid bytes; fix by me using mbrtowc(3), obsoleting utf8_islead().
      lib/libedit    : chartype.h read.c
      20 Mar 2016 11:19:48 -0600 (MDT)
    - Fix read_char() for the non-UTF-8 case, in particular for systems
      supporting other multibyte locales or having an internal representation
      of wchar_t that doesn't match UCS-4.
      No functional change on OpenBSD, but it makes the code less confusing.
      OK czarkoff@.
      lib/libedit    : read.c
      Sun, 20 Mar 2016 12:20:10 -0600 (MDT)
    - Fix the public interface function el_getc(3).
      On OpenBSD, the effects are to set the return argument to the NUL byte
      in case of a read failure (for robustness) and to properly set errno
      when the character is out of range and cannot be stored in a byte.
      Once we enable UTF-8, this will be needed to avoid returning bogus
      bytes for valid Unicode characters.
      On systems where the internal representation of wchar_t doesn't
      match UCS-4, breakage was potentially even worse.
      OK czarkoff@.
      lib/libedit    : chartype.h eln.c
      20 Mar 2016 13:14:30 -0600 (MDT)
..
.GPE_NEXT "Examples in stand-alone programs?"
.TITLE "Examples of bugs in various programs"
.BL
.LI
mandoc(1) violated ISO C99 by mixing putchar(3) and putwchar(3)
on the same stream,
.br
resulting in corrupt output on glibc (fixed July and September 2016)
.LI
mandoc(1) accepted UTF-16 surrogates in \e[uXXXX] escapes
.br
and silently produced invalid UTF-8 output (fixed Oct 2015)
.LI
mandoc(1) failed to apply bold and italic markup to non-ASCII characters
(fixed Oct 2015)
.LI
tmux(1) contained wrong display widths for various characters
.br
in its internal width tables (fixed Nov 2015)
.LI
ypldap(8) contained buggy hand-rolled UTF-8 validation code
that failed to actually validate
.br
but instead caused buffer overruns
and loss of input data for invalid input (fixed Apr 2016)
.LE
.MULB 5.8i 0.2i 4i
.PSPIC -R Images/SmutsCreek.eps
.MULN
.S -4
.sp 8v
Even the tiny Smuts Creek (ca. 1900m)
.br
was washed out and needed repair.
.P
Background: Spray Mountains (up to 3400m)
.S P
.MULE
.GPE_TIME 60
.ig
 - man(1)
    - reject surrogates in \e[uXXXX] escapes
      usr.bin/mandoc : mandoc.c 13 Oct 2015 17:30:42 -0600 (MDT)
    - apply bold and italic to all non-ASCII Unicode codepoints
      usr.bin/mandoc : term.c 23 Oct 2015 08:49:13 -0600 (MDT)
    - ISO C99 7.19.2.5 doesn't like mixing putchar(3) and putwchar(3) on
      the same stream, and actually, it fails spectacularly on glibc.
      Portability issue pointed out by Svyatoslav Mishyn <juef at openmailbox
      dot org> after testing on Void Linux.
      mdocml: main.c main.h term_ascii.c  8 Jul 2016 17:29:35 -0500 (EST)
 - tmux(1)
    - update the internal wcwidth(3) table
      usr.bin/tmux   : utf8.c 5 Nov 2015 09:44:25 -0700 (MST)
 - ypldap(8)
    - Simplify overengineered and buggy code that looked like as if it did
      some kind of UTF-8 validation, but actually didn't, but instead, for
      malformed UTF-8 input, caused buffer overruns in some cases and caused
      skipping of valid ASCII characters in other cases.
      Problem originally discovered and fix OK by stsp@.
      eric@ agrees with the direction.
      usr.sbin/ypldap: aldap.c
      27 Apr 2016 04:53:27 -0600 (MDT)
..
.GPE_NEXT "Something really important?"
.TITLE "The situation in OpenSSH"
.nr LspSave \n[Lsp]
.nr Lsp \n[Lsp]/2
.sp -1v
.BL
.LI
Input and output streams are treated as narrow throughout.
.LI
In most places, incoming text data is treated as opaque byte strings,
.br
simply passing it through, not even trying to validate or decode.
.LI
Where text data is interpreted, the code does not restrict itself to UTF-8,
.br
but attempts to support arbitrary encodings,
though not always in full generality.
.LI
Informational and diagnostic messages are written in ASCII throughout,
.br
which is compatible with many, but not with all encodings.
.LI
In scp(1) and sftp(1), \m[red]untrusted text data sent from the server\m[] -
for example file and directory names - could silently screw up
terminal settings on the client host in addition to wrong data being
displayed.  Part of that was fixed in May 2016 using validation and
sanitation techniques.  Some fixes could not yet be committed because
part of the output is produced in signal handlers.
.LI
For exotic encodings, several unknown bugs likely exist.
.LE
.GPE_MULB 5i
.BL
.LI
For maximum security, make sure
.br
both endpoints run OpenBSD
.br
(to avoid exposure to arbitrary locales)
.br
and set LC_CTYPE to UTF-8 on both sides.
.LI
sftp(1) libedit usage needs review with
.br
respect to multibyte character handling.
.LI
Overall, \m[red]auditing has barely begun\m[].
.LE
.MULN
.PSPIC Images/SheepValleyTrails.eps
.MULE
.nr Lsp \n[LspSave]
.GPE_TIME 120
.ig
 - usability and security issues with non-UTF-8 locales in base
    - ssh, scp, sftp
      To prevent screwing up terminal settings when printing to the
      terminal, for ASCII and UTF-8, escape bytes not forming characters
      and bytes forming non-printable characters with vis(3) VIS_OCTAL.
      For other character sets, abort printing of the current string in
      these cases.  In particular,
      * let scp(1) respect the local user's LC_CTYPE locale(1);
      * sanitize data received from the remote host;
      * sanitize filenames, usernames, and similar data even locally;
      * take character display widths into account for the progressmeter.
      This is believed to be sufficient to keep the local terminal safe
      on OpenBSD, but bad things can still happen on other systems with
      state-dependent locales because many places in the code print
      unencoded ASCII characters into the output stream.
      Using feedback from djm@ and martijn@,
      various aspects discussed with many others.
      deraadt@ says it should go in now, i probably already hesitated too long
        usr.bin/ssh    : progressmeter.c scp.c sftp-client.c sftp.c
        usr.bin/ssh/lib: Makefile
        usr.bin/ssh    : utf8.c utf8.h
        25 May 2016 17:48:45 -0600 (MDT)
      Fix two rare edge cases:
      1. If vasprintf() returns < 0, do not access a NULL pointer in snmprintf(),
      and do not free() the pointer returned from vasprintf() because on some
      systems other than OpenBSD, it might be a bogus pointer.
      2. If vasprintf() returns == 0, return 0 and "" rather than -1 and NULL.
      Besides, free(dst) is pointless after failure (not a bug).
      One half OK martijn@, the other half OK deraadt@;
      committing quickly before people get hurt.
        usr.bin/ssh    : utf8.c
        30 May 2016 06:05:56 -0600 (MDT)
      Even when only writing an unescaped character, the dst buffer may need to
      grow, or it would be overrun; issue found by tb@ with malloc.conf(5) 'C'.
      While here, reserve an additional byte for the terminating NUL
      up front such that we don't have to realloc() later just for that.
      OK tb@
        usr.bin/ssh    : utf8.c
        30 May 2016 06:57:21 -0600 (MDT)
      Backout rev. 1.43 for now.
      The function update_progress_meter() calls refresh_progress_meter()
      which calls snmprintf() which calls malloc(); but update_progress_meter()
      acts as the SIGALRM signal handler.
      "malloc(): error: recursive call" reported by sobrado@.
        usr.bin/ssh    : progressmeter.c
        30 May 2016 12:34:41 -0600 (MDT)
      support UTF-8 characters in ssh(1) banners using schwarze@'s
      safe fmprintf printer; bz#2058
      feedback schwarze@ ok dtucker@
        usr.bin/ssh    : ssh.c sshconnect2.c
        16 Jul 2016 22:20:16 -0600 (MDT)
..
.GPE_SECTION CONCLUSION
.GPE_NEXT "Conclusions"
.TITLE "Conclusions"
.BL
.LI
Multibyte character handling code in full generality is
huge, complicated, buggy, must error out on the slightest problem,
and using arbitrary encodings is never fully secure.
.LI
Consequently, providing full multibyte support in a general-purpose
POSIX C library is bad for usability, correctness, and security.
.LI
Instead, better leave full generality where it belongs:
in dedicated conversion and text processing software.
.LE
.GPE_MULB 5i
.BL
.LI
\m[red]Supporting UTF-8 only allows good usability, simplicity,
and hence a better chance for correctness, reliability, and security.\m[]
.LI
No silver bullet found yet to solve all implementation tasks in one
unified way, but every practical task encountered so far allowed a
simple solution.
.LI
A full toolbox may require up to ten different simple implementation
techniques, each one adapted to a specific set of requirements.
.LI
Look at the source code of the OpenBSD utilities mentioned here
for guidance on how to solve similar tasks.
.LE
.MULN
.PSPIC Images/MountRundle.eps
.GPE_SM "East End of Rundle (2550m) seen from Canmore"
.MULE
.GPE_TIME 90
.GPE_NEXT "Future directions"
.TITLE "Work to do in OpenBSD"
.BL
.LI
Small utilities almost done, but a few remain:
lam(1), pr(1), talk(1), tr(1), ...
.LI
Continue audit: libc, libedit
.LI
POSIX regular expression library, fnmatch(3), glob(3)
.LI
Work was barely started: ksh(1), OpenSSH
.LI
Work was not yet started: vi(1), mg(1)
.LI
Unknown status, maybe not much to do: libcurses, less(1), ...
.LE
.MULB 5.8i 0.2i 4i
.PSPIC -R Images/JohnsonTrail.eps
.MULN
.S -4
.sp 10v
Outlook from The Hump (2020m)
.br
along Johnson Trail (Hwy. 532)
.br
across the foothills towards the prairies
.S P
.MULE
.GPE_TIME 60
.GPE_NEXT "Who contributed?"
.GPE_MULB 7i 0.05i
.TITLE "Thanks!"
.BL
.LI
The OpenBSD Foundation provided financial support for part of my time.
No way to make so much progress without that \(em but without the
contributions by lots and lots of other developers, nothing would
have been achieved at all:
.LI
\fBMarc Espie\fP: initial import of partial multibyte handling code
.LE
.MULN
.PSPIC Images/PrincesIsland.eps
.GPE_SM "Bridge to Prince's Island, Calgary"
.MULE
.sp -1v
.nr LspSave \n[Lsp]
.nr Lsp \n[Lsp]*6/10
.BL
.LI
\m[red]Stefan Sperling\m[]: import of partial Citrus code;
contributed to many of the ideas explained here;
joint work, patch reviews, bug reports, many useful discussions
.\" citrus [mult], ypldap(8), man(1)
.LI
\m[red]Ted Unangst\m[]: developed many of the ideas explained here;
many patch reviews \" stdio, cut(1), fmt(1), ksh(1), rs(1), wc(1)
.LI
\fBAnthony Bentley\fP: contributed to many of the ideas explained here;
joint work, bugfixes, patch reviews, bug reports, many useful discussions
.\" citrus [mult], man(1) [2], uniq(1)
.LI
\fBAndrew Fresh\fP: gen_ctype_utf8.pl author and maintainer;
.br
bug reports and feedback on some patches \" pod2man(1), mklocale(1)
.LI
Martijn van Duren: fix rev(1), wall(1), write(1);
many code reviews for libedit;
some initial ideas and lots of feedback for scp(1) and sftp(1)
.\" and patch reviews for stdio and xterm(1)
.LI
S\('ebastien Marie:
joint work, some bugfixes, patch reviews, useful discussions
.\" citrus [mult], ksh(1), write(1), man(1)
.LI
Vadim Zhukov: review and meticulously refine the TODO list;
.br
feedback concerning some of the ideas explained here;
some patch reviews \" cut(1), uniq(1)
.LI
Todd Miller: large numbers of patch reviews in libc and utilities
.\" libc/stdio [6], libc/locale, libedit,
.\" fmt(1), ps(1), mklocale(1), wall(1), wc(1)
.sp -1v
.LE
.nr Lsp \n[LspSave]
.
.GPE_TIME 80
.GPE_NEXT "Who contributed?"
.MULB 8.3i 0.2i 1.5i
.TITLE "Thanks! (2)"
.BL
.LI
Theo de Raadt: support for the OpenSSH, stdio, and citrus patches;
support for the general direction with small utilities;
a few patches and patch reviews \" ftpd(8), rs(1), w(1)
.LI
Dmitrij Czarkoff: many code reviews for libedit
and patch reviews for utilities  \" cut(1), fold(1), ls(1), ps(1), rs(1)
.LI
Damien Miller: patch to allow UTF-8 in ssh(1) banners
.br
and much support working on OpenSSH code in general
.LI
Nicholas Marriott: UTF-8 work in tmux(1);
.br
feedback concerning some of the ideas explained here
.LI
Christian Weisgerber:
feedback concerning some of the ideas explained here;
important suggestion and patch review for xterm(1)
.LI
Peter Hessler: important contributions to some of the ideas explained here
.LE
.SUBTITLE "Additional patches, patch reviews, help and feedback from:"
Darren Tucker,                   \" feedback on OpenSSH patches
Eric Faurot,                     \" review of ypldap(8) patch
Giannis Tsaraias,                \" review fold(1) patch, tested with afl(1)
Jason McIntyre,                  \" help with manual pages
J\('er\('emie Courr\(`eges-Anglas,  \" review of four patches for stdio
Jonathan Gray,                   \" bug fix in locale/rune.c
Martin Natano,                   \" feedback regarding ls(1)
Martin Pieuchot,                 \" review citrus and ksh(1) patches
Masahiko YASUOKA,                \" review ls(1) patch
Masao UEBAYASHI,                 \" feedback concerning some of the basic ideas
Matthieu Herrb,                  \" review xterm(1) patch
Philip Guenther,                 \" review of stdio patch
Stuart Henderson,                \" review ls(1) patch
Theo B\(:uhler,                  \" review OpenSSH patch
Tobias St\(:ockmann (OpenBSD),   \" bug fix and patch reviews in locale/rune.c
Andrey Chernov (FreeBSD),        \" important help with fgetwln(3)
Christos Zoulas (NetBSD),        \" maintaining libedit, checking patches
Svyatoslav Mishyn (Void Linux),  \" bug report: mixing put[w]char in man(1)
Christian Heckendorf,            \" some code reviews for libedit
Frederic Nowak,                  \" some initial ideas for ksh(1) emacs mode
Matthew Martin, ...              \" bug reports for fold(1)
.MULN
.sp 1i
.PSPIC Images/ElbowFlower1.eps
.sp 1i
.PSPIC Images/ElbowFlower2.eps
.MULE
.GPE_TIME 10
.ds gpe_next The end.
.\"so fin/images.roff