From 18d1b04b7adb1c85515b7663a81d3ca18f255d23 Mon Sep 17 00:00:00 2001
From: ArthurSonzogni
Date: Thu, 17 Jun 2021 23:45:17 +0200
Subject: [PATCH] terminal_input_parser: validate UTF8.

Make sure code points parsed are always valid UTF8.
Don't assume stdin is filled with valid data.
Check for overlong UTF8 and add some tests.

The fuzzer has reached the following coverage:
- cov : 204
- ft : 754
- corp : 62/12257b
- lim : 2798
- exec/s : 1748
- rss : 445Mb
- L : 155/1946
- MS : 3

Fixed:https://github.com/ArthurSonzogni/FTXUI/issues/118
---
 src/ftxui/component/terminal_input_parser.cpp | 70 +++++++++++++++-
 .../component/terminal_input_parser_test.cpp  | 82 +++++++++++++++++++
 2 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/src/ftxui/component/terminal_input_parser.cpp b/src/ftxui/component/terminal_input_parser.cpp
index 63c57ae..fb7e3ad 100644
--- a/src/ftxui/component/terminal_input_parser.cpp
+++ b/src/ftxui/component/terminal_input_parser.cpp
@@ -93,14 +93,78 @@ TerminalInputParser::Output TerminalInputParser::Parse() {
   return ParseUTF8();
 }
 
+// Code point <-> UTF-8 conversion
+//
+// ┏━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+// ┃Byte 1  ┃Byte 2  ┃Byte 3  ┃Byte 4  ┃
+// ┡━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+// │0xxxxxxx│        │        │        │
+// ├────────┼────────┼────────┼────────┤
+// │110xxxxx│10xxxxxx│        │        │
+// ├────────┼────────┼────────┼────────┤
+// │1110xxxx│10xxxxxx│10xxxxxx│        │
+// ├────────┼────────┼────────┼────────┤
+// │11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
+// └────────┴────────┴────────┴────────┘
+//
+// A sequence is illegal ("overlong") when a shorter representation of the
+// same code point exists.
 TerminalInputParser::Output TerminalInputParser::ParseUTF8() {
   unsigned char head = static_cast<unsigned char>(Current());
-  for (int i = 0; i < 3; ++i, head <<= 1) {
-    if ((head & 0b11000000) != 0b11000000)
-      break;
+  unsigned char selector = 0b1000'0000;
+
+  // The non code-point part of the first byte.
+  unsigned char mask = selector;
+
+  // Find the first zero in the first byte.
+  int first_zero = 8;
+  for (int i = 0; i < 8; ++i) {
+    mask |= selector;
+    if (head & selector) {
+      selector >>= 1;
+      continue;
+    }
+    first_zero = i;
+    break;
+  }
+
+  // Accumulate the value of the first byte (char32_t: wchar_t may be 16 bits).
+  char32_t value = head & ~mask;
+
+  // Lone continuation byte, or a lead byte announcing 5 or more bytes.
+  if (first_zero == 1 || first_zero >= 5)
+    return DROP;
+
+  // Multi byte UTF-8.
+  for (int i = 2; i <= first_zero; ++i) {
     if (!Eat())
       return UNCOMPLETED;
+
+    // Invalid continuation byte.
+    head = static_cast<unsigned char>(Current());
+    if ((head & 0b1100'0000) != 0b1000'0000)
+      return DROP;
+    value <<= 6;
+    value += head & 0b0011'1111;
   }
+
+  // Check for overlong UTF8 encoding.
+  int extra_byte;
+  if (value <= 0b000'0000'0111'1111) {
+    extra_byte = 0;
+  } else if (value <= 0b000'0111'1111'1111) {
+    extra_byte = 1;
+  } else if (value <= 0b1111'1111'1111'1111) {
+    extra_byte = 2;
+  } else if (value <= 0b1'0000'1111'1111'1111'1111) {
+    extra_byte = 3;
+  } else {
+    return DROP;
+  }
+
+  if (extra_byte != position_)
+    return DROP;
+
   return CHARACTER;
 }
 
diff --git a/src/ftxui/component/terminal_input_parser_test.cpp b/src/ftxui/component/terminal_input_parser_test.cpp
index f2ca2d8..17d5765 100644
--- a/src/ftxui/component/terminal_input_parser_test.cpp
+++ b/src/ftxui/component/terminal_input_parser_test.cpp
@@ -149,6 +149,88 @@ TEST(Event, MouseRightClick) {
   EXPECT_FALSE(event_receiver->Receive(&received));
 }
 
+TEST(Event, UTF8) {
+  struct {
+    std::vector<unsigned char> input;
+    bool valid;
+  } kTestCase[] = {
+      // Basic characters.
+      {{'a'}, true},
+      {{'z'}, true},
+      {{'A'}, true},
+      {{'Z'}, true},
+      {{'0'}, true},
+      {{'9'}, true},
+
+      // UTF-8 of various size:
+      {{0b0100'0001}, true},
+      {{0b1100'0010, 0b1000'0000}, true},
+      {{0b1110'0010, 0b1000'0000, 0b1000'0000}, true},
+      {{0b1111'0010, 0b1000'0000, 0b1000'0000, 0b1000'0000}, true},
+
+      // Overlong UTF-8 encoding:
+      {{0b1100'0000, 0b1000'0000}, false},
+      {{0b1110'0000, 0b1000'0000, 0b1000'0000}, false},
+      {{0b1111'0000, 0b1000'0000, 0b1000'0000, 0b1000'0000}, false},
+
+      // Test limits in between the various legal regions
+      // https://unicode.org/versions/corrigendum1.html
+      // Limit in between the valid and invalid sequences.
+      // {{0x7F}, true}, => Special sequence.
+      {{0x80}, false},
+      // ---
+      {{0xC1, 0x80}, false},
+      {{0xC2, 0x7F}, false},
+      {{0xC2, 0x80}, true},
+      // ---
+      {{0xDF, 0xBF}, true},
+      {{0xDF, 0xC0}, false},
+      // ---
+      {{0xE0, 0x9F, 0x80}, false},
+      {{0xE0, 0xA0, 0x7F}, false},
+      {{0xE0, 0xA0, 0x80}, true},
+      // ---
+      {{0xE0, 0xBF, 0xBF}, true},
+      // ---
+      {{0xE1, 0x7F, 0x80}, false},
+      {{0xE1, 0x80, 0x7f}, false},
+      {{0xE1, 0x80, 0x80}, true},
+      // --
+      {{0xEF, 0xBF, 0xBF}, true},
+      {{0xEF, 0xC0, 0xBF}, false},
+      {{0xEF, 0xBF, 0xC0}, false},
+      // --
+      {{0xF0, 0x90, 0x80}, false},
+      {{0xF0, 0x8F, 0x80, 0x80}, false},
+      {{0xF0, 0x90, 0x80, 0x7F}, false},
+      {{0xF0, 0x90, 0x80, 0x80}, true},
+      // --
+      {{0xF1, 0x80, 0x80, 0x80}, true},
+      // --
+      {{0xF1, 0xBF, 0xBF, 0xBF}, true},
+      // --
+      {{0xF2, 0x80, 0x80, 0x80}, true},
+      // --
+      {{0xF4, 0x8F, 0xBF, 0xBF}, true},
+      {{0xF4, 0x90, 0xBF, 0xBF}, false},
+
+  };
+  for (auto test : kTestCase) {
+    auto event_receiver = MakeReceiver<Event>();
+    {
+      auto parser = TerminalInputParser(event_receiver->MakeSender());
+      for (auto input : test.input)
+        parser.Add(input);
+    }
+    Event received;
+    if (test.valid) {
+      EXPECT_TRUE(event_receiver->Receive(&received));
+      EXPECT_TRUE(received.is_character());
+    }
+    EXPECT_FALSE(event_receiver->Receive(&received));
+  }
+}
+
 // Copyright 2020 Arthur Sonzogni. All rights reserved.
 // Use of this source code is governed by the MIT license that can be found in
 // the LICENSE file.