From 26256f285c4258d428687df94f37edac78d4de48 Mon Sep 17 00:00:00 2001
From: Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
Date: Wed, 31 Jan 2024 18:21:13 +0000
Subject: [PATCH] Add AnyP::Uri::Decode() (#1626)

Implement percent-decoding according to RFC 3986.

Currently unused. TODO: Upgrade rfc1738_unescape() callers.
---
Reviewer notes (ignored by git-am; not part of the commit message):
- Line structure and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. Verify context-line whitespace
  against the tree (or apply with --ignore-whitespace).
- The last context #include of the first testURL.cc hunk is presumed to be
  <cppunit/TestAssert.h>; its operand was lost in the collapsed copy. Confirm.
- The invalid-encoding test case "%1\000" now uses length 3 (was 2), so the
  NUL byte is really part of the tested input instead of duplicating "%1".

 src/anyp/Uri.cc               | 23 +++++++++++++++++
 src/anyp/Uri.h                |  3 +++
 src/base/CharacterSet.cc      |  8 ++++++
 src/base/CharacterSet.h       |  3 +++
 src/http/one/RequestParser.cc |  4 +--
 src/tests/testURL.cc          | 48 +++++++++++++++++++++++++++++++++++
 6 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc
index 2f9a7191f41..aba1bab5593 100644
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -81,6 +81,29 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
     return output;
 }
 
+SBuf
+AnyP::Uri::Decode(const SBuf &buf)
+{
+    SBuf output;
+    Parser::Tokenizer tok(buf);
+    while (!tok.atEnd()) {
+        SBuf token;
+        static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
+        if (tok.prefix(token, unencodedChars))
+            output.append(token);
+
+        // we are either at '%' or at end of input
+        if (tok.skip('%')) {
+            int64_t hex1 = 0, hex2 = 0;
+            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
+                output.append(static_cast<char>((hex1 << 4) | hex2));
+            else
+                throw TextException("invalid pct-encoded triplet", Here());
+        }
+    }
+    return output;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h
index a90a3b7d471..81090e63cd8 100644
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -113,6 +113,9 @@ class Uri
     /// the provided set of expected characters.
     static SBuf Encode(const SBuf &, const CharacterSet &expected);
 
+    /// %-decode the given buffer; throws TextException on an invalid pct-encoded triplet
+    static SBuf Decode(const SBuf &);
+
     /**
      * The authority-form URI for currently stored values.
      *
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc
index efcb057690d..3793b7c745c 100644
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -160,3 +160,11 @@ CharacterSet::ETAGC("ETAGC", {{0x21,0x21},{0x23,0x7e},{0x80,0xff}}),
 CharacterSet::TOKEN68C("TOKEN68C","-._~+/0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 ;
 
+const CharacterSet &
+CharacterSet::RFC3986_UNRESERVED()
+{
+    // RFC 3986: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    static const auto chars = new CharacterSet("RFC3986_UNRESERVED", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~");
+    return *chars;
+}
+
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h
index a2439d48038..880072c0306 100644
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -118,6 +118,9 @@ class CharacterSet
     // token68 (internal characters only, excludes '=' terminator)
     static const CharacterSet TOKEN68C;
 
+    /// allowed URI characters that do not have a reserved purpose, RFC 3986
+    static const CharacterSet &RFC3986_UNRESERVED();
+
 private:
     /** index of characters in this set
      *
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index 8dcd1e7eacb..6d73d4c0281 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -98,9 +98,7 @@ UriValidCharacters()
         CharacterSet("gen-delims", ":/?#[]@") +
         CharacterSet("sub-delims", "!$&'()*+,;=") +
         // RFC 3986 section 2.3 - unreserved characters
-        CharacterSet::ALPHA +
-        CharacterSet::DIGIT +
-        CharacterSet("unreserved", "-._~") +
+        CharacterSet::RFC3986_UNRESERVED() +
         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
         CharacterSet("pct-encoded", "%") +
         CharacterSet::HEXDIG;
diff --git a/src/tests/testURL.cc b/src/tests/testURL.cc
index 2600a72a200..d50e22db77f 100644
--- a/src/tests/testURL.cc
+++ b/src/tests/testURL.cc
@@ -9,8 +9,11 @@
 #include "squid.h"
 
 #include "anyp/Uri.h"
+#include "base/CharacterSet.h"
+#include "base/TextException.h"
 #include "compat/cppunit.h"
 #include "debug/Stream.h"
+#include "sbuf/Stream.h"
 #include "unitTestMain.h"
 
 #include <cppunit/TestAssert.h>
@@ -25,11 +28,13 @@ class TestUri : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE(TestUri);
     CPPUNIT_TEST(testConstructScheme);
     CPPUNIT_TEST(testDefaultConstructor);
+    CPPUNIT_TEST(testEncoding);
     CPPUNIT_TEST_SUITE_END();
 
 protected:
     void testConstructScheme();
     void testDefaultConstructor();
+    void testEncoding();
 };
 
 CPPUNIT_TEST_SUITE_REGISTRATION(TestUri);
@@ -81,6 +86,49 @@ TestUri::testDefaultConstructor()
     delete urlPointer;
 }
 
+void
+TestUri::testEncoding()
+{
+    const std::vector< std::pair<SBuf, SBuf> > basicTestCases = {
+        {SBuf(""), SBuf("")},
+        {SBuf("foo"), SBuf("foo")},
+        {SBuf("%"), SBuf("%25")},
+        {SBuf("%foo"), SBuf("%25foo")},
+        {SBuf("foo%"), SBuf("foo%25")},
+        {SBuf("fo%o"), SBuf("fo%25o")},
+        {SBuf("fo%%o"), SBuf("fo%25%25o")},
+        {SBuf("fo o"), SBuf("fo%20o")},
+        {SBuf("?1"), SBuf("%3F1")},
+        {SBuf("\377"), SBuf("%FF")},
+        {SBuf("fo\0o", 4), SBuf("fo%00o")},
+    };
+
+    for (const auto &testCase: basicTestCases) {
+        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
+    }
+
+    const auto invalidEncodings = {
+        SBuf("%"),
+        SBuf("%%"),
+        SBuf("%%%"),
+        SBuf("%1"),
+        SBuf("%1Z"),
+        SBuf("%1\000", 3),
+        SBuf("%1\377"),
+        SBuf("%\0002", 3),
+        SBuf("%\3772"),
+    };
+
+    for (const auto &invalidEncoding: invalidEncodings) {
+        // test various input positions of an invalid escape sequence
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+    }
+}
+
 int
 main(int argc, char *argv[])
 {