From 26256f285c4258d428687df94f37edac78d4de48 Mon Sep 17 00:00:00 2001
From: Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
Date: Wed, 31 Jan 2024 18:21:13 +0000
Subject: [PATCH] Add AnyP::Uri::Decode() (#1626)

Implement percent-decoding according to RFC 3986. Currently unused.
TODO: Upgrade rfc1738_unescape() callers.
---
 src/anyp/Uri.cc               | 23 +++++++++++++++++
 src/anyp/Uri.h                |  3 +++
 src/base/CharacterSet.cc      |  8 ++++++
 src/base/CharacterSet.h       |  3 +++
 src/http/one/RequestParser.cc |  4 +--
 src/tests/testURL.cc          | 48 +++++++++++++++++++++++++++++++++++
 6 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc
index 2f9a7191f41..aba1bab5593 100644
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -81,6 +81,29 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
     return output;
 }
 
+SBuf
+AnyP::Uri::Decode(const SBuf &buf)
+{
+    // everything except '%' is copied to the result verbatim
+    static const auto literalChars = CharacterSet("percent", "%").complement("unencoded");
+    SBuf decoded;
+    Parser::Tokenizer parser(buf);
+    while (!parser.atEnd()) {
+        SBuf literalRun;
+        if (parser.prefix(literalRun, literalChars))
+            decoded.append(literalRun);
+
+        if (!parser.skip('%'))
+            continue; // not at '%', so the input is exhausted
+        // a '%' must be followed by two hex digits; parse them one at a time
+        int64_t highNibble = 0, lowNibble = 0;
+        if (!parser.int64(highNibble, 16, false, 1) || !parser.int64(lowNibble, 16, false, 1))
+            throw TextException("invalid pct-encoded triplet", Here());
+        decoded.append(static_cast<char>((highNibble << 4) | lowNibble));
+    }
+    return decoded;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h
index a90a3b7d471..81090e63cd8 100644
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -113,6 +113,9 @@ class Uri
     /// the provided set of expected characters.
     static SBuf Encode(const SBuf &, const CharacterSet &expected);
 
+    /// %-decode the given buffer; throws TextException on an invalid pct-encoded triplet
+    static SBuf Decode(const SBuf &);
+
     /**
      * The authority-form URI for currently stored values.
      *
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc
index efcb057690d..3793b7c745c 100644
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -160,3 +160,11 @@ CharacterSet::ETAGC("ETAGC", {{0x21,0x21},{0x23,0x7e},{0x80,0xff}}),
 CharacterSet::TOKEN68C("TOKEN68C","-._~+/0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 ;
 
+const CharacterSet &
+CharacterSet::RFC3986_UNRESERVED()
+{
+    // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" (RFC 3986)
+    static const CharacterSet * const unreserved = new CharacterSet("RFC3986_UNRESERVED", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~");
+    return *unreserved; // built on first call and never deleted here
+}
+
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h
index a2439d48038..880072c0306 100644
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -118,6 +118,9 @@ class CharacterSet
     // token68 (internal characters only, excludes '=' terminator)
     static const CharacterSet TOKEN68C;
 
+    /// RFC 3986 section 2.3 "unreserved" URI characters: ALPHA / DIGIT / "-" / "." / "_" / "~"
+    static const CharacterSet &RFC3986_UNRESERVED();
+
 private:
     /** index of characters in this set
      *
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index 8dcd1e7eacb..6d73d4c0281 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -98,9 +98,7 @@ UriValidCharacters()
         CharacterSet("gen-delims", ":/?#[]@") +
         CharacterSet("sub-delims", "!$&'()*+,;=") +
         // RFC 3986 section 2.3 - unreserved characters
-        CharacterSet::ALPHA +
-        CharacterSet::DIGIT +
-        CharacterSet("unreserved", "-._~") +
+        CharacterSet::RFC3986_UNRESERVED() +
         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
         CharacterSet("pct-encoded", "%") +
         CharacterSet::HEXDIG;
diff --git a/src/tests/testURL.cc b/src/tests/testURL.cc
index 2600a72a200..d50e22db77f 100644
--- a/src/tests/testURL.cc
+++ b/src/tests/testURL.cc
@@ -9,8 +9,11 @@
 #include "squid.h"
 
 #include "anyp/Uri.h"
+#include "base/CharacterSet.h"
+#include "base/TextException.h"
 #include "compat/cppunit.h"
 #include "debug/Stream.h"
+#include "sbuf/Stream.h"
 #include "unitTestMain.h"
 
 #include <cppunit/TestAssert.h>
@@ -25,11 +28,13 @@ class TestUri : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE(TestUri);
     CPPUNIT_TEST(testConstructScheme);
     CPPUNIT_TEST(testDefaultConstructor);
+    CPPUNIT_TEST(testEncoding);
     CPPUNIT_TEST_SUITE_END();
 
 protected:
     void testConstructScheme();
     void testDefaultConstructor();
+    void testEncoding();
 };
 CPPUNIT_TEST_SUITE_REGISTRATION(TestUri);
 
@@ -81,6 +86,49 @@ TestUri::testDefaultConstructor()
     delete urlPointer;
 }
 
+void
+TestUri::testEncoding()
+{
+    // {decoded, encoded} pairs: Decode(second) == first and Encode(first) == second
+    const std::vector< std::pair<SBuf, SBuf> > basicTestCases = {
+        {SBuf(""), SBuf("")},
+        {SBuf("foo"), SBuf("foo")},
+        {SBuf("%"), SBuf("%25")},
+        {SBuf("%foo"), SBuf("%25foo")},
+        {SBuf("foo%"), SBuf("foo%25")},
+        {SBuf("fo%o"), SBuf("fo%25o")},
+        {SBuf("fo%%o"), SBuf("fo%25%25o")},
+        {SBuf("fo o"), SBuf("fo%20o")},
+        {SBuf("?1"), SBuf("%3F1")},
+        {SBuf("\377"), SBuf("%FF")},
+        {SBuf("fo\0o", 4), SBuf("fo%00o")},
+    };
+    for (const auto &testCase: basicTestCases) {
+        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
+    }
+
+    // malformed triplets: '%' followed by fewer than two hex digits or by a non-hex byte
+    const auto invalidEncodings = {
+        SBuf("%"),
+        SBuf("%%"),
+        SBuf("%%%"),
+        SBuf("%1"),
+        SBuf("%1Z"),
+        SBuf("%1\000", 3), // explicit length keeps the NUL as the second "digit"
+        SBuf("%1\377"),
+        SBuf("%\0002", 3),
+        SBuf("%\3772"),
+    };
+    for (const auto &invalidEncoding: invalidEncodings) {
+        // test various input positions of an invalid escape sequence
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+    }
+}
+
 int
 main(int argc, char *argv[])
 {