Skip to content

Commit

Permalink
Address Web Discovery feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
DJAndries committed Jan 10, 2025
1 parent 962e75a commit c7089e1
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 163 deletions.
38 changes: 13 additions & 25 deletions browser/web_discovery/content_scraper_browsertest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "base/path_service.h"
#include "base/test/bind.h"
#include "base/test/scoped_feature_list.h"
#include "base/test/values_test_util.h"
#include "brave/components/constants/brave_paths.h"
#include "brave/components/web_discovery/browser/patterns.h"
#include "brave/components/web_discovery/browser/server_config_loader.h"
Expand Down Expand Up @@ -199,34 +200,25 @@ IN_PROC_BROWSER_TEST_F(WebDiscoveryContentScraperTest, RendererScrape) {

ASSERT_EQ(fields->size(), 2u);

const auto* href_value = (*fields)[0].FindString("href");
const auto* text_value = (*fields)[0].FindString("text");
const auto* query_value_str = (*fields)[0].FindString("q");
ASSERT_TRUE(href_value);
ASSERT_TRUE(text_value);
ASSERT_TRUE(query_value_str);
EXPECT_EQ(*href_value, "https://example.com/foo1");
EXPECT_EQ(*text_value, "Foo1");
EXPECT_EQ(*query_value_str, "A query");

href_value = (*fields)[1].FindString("href");
text_value = (*fields)[1].FindString("text");
base::ExpectDictStringValue("https://example.com/foo1",
(*fields)[0], "href");
base::ExpectDictStringValue("Foo1", (*fields)[0], "text");
base::ExpectDictStringValue("A query", (*fields)[0], "q");

base::ExpectDictStringValue("https://example.com/foo2",
(*fields)[1], "href");
base::ExpectDictStringValue("Foo2", (*fields)[1], "text");

const auto* query_value = (*fields)[1].Find("q");
ASSERT_TRUE(href_value);
ASSERT_TRUE(text_value);
ASSERT_TRUE(query_value);
EXPECT_EQ(*href_value, "https://example.com/foo2");
EXPECT_EQ(*text_value, "Foo2");
EXPECT_TRUE(query_value->is_none());

field_map_it = scrape_result->fields.find("dont>match");
ASSERT_TRUE(field_map_it != scrape_result->fields.end());
fields = &field_map_it->second;

ASSERT_EQ(fields->size(), 1u);
const auto* url_query_value = (*fields)[0].FindString("q2");
ASSERT_TRUE(url_query_value);
EXPECT_EQ(*url_query_value, "testquery");
base::ExpectDictStringValue("testquery", (*fields)[0], "q2");
}();
run_loop_->Quit();
}));
Expand Down Expand Up @@ -255,12 +247,8 @@ IN_PROC_BROWSER_TEST_F(WebDiscoveryContentScraperTest, RustParseAndScrape) {

ASSERT_EQ(fields->size(), 1u);

const auto* text_value = (*fields)[0].FindString("text");
const auto* input_value = (*fields)[0].FindString("input");
ASSERT_TRUE(text_value);
ASSERT_TRUE(input_value);
EXPECT_EQ(*text_value, "Foo3");
EXPECT_EQ(*input_value, "Foo4");
base::ExpectDictStringValue("Foo3", (*fields)[0], "text");
base::ExpectDictStringValue("Foo4", (*fields)[0], "input");

field_map_it = scrape_result->fields.find("dont>match");
ASSERT_TRUE(field_map_it != scrape_result->fields.end());
Expand Down
18 changes: 13 additions & 5 deletions components/web_discovery/browser/hash_detection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

#include "base/containers/fixed_flat_map.h"
#include "brave/components/web_discovery/browser/hash_detection_matrix.h"
#include "brave/components/web_discovery/browser/util.h"

namespace web_discovery {

Expand All @@ -32,18 +31,27 @@ constexpr double kClassifierThreshold = 0.015;

} // namespace

bool IsHashLikely(std::string value, double threshold_multiplier) {
TransformToAlphanumeric(value);

bool IsHashLikely(std::string_view value, double threshold_multiplier) {
if (value.empty()) {
return false;
}

double log_prob_sum = 0.0;
size_t add_count = 0;
for (size_t i = 0; i < value.length() - 1; i++) {
if (!std::isalnum(value[i])) {
continue;
}
size_t next_char_i;
for (next_char_i = i + 1;
next_char_i < value.length() && !std::isalnum(value[next_char_i]);
next_char_i++) {
}
if (!std::isalnum(value[next_char_i])) {
break;
}
auto matrix_pos_a = kTokenMap.find(value[i]);
auto matrix_pos_b = kTokenMap.find(value[i + 1]);
auto matrix_pos_b = kTokenMap.find(value[next_char_i]);
if (matrix_pos_a == kTokenMap.end() || matrix_pos_b == kTokenMap.end()) {
continue;
}
Expand Down
4 changes: 2 additions & 2 deletions components/web_discovery/browser/hash_detection.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
#ifndef BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_HASH_DETECTION_H_
#define BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_HASH_DETECTION_H_

#include <string>
#include <string_view>

namespace web_discovery {

// Uses a pre-trained Markov chain classifier to detect the likelihood
// of a hash in a given piece of text. Used in privacy guard functions
// for detecting potentially private URLs/queries.
bool IsHashLikely(std::string value, double probability_multiplier = 1.0);
bool IsHashLikely(std::string_view value, double probability_multiplier = 1.0);

} // namespace web_discovery

Expand Down
14 changes: 7 additions & 7 deletions components/web_discovery/browser/payload_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ std::vector<base::Value::Dict> GenerateQueryPayloads(

base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config,
std::string date_hour) {
base::Value::Dict payload;
payload.Set(kActionKey, kAliveAction);
base::Value::Dict inner_payload;
inner_payload.Set(kStatusFieldName, true);
inner_payload.Set(kTimestampFieldName, date_hour);
inner_payload.Set(kCountryCodeFieldName, server_config.location);
payload.Set(kInnerPayloadKey, std::move(inner_payload));
auto inner_payload = base::Value::Dict()
.Set(kStatusFieldName, true)
.Set(kTimestampFieldName, date_hour)
.Set(kCountryCodeFieldName, server_config.location);
auto payload = base::Value::Dict()
.Set(kActionKey, kAliveAction)
.Set(kInnerPayloadKey, std::move(inner_payload));
return payload;
}

Expand Down
110 changes: 40 additions & 70 deletions components/web_discovery/browser/payload_generator_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <memory>
#include <utility>

#include "base/test/values_test_util.h"
#include "brave/components/web_discovery/browser/content_scraper.h"
#include "brave/components/web_discovery/browser/patterns.h"
#include "testing/gtest/include/gtest/gtest.h"
Expand Down Expand Up @@ -69,81 +70,62 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateQueryPayloads) {
auto scrape_result = std::make_unique<PageScrapeResult>(test_url, "test_id");

std::vector<base::Value::Dict> single_dicts;
base::Value::Dict single_dict1;
single_dict1.Set("ab", "value1");
single_dict1.Set("cd", "value2");
single_dicts.push_back(std::move(single_dict1));
base::Value::Dict single_dict2;
single_dict2.Set("ef", "value3");
single_dict2.Set("gh", "value4");
single_dicts.push_back(std::move(single_dict2));
single_dicts.push_back(
base::Value::Dict().Set("ab", "value1").Set("cd", "value2"));
single_dicts.push_back(
base::Value::Dict().Set("ef", "value3").Set("gh", "value4"));
scrape_result->fields[".single-element"] = std::move(single_dicts);

std::vector<base::Value::Dict> result_dicts1;
base::Value::Dict result_dict1, result_dict2, result_dict3, result_dict4,
result_dict5;
result_dict1.Set("njk", "joinvalue1");
result_dict2.Set("abc", "joinvalue2");
result_dict3.Set("njk", "joinvalue3");
result_dict4.Set("abc", "joinvalue4");
result_dicts1.push_back(std::move(result_dict1));
result_dicts1.push_back(std::move(result_dict2));
result_dicts1.push_back(std::move(result_dict3));
result_dicts1.push_back(std::move(result_dict4));
result_dicts1.push_back(base::Value::Dict().Set("njk", "joinvalue1"));
result_dicts1.push_back(base::Value::Dict().Set("abc", "joinvalue2"));
result_dicts1.push_back(base::Value::Dict().Set("njk", "joinvalue3"));
result_dicts1.push_back(base::Value::Dict().Set("abc", "joinvalue4"));

std::vector<base::Value::Dict> result_dicts2;
result_dict5.Set("qurl", "https://example.com/test1");
result_dicts2.push_back(std::move(result_dict5));
result_dicts2.push_back(
base::Value::Dict().Set("qurl", "https://example.com/test1"));
scrape_result->fields["#results"] = std::move(result_dicts1);
scrape_result->fields["qurl"] = std::move(result_dicts2);

auto payloads = GenerateQueryPayloadsHelper(std::move(scrape_result));
ASSERT_EQ(payloads.size(), 3u);

const auto* payload = &payloads[0];
const auto* action = payload->FindString(kActionKey);
const auto* inner_payload = payload->FindDict(kInnerPayloadKey);
ASSERT_TRUE(action && inner_payload);
EXPECT_EQ(*action, "single_action");
ASSERT_TRUE(inner_payload);
base::ExpectDictStringValue("single_action", *payload, kActionKey);

EXPECT_EQ(inner_payload->size(), 3u);

const auto* ctry = inner_payload->FindString("ctry");
const auto* val1 = inner_payload->FindString("ab");
const auto* val2 = inner_payload->FindString("cd");
ASSERT_TRUE(ctry && val1 && val2);
EXPECT_EQ(*ctry, "us");
EXPECT_EQ(*val1, "value1");
EXPECT_EQ(*val2, "value2");
base::ExpectDictStringValue("us", *inner_payload, "ctry");
base::ExpectDictStringValue("value1", *inner_payload, "ab");
base::ExpectDictStringValue("value2", *inner_payload, "cd");

payload = &payloads[1];
action = payload->FindString(kActionKey);
inner_payload = payload->FindDict(kInnerPayloadKey);
ASSERT_TRUE(action && inner_payload);
EXPECT_EQ(*action, "single_action");
ASSERT_TRUE(inner_payload);
base::ExpectDictStringValue("single_action", *payload, kActionKey);

EXPECT_EQ(inner_payload->size(), 3u);

ctry = inner_payload->FindString("ctry");
val1 = inner_payload->FindString("ef");
val2 = inner_payload->FindString("gh");
ASSERT_TRUE(ctry && val1 && val2);
EXPECT_EQ(*ctry, "us");
EXPECT_EQ(*val1, "value3");
EXPECT_EQ(*val2, "value4");
base::ExpectDictStringValue("us", *inner_payload, "ctry");
base::ExpectDictStringValue("value3", *inner_payload, "ef");
base::ExpectDictStringValue("value4", *inner_payload, "gh");

payload = &payloads[2];
action = payload->FindString(kActionKey);
inner_payload = payload->FindDict(kInnerPayloadKey);
ASSERT_TRUE(action && inner_payload);
EXPECT_EQ(*action, "query");
ASSERT_TRUE(inner_payload);
base::ExpectDictStringValue("query", *payload, kActionKey);

EXPECT_EQ(inner_payload->size(), 2u);

const auto* qurl = inner_payload->FindString("qurl");
const auto* r_dict = inner_payload->FindDict("r");
ASSERT_TRUE(qurl && r_dict);
EXPECT_EQ(*qurl, "https://example.com/test1");
ASSERT_TRUE(r_dict);
EXPECT_EQ(r_dict->size(), 4u);
base::ExpectDictStringValue("https://example.com/test1", *inner_payload,
"qurl");

const auto* r0_dict = r_dict->FindDict("0");
const auto* r1_dict = r_dict->FindDict("1");
const auto* r2_dict = r_dict->FindDict("2");
Expand All @@ -153,36 +135,26 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateQueryPayloads) {
EXPECT_EQ(r1_dict->size(), 1u);
EXPECT_EQ(r2_dict->size(), 1u);
EXPECT_EQ(r3_dict->size(), 1u);
const auto* r0_val = r0_dict->FindString("njk");
const auto* r1_val = r1_dict->FindString("abc");
const auto* r2_val = r2_dict->FindString("njk");
const auto* r3_val = r3_dict->FindString("abc");
ASSERT_TRUE(r0_val && r1_val && r2_val && r2_val);
EXPECT_EQ(*r0_val, "joinvalue1");
EXPECT_EQ(*r1_val, "joinvalue2");
EXPECT_EQ(*r2_val, "joinvalue3");
EXPECT_EQ(*r3_val, "joinvalue4");
base::ExpectDictStringValue("joinvalue1", *r0_dict, "njk");
base::ExpectDictStringValue("joinvalue2", *r1_dict, "abc");
base::ExpectDictStringValue("joinvalue3", *r2_dict, "njk");
base::ExpectDictStringValue("joinvalue4", *r3_dict, "abc");
}

TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateAlivePayload) {
std::string date_hour = "2023051509";

auto alive_payload = GenerateAlivePayload(*server_config_.get(), date_hour);

const auto* action = alive_payload.FindString("action");
const auto* inner_payload = alive_payload.FindDict("payload");
base::ExpectDictStringValue("alive", alive_payload, "action");

ASSERT_TRUE(action && inner_payload);
const auto* inner_payload = alive_payload.FindDict("payload");

EXPECT_EQ(*action, "alive");
const auto* ctry = inner_payload->FindString("ctry");
const auto* ts = inner_payload->FindString("t");
const auto status = inner_payload->FindBool("status");
ASSERT_TRUE(inner_payload);

ASSERT_TRUE(ctry && ts && status);
EXPECT_EQ(*ctry, "us");
EXPECT_EQ(*ts, date_hour);
EXPECT_EQ(*status, true);
base::ExpectDictStringValue("us", *inner_payload, "ctry");
base::ExpectDictStringValue(date_hour, *inner_payload, "t");
base::ExpectDictBooleanValue(true, *inner_payload, "status");
}

TEST_F(WebDiscoveryPayloadGeneratorTest, ExcludePrivateResult) {
Expand Down Expand Up @@ -222,10 +194,8 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, ExcludePrivateResult) {
const auto* ri_dict = r_dict->FindDict(base::NumberToString(i));
ASSERT_TRUE(ri_dict);

const auto* url = ri_dict->FindString("u");
ASSERT_TRUE(url);

EXPECT_EQ(*url, "https://example.com/result" + base::NumberToString(i));
base::ExpectDictStringValue(
"https://example.com/result" + base::NumberToString(i), *ri_dict, "u");
}
}

Expand Down
Loading

0 comments on commit c7089e1

Please sign in to comment.