Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support removing query parameters in URL #7

Merged
merged 1 commit into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type ScraperConfig struct {
NextPageSelector string `yaml:"nextPageSelector"`
PriceFormat string `yaml:"priceFormat"`
RetryString string `yaml:"retryString"`
UniqueParameters []string `yaml:"uniqueParameters"`
}

type EmailConfig struct {
Expand Down
4 changes: 2 additions & 2 deletions pkg/scraper/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
}

itemLink, _ := s.Find(bs.Config.LinkSelector).Attr("href")
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl)
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl, bs.Config.UniqueParameters)
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down Expand Up @@ -139,7 +139,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
return
}
if href, exists := s.Attr("href"); exists {
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl)
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl, []string{})
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down
41 changes: 30 additions & 11 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ import (
"strings"
)

func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string) (string, error) {
newUrl = strings.TrimSpace(newUrl)
if newUrl == "" {
return "", nil
}

// Handle relative URLs starting with "./"
var finalUrl *url.URL
var err error

if strings.HasPrefix(newUrl, "./") {
// Handle relative URLs starting with "./"
newUrl = strings.TrimPrefix(newUrl, "./")
baseURL, err := url.Parse(fetchedUrl)
if err != nil {
Expand All @@ -32,12 +35,12 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
}

// Construct the new full URL
newFullUrl := baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl
return newFullUrl, nil
}

// Handle relative URLs (no "http://" or "https://" prefix) by resolving them against the fetched URL's host root
if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") {
finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl)
if err != nil {
return "", fmt.Errorf("error constructing final URL: %w", err)
}
} else if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") {
// Handle relative URLs (no "http://" or "https://" prefix) by resolving them against the fetched URL's host root
baseURL, err := url.Parse(fetchedUrl)
if err != nil {
return "", fmt.Errorf("error parsing URL: %w", err)
Expand All @@ -46,9 +49,25 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
if !strings.HasPrefix(newUrl, "/") {
newUrl = "/" + newUrl
}
return baseURL.Scheme + "://" + baseURL.Host + newUrl, nil
finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + newUrl)
if err != nil {
return "", fmt.Errorf("error constructing final URL: %w", err)
}
} else {
// Parse complete URL
finalUrl, err = url.Parse(newUrl)
if err != nil {
return "", fmt.Errorf("error parsing URL: %w", err)
}
}

// Remove specified unique parameters from the URL
queryParams := finalUrl.Query()
for _, param := range uniqueParameters {
queryParams.Del(param)
}
finalUrl.RawQuery = queryParams.Encode()

// Return the new URL if it's already a complete URL
return newUrl, nil
// Return the modified URL
return finalUrl.String(), nil
}
37 changes: 31 additions & 6 deletions pkg/utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl := ""
fetchedUrl := "https://example.com"
expectedResult := ""
result, err := EnsureFullUrl(newUrl, fetchedUrl)
result, err := EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -21,7 +21,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "https://example.com/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -33,7 +33,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -45,7 +45,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1?param=value"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1?param=value"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -57,7 +57,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1#section"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1#section"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -69,11 +69,36 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "./baz.php"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 7: newUrl contains only the sid query parameter, which is stripped
newUrl = "./baz.php?sid=12345"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 8: newUrl contains many query parameters, sid is stripped
newUrl = "./baz.php?page=10&sid=12345&sort=asc"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php?page=10&sort=asc"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

}
Loading