From c600717420a3204b0be34c4c1e4e382aafb6d082 Mon Sep 17 00:00:00 2001
From: Stephen Walker-Weinshenker
Date: Wed, 15 Nov 2023 21:02:31 -0700
Subject: [PATCH] Added configurable delay to file downloads

This helps avoid the rate-limiting introduced by archive.org
---
 bin/wayback_machine_downloader    | 4 ++++
 lib/wayback_machine_downloader.rb | 9 ++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 4fb6d3d..c512fc5 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -46,6 +46,10 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
+  opts.on("-n", "--delay DELAY", Float, "A configurable delay between page/file downloads (in seconds) to combat rate limiting. Default is 4 seconds") do |t|
+    options[:delay] = t
+  end
+
   opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 730714a..d7b7784 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -18,7 +18,7 @@ class WaybackMachineDownloader
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count
+    :all, :maximum_pages, :threads_count, :delay
 
   def initialize params
     @base_url = params[:base_url]
@@ -30,8 +30,11 @@ def initialize params
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
+    # maximum page default is 100
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
+    # delay in seconds (default 4); fractions are kept, negatives clamp to 0 because sleep raises on them
+    @delay = params[:delay] ? [params[:delay].to_f, 0].max : 4
   end
 
   def backup_name
@@ -89,6 +92,8 @@ def get_all_snapshots_to_consider
     print "."
     unless @exact_url
       @maximum_pages.times do |page_index|
+        # pause before requesting each page of the snapshot list to stay under the rate limit
+        sleep(@delay)
         snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
         break if snapshot_list.empty?
         snapshot_list_to_consider += snapshot_list
@@ -209,6 +214,8 @@ def download_files
       threads << Thread.new do
         until file_queue.empty?
           file_remote_info = file_queue.pop(true) rescue nil
+          # throttle only when a file was actually dequeued; a nil pop means there is nothing to fetch
+          sleep(@delay) if file_remote_info
           download_file(file_remote_info) if file_remote_info
         end
       end