diff --git a/README.md b/README.md index f26e099d..ca108050 100644 --- a/README.md +++ b/README.md @@ -88,9 +88,9 @@ $ ipwb replay http://myDomain/files/myIndex.cdxj $ ipwb replay QmYwAPJzv5CZsnANOTaREALhashYgPpHdWEz79ojWnPbdG ``` -Once started, the replay system's web interface can be accessed through a web browser, e.g., <http://localhost:5000/> by default. +Once started, the replay system's web interface can be accessed through a web browser, e.g., <http://localhost:2016/> by default. -To run it under a domain name other than `localhost`, the easiest approach is to use a reverse proxy that supports HTTPS. The replay system utilizes [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for URL rerouting/rewriting to prevent [live leakage (zombies)](http://ws-dl.blogspot.com/2012/10/2012-10-10-zombies-in-archives.html). However, for security reason many web browsers have mandated HTTPS for the Service Worker API with only exception if the domain is `localhost`. [Caddy Server](https://caddyserver.com/) and [Traefik](https://traefik.io/) can be used as a reverse-proxy server and are very easy to setup. They come with built-in HTTPS support and manage (install and update) TLS certificates transparently and automatically from [Let's Encrypt](https://letsencrypt.org/). However, any web server proxy that has HTTPS support on the front-end will work. To make ipwb replay aware of the proxy, use `--proxy` or `-P` flag to supply the proxy URL. This way the replay will yield the supplied proxy URL as a prefix when generating various fully qualified domain name (FQDN) URIs or absolute URIs (for example, those in the TimeMap or Link header) instead of the default `http://localhost:5000`. This can be necessary when the service is running in a private network or a container and only exposed via a reverse-proxy. 
Suppose a reverse-proxy server is running and ready to forward all traffic on the `https://ipwb.example.com` to the ipwb replay server then the replay can be started as following: +To run it under a domain name other than `localhost`, the easiest approach is to use a reverse proxy that supports HTTPS. The replay system utilizes [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for URL rerouting/rewriting to prevent [live leakage (zombies)](http://ws-dl.blogspot.com/2012/10/2012-10-10-zombies-in-archives.html). However, for security reasons, many web browsers have mandated HTTPS for the Service Worker API, with the only exception being when the domain is `localhost`. [Caddy Server](https://caddyserver.com/) and [Traefik](https://traefik.io/) can be used as a reverse-proxy server and are very easy to set up. They come with built-in HTTPS support and manage (install and update) TLS certificates transparently and automatically from [Let's Encrypt](https://letsencrypt.org/). However, any web server proxy that has HTTPS support on the front-end will work. To make ipwb replay aware of the proxy, use the `--proxy` or `-P` flag to supply the proxy URL. This way the replay will yield the supplied proxy URL as a prefix when generating various fully qualified domain name (FQDN) URIs or absolute URIs (for example, those in the TimeMap or Link header) instead of the default `http://localhost:2016`. This can be necessary when the service is running in a private network or a container and only exposed via a reverse-proxy. 
Suppose a reverse-proxy server is running and ready to forward all traffic on the `https://ipwb.example.com` to the ipwb replay server then the replay can be started as follows: ``` $ ipwb replay --proxy=https://ipwb.example.com @@ -101,16 +101,16 @@ $ ipwb replay --proxy=https://ipwb.example.com A pre-built Docker image is made available that can be run as following: ``` -$ docker container run -it --rm -p 5000:5000 oduwsdl/ipwb +$ docker container run -it --rm -p 2016:2016 oduwsdl/ipwb ``` -The container will run an IPFS daemon, index a sample WARC file, and replay it using the newly created index. It will take a few seconds to be ready, then the replay will be accessible at <http://localhost:5000/> with a sample archived page. +The container will run an IPFS daemon, index a sample WARC file, and replay it using the newly created index. It will take a few seconds to be ready, then the replay will be accessible at <http://localhost:2016/> with a sample archived page. To index and replay your own WARC file, bind mount your data folders inside the container using `-v` (or `--volume`) flag and run commands accordingly. The provided docker image has designated `/data` directory, inside which there are `warc`, `cdxj`, and `ipfs` folders where host folders can be mounted separately or as a single mount point at the parent `/data` directory. Assuming that the host machine has a `/path/to/data` folder under which there are `warc`, `cdxj`, and `ipfs` folders and a WARC file at `/path/to/data/warc/custom.warc.gz`. ``` $ docker container run -it --rm -v /path/to/data:/data oduwsdl/ipwb ipwb index -o /data/cdxj/custom.cdxj /data/warc/custom.warc.gz -$ docker container run -it --rm -v /path/to/data:/data -p 5000:5000 oduwsdl/ipwb ipwb replay /data/cdxj/custom.cdxj +$ docker container run -it --rm -v /path/to/data:/data -p 2016:2016 oduwsdl/ipwb ipwb replay /data/cdxj/custom.cdxj ``` If the host folder structure is something other than `/some/path/{warc,cdxj,ipfs}` then these volumes need to be mounted separately. 
diff --git a/ipwb/util.py b/ipwb/util.py index 1905f467..dbbf12b6 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -33,7 +33,7 @@ # or '/ip4/{ipaddress}/tcp/{port}/http' # or '/ip6/{ipaddress}/tcp/{port}/http -IPWBREPLAY_ADDRESS = 'localhost:5000' +IPWBREPLAY_ADDRESS = 'localhost:2016' (IPWBREPLAY_HOST, IPWBREPLAY_PORT) = IPWBREPLAY_ADDRESS.split(':') IPWBREPLAY_PORT = int(IPWBREPLAY_PORT) diff --git a/samples/indexes/5mementos.cdxj b/samples/indexes/5mementos.cdxj index ffab369f..a5d9f968 100644 --- a/samples/indexes/5mementos.cdxj +++ b/samples/indexes/5mementos.cdxj @@ -1,11 +1,11 @@ !context ["https://tools.ietf.org/html/rfc7089"] -!id {"uri": "http://localhost:5000/timemap/cdxj/memento.us"} +!id {"uri": "http://localhost:2016/timemap/cdxj/memento.us"} !keys ["memento_datetime_YYYYMMDDhhmmss"] !meta {"original_uri": "http://memento.us/"} -!meta {"timegate_uri": "http://localhost:5000/timegate/memento.us"} -!meta {"timemap_uri": {"link_format": "http://localhost:5000/timemap/link/memento.us","cdxj_format": "http://localhost:5000/timemap/cdxj/memento.us"}} -20130202100000 {"uri": "http://localhost:5000/memento/20130202100000/memento.us/", "rel": "first memento", "datetime"="Sat, 02 Feb 2013 10:00:00 GMT"} -20140114100000 {"uri": "http://localhost:5000/memento/20140114100000/memento.us/", "rel": "memento", "datetime"="Tue, 14 Jan 2014 10:00:00 GMT"} -20140115101500 {"uri": "http://localhost:5000/memento/20140115101500/memento.us/", "rel": "memento", "datetime"="Wed, 15 Jan 2014 10:15:00 GMT"} -20161231110000 {"uri": "http://localhost:5000/memento/20161231110000/memento.us/", "rel": "memento", "datetime"="Sat, 31 Dec 2016 11:00:00 GMT"} -20161231110001 {"uri": "http://localhost:5000/memento/20161231110001/memento.us/", "rel": "last memento", "datetime"="Sat, 31 Dec 2016 11:00:01 GMT"} +!meta {"timegate_uri": "http://localhost:2016/timegate/memento.us"} +!meta {"timemap_uri": {"link_format": "http://localhost:2016/timemap/link/memento.us","cdxj_format": 
"http://localhost:2016/timemap/cdxj/memento.us"}} +20130202100000 {"uri": "http://localhost:2016/memento/20130202100000/memento.us/", "rel": "first memento", "datetime"="Sat, 02 Feb 2013 10:00:00 GMT"} +20140114100000 {"uri": "http://localhost:2016/memento/20140114100000/memento.us/", "rel": "memento", "datetime"="Tue, 14 Jan 2014 10:00:00 GMT"} +20140115101500 {"uri": "http://localhost:2016/memento/20140115101500/memento.us/", "rel": "memento", "datetime"="Wed, 15 Jan 2014 10:15:00 GMT"} +20161231110000 {"uri": "http://localhost:2016/memento/20161231110000/memento.us/", "rel": "memento", "datetime"="Sat, 31 Dec 2016 11:00:00 GMT"} +20161231110001 {"uri": "http://localhost:2016/memento/20161231110001/memento.us/", "rel": "last memento", "datetime"="Sat, 31 Dec 2016 11:00:01 GMT"} diff --git a/samples/indexes/5mementos.link b/samples/indexes/5mementos.link index 310ec482..62a20162 100644 --- a/samples/indexes/5mementos.link +++ b/samples/indexes/5mementos.link @@ -1,9 +1,9 @@ <http://memento.us/>; rel="original", -<http://localhost:5000/timemap/link/memento.us>; rel="self timemap"; type="application/link-format", -<http://localhost:5000/timemap/cdxj/memento.us>; rel="timemap"; type="application/cdxj+ors", -<http://localhost:5000/timegate/memento.us>; rel="timegate", -<http://localhost:5000/memento/20130202100000/memento.us/>; rel="first memento"; datetime="Sat, 02 Feb 2013 10:00:00 GMT", -<http://localhost:5000/memento/20140114100000/memento.us/>; rel="memento"; datetime="Tue, 14 Jan 2014 10:00:00 GMT", -<http://localhost:5000/memento/20140115101500/memento.us/>; rel="memento"; datetime="Wed, 15 Jan 2014 10:15:00 GMT", -<http://localhost:5000/memento/20161231110000/memento.us/>; rel="memento"; datetime="Sat, 31 Dec 2016 11:00:00 GMT", -<http://localhost:5000/memento/20161231110001/memento.us/>; rel="last memento"; datetime="Sat, 31 Dec 2016 11:00:01 GMT" +<http://localhost:2016/timemap/link/memento.us>; rel="self timemap"; type="application/link-format", +<http://localhost:2016/timemap/cdxj/memento.us>; rel="timemap"; type="application/cdxj+ors", +<http://localhost:2016/timegate/memento.us>; rel="timegate", +<http://localhost:2016/memento/20130202100000/memento.us/>; rel="first memento"; datetime="Sat, 02 Feb 2013 10:00:00 GMT", +<http://localhost:2016/memento/20140114100000/memento.us/>; rel="memento"; datetime="Tue, 14 Jan 2014 10:00:00 GMT", +<http://localhost:2016/memento/20140115101500/memento.us/>; rel="memento"; datetime="Wed, 15 Jan 2014 10:15:00 GMT", +<http://localhost:2016/memento/20161231110000/memento.us/>; rel="memento"; datetime="Sat, 31 Dec 2016 11:00:00 GMT", +<http://localhost:2016/memento/20161231110001/memento.us/>; rel="last memento"; datetime="Sat, 31 Dec 2016 11:00:01 GMT" diff --git a/tests/test_memento.py b/tests/test_memento.py index 1e8c3349..7e742fed 100644 --- a/tests/test_memento.py +++ 
b/tests/test_memento.py @@ -18,7 +18,7 @@ def get_urims_from_timemap_in_warc(warcFilename): ipwb_test.start_replay(warcFilename) - tm_uri = 'http://localhost:5000/timemap/link/memento.us/' + tm_uri = 'http://localhost:2016/timemap/link/memento.us/' tm = urlopen(tm_uri).read().decode('utf-8') urims = [] @@ -66,7 +66,7 @@ def test_acceptdatetime_status(warc, lookup, acceptdatetime, status): headers = {'Accept-Datetime': acceptdatetime} - resp = requests.get(f'http://localhost:5000/{lookup}', + resp = requests.get(f'http://localhost:2016/{lookup}', allow_redirects=False, headers=headers) assert resp.status_code == status diff --git a/tests/test_replay.py b/tests/test_replay.py index 2d60d2cd..bb894bd4 100644 --- a/tests/test_replay.py +++ b/tests/test_replay.py @@ -21,7 +21,7 @@ def test_replay_404(warc, lookup, has_md_header): ipwb_test.start_replay(warc) - resp = requests.get(f'http://localhost:5000/{lookup}', + resp = requests.get(f'http://localhost:2016/{lookup}', allow_redirects=False) assert resp.status_code == 404 @@ -53,7 +53,7 @@ def test_replay_404(warc, lookup, has_md_header): def test_replay_search(warc, lookup, status, location): ipwb_test.start_replay(warc) - resp = requests.get(f'http://localhost:5000/{lookup}', + resp = requests.get(f'http://localhost:2016/{lookup}', allow_redirects=False) assert resp.status_code == status if location is not None: # Allow for checks w/o redirects @@ -65,7 +65,7 @@ def test_replay_search(warc, lookup, status, location): def test_replay_dated_memento(): ipwb_test.start_replay('salam-home.warc') - url = 'http://localhost:5000/memento/{}/cs.odu.edu/~salam/' + url = 'http://localhost:2016/memento/{}/cs.odu.edu/~salam/' dest = '/memento/20160305192247/cs.odu.edu/~salam/' invalid_dts = [ @@ -119,7 +119,7 @@ def test_replay_dated_memento(): def test_generate_timemap(warc, index, tmformat, urim): ipwb_test.start_replay(warc) - resp = requests.get(f'http://localhost:5000/timemap/{tmformat}/{urim}', + resp = 
requests.get(f'http://localhost:2016/timemap/{tmformat}/{urim}', allow_redirects=False) with open(f'samples/indexes/{index}', 'r') as index: