# serializer.py

#!/usr/bin/env python
'''
This script serializes the RDF data from the Blazegraph repository generated by ToolsEswc (https://github.com/liyakun/ToolsEswc) and removes non-relevant data. The output is written to the file
all_clean.ttl.gz in the same folder as this script.

This script uses python 2.7.6, and it has been tested on Linux Mint 17.3.
It should also run on the Ubuntu release that corresponds to Linux Mint 17.3.

It should also be able to run on Windows, as long as the corresponding Python and shell tools are installed.
'''
import codecs
import requests
import subprocess
import gzip
import os

# Default SPARQL endpoint of a locally running Blazegraph instance.
url = 'http://localhost:9999/blazegraph/sparql'
print('The default url for Blazegraph running is: http://localhost:9999/blazegraph/sparql\n')

# Surrounding whitespace is stripped so that a stray space or tab typed before
# pressing Enter is not mistaken for a custom endpoint.
user_input = raw_input('If using the default url, please press Enter, else please input: ').strip()

# An empty answer keeps the default; anything else replaces it.
if user_input:
    url = user_input

# Check that the SPARQL endpoint at `url` answers before any further request
# is sent to it.
# A failure raised by requests itself (connection refused, unresolvable host,
# malformed url, ...) is reported with the same ValueError as a non-200 answer,
# so the user gets the "does not exist" message rather than a raw traceback.
try:
    request = requests.get(url)
except requests.exceptions.RequestException:
    raise ValueError('\n' + url + ' does not exist\n')

if request.status_code != 200:
    raise ValueError('\n' + url + ' does not exist\n')
print('\n' + url + ' exists\n')

# Remove most non-relevant data from the repository: a DELETE request whose
# CONSTRUCT query selects every triple whose subject is typed as one of the
# listed segm:/box: classes or carries one of the listed segm: properties.
# The text below is exactly the form field that used to be written inside the
# shell command; it is only split over several literals for readability.
delete_query = (
    'query='
    'PREFIX box: <http://fitlayout.github.io/ontology/render.owl#> '
    'PREFIX segm: <http://fitlayout.github.io/ontology/segmentation.owl#> '
    'CONSTRUCT { ?a ?b ?c } WHERE {'
    '{ ?a rdf:type segm:AreaTree } union '
    '{ ?a rdf:type segm:LogicalArea } union '
    '{ ?a rdf:type segm:Area } union '
    '{ ?a rdf:type box:Box } union '
    '{ ?a rdf:type box:Page } union '
    '{ ?a segm:ititle ?c  } union '
    '{ ?a segm:ishort ?c  } union '
    '{ ?a segm:idateplace ?c  } union '
    '{ ?a segm:iproceedings ?c  } union '
    '{ ?a segm:istartdate ?c  } union '
    '{ ?a segm:ienddate ?c  } union '
    '{ ?a segm:isubmitted ?c  } union '
    '{ ?a segm:icoloc ?c  } union '
    '{ ?a segm:editorname ?c  } union '
    '{ ?a segm:hasTag ?c  } union '
    '{ ?a segm:support ?c  } union '
    '{?a segm:related ?c} '
    '?a ?b ?c.}'
)

# The command is passed as an argument list and run without a shell: `url` may
# come straight from the user, and characters such as '&', ';' or spaces in it
# would otherwise be interpreted by the shell instead of reaching curl.
arg = [
    'curl', '--get', '-X', 'DELETE',
    '-H', 'Accept: application/xml',
    url,
    '--data-urlencode', delete_query,
]
subprocess.check_output(arg)

# Serialize the data left in the repository into the intermediate file
# output.ttl.gz. Other formats can be requested as well, for details see
# https://wiki.blazegraph.com/wiki/index.php/REST_API
# NOTE(review): the clean-up step further down matches XML-style markers
# (e.g. '<hint:analytic>true</hint:analytic>'); confirm which serialization the
# server actually returns for this Accept value before changing the header.
arg = [
    'curl', '-X', 'POST',
    url,
    '--data-urlencode',
    'query=CONSTRUCT  WHERE {hint:Query hint:analytic "true" . '
    'hint:Query hint:constructDistinctSPO "false" . ?s ?p ?o }',
    '-H', 'Accept:application/turtle',
]

# curl is run without a shell (so the possibly user-supplied `url` cannot be
# interpreted as shell syntax) and its output is compressed in-process, chunk
# by chunk. With the former "curl | gzip" pipeline only gzip's exit status was
# checked, so a failing curl silently produced an empty archive.
curl = subprocess.Popen(arg, stdout=subprocess.PIPE)
try:
    with gzip.open('output.ttl.gz', 'wb') as raw_dump:
        for chunk in iter(lambda: curl.stdout.read(64 * 1024), b''):
            raw_dump.write(chunk)
finally:
    # Always close the pipe and reap the child, even if writing failed.
    curl.stdout.close()
    returncode = curl.wait()

if returncode != 0:
    raise subprocess.CalledProcessError(returncode, arg)

# Any line containing one of these markers is dropped from the cleaned output.
bad_words = ['http://fitlayout.github.io/ontology/segmentation.owl', 'http://www.bigdata.com/queryHints', 'http://www.bigdata.com/rdf', '<hint:analytic>true</hint:analytic>', '<hint:constructDistinctSPO>false</hint:constructDistinctSPO>']

# skip_next_line becomes 1 once the constructDistinctSPO marker line has been
# seen; from then on count is incremented for every following line, so
# count == 1 identifies exactly the first line after the marker.
skip_next_line, count = 0, 0

# Copy output.ttl.gz to all_clean.ttl.gz line by line, leaving out the
# non-relevant lines. The body is indented with spaces only: the previous mix
# of tabs and spaces is rejected by `python -tt` and by Python 3.
with gzip.open('output.ttl.gz', 'rb') as oldfile, gzip.open('all_clean.ttl.gz', 'wb') as newfile:
    for line in oldfile:
        if skip_next_line == 1:
            count += 1
        # The line carrying the bds namespace declaration is replaced by a bare
        # '>' (without a newline), which is then written instead of being
        # filtered out -- presumably to keep the enclosing tag closed; confirm.
        if 'xmlns:bds="http://www.bigdata.com/rdf/search' in line:
            line = '>'
        # The marker line itself is dropped by the bad_words test below; the
        # flag makes the line right after it be dropped as well.
        if '<hint:constructDistinctSPO>false</hint:constructDistinctSPO>' in line:
            skip_next_line = 1
        if not any(bad_word in line for bad_word in bad_words) and not (skip_next_line == 1 and count == 1):
            newfile.write(line)

# The unfiltered dump was only an intermediate artefact; delete it so that
# just the cleaned archive is left behind.
raw_dump_path = 'output.ttl.gz'
os.remove(raw_dump_path)