-
Notifications
You must be signed in to change notification settings - Fork 15
Importing iTunes EPF Relational into Neo4j
Paul Tremberth edited this page Dec 13, 2013
·
4 revisions
Resources:
- https://www.apple.com/itunes/affiliates/resources/documentation/itunes-enterprise-partner-feed.html
- https://www.apple.com/itunes/affiliates/resources/documentation/epfimporter.html
- https://github.com/mojotech/EPFImporter
Using artist_type as a label
mysql> SELECT 'artist_id:long', 'name:string', 'is_actual_artist:boolean', 'view_url:string', 'artist_type:label'
-> UNION ALL
-> SELECT artist_id, CONCAT('"', REPLACE(epf_artist.name, '"', '""'), '"'), is_actual_artist, view_url, epf_artist_type.name
-> FROM epf_artist
-> INNER JOIN epf_artist_type
-> ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
-> INTO OUTFILE '/tmp/epfitunes__artists.tsv'
-> FIELDS TERMINATED BY '\t' ENCLOSED BY '' ESCAPED BY '\\'
-> LINES TERMINATED BY '\n';
Query OK, 5775431 rows affected (33.64 sec)
DROP TABLE IF EXISTS entity_mapping;
CREATE TABLE entity_mapping AS
(
SELECT
kind AS entity,
pk,
@curRow := @curRow + 1 AS node_id
FROM
(
(
SELECT
'artist' as kind,
epf_artist.artist_id AS "pk"
FROM
epf_artist
ORDER BY pk ASC
)
UNION
(
SELECT
'song' as kind,
epf_song.song_id AS "pk"
FROM
epf_song
ORDER BY pk ASC
)
) AS entity_union
JOIN (SELECT @curRow := -1) r
);
CREATE INDEX idx_entity_mapping ON entity_mapping(entity, pk);
mysql> DROP TABLE IF EXISTS entity_mapping;
Query OK, 0 rows affected (23.03 sec)
mysql> CREATE TABLE entity_mapping AS
-> (
-> SELECT
-> kind AS entity,
-> pk,
-> @curRow := @curRow + 1 AS node_id
-> FROM
-> (
-> (
-> SELECT
-> 'artist' as kind,
-> epf_artist.artist_id AS "pk"
-> FROM
-> epf_artist
-> ORDER BY pk ASC
-> )
->
-> UNION
->
-> (
-> SELECT
-> 'song' as kind,
-> epf_song.song_id AS "pk"
-> FROM
-> epf_song
-> ORDER BY pk ASC
-> )
-> ) AS entity_union
->
-> JOIN (SELECT @curRow := -1) r
-> );
Query OK, 45042202 rows affected (37 min 58.11 sec)
Records: 45042202 Duplicates: 0 Warnings: 0
SELECT
'l:label',
'pk:long',
'name:string',
'url:string'
UNION ALL
SELECT * FROM (
SELECT
epf_artist_type.name as kind,
epf_artist.artist_id AS pk,
epf_artist.name AS name,
epf_artist.view_url as url
FROM epf_artist
INNER JOIN epf_artist_type
ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
ORDER BY pk ASC
) AS artist_node
INTO OUTFILE '/tmp/epfitunes__nodes001.artists.tsv'
FIELDS TERMINATED BY '\t'
ENCLOSED BY ''
ESCAPED BY '\\'
LINES TERMINATED BY '\n';
mysql> SELECT
-> 'l:label',
-> 'pk:long',
-> 'name:string',
-> 'url:string'
->
-> UNION ALL
->
-> SELECT * FROM (
-> SELECT
-> epf_artist_type.name as kind,
-> epf_artist.artist_id AS pk,
-> epf_artist.name AS name,
-> epf_artist.view_url as url
-> FROM epf_artist
-> INNER JOIN epf_artist_type
-> ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
-> ORDER BY pk ASC
-> ) AS artist_node
->
-> INTO OUTFILE '/tmp/epfitunes__nodes001.artists.tsv'
-> FIELDS TERMINATED BY '\t'
-> ENCLOSED BY ''
-> ESCAPED BY '\\'
-> LINES TERMINATED BY '\n';
Query OK, 5775431 rows affected (24 min 58.37 sec)
mysql> SELECT
-> 'l:label',
-> 'pk:long',
-> 'name:string',
-> 'url:string'
->
-> UNION ALL
->
-> SELECT * FROM (
-> SELECT
-> 'Song' as kind,
-> epf_song.song_id AS pk,
-> epf_song.name AS name,
-> epf_song.view_url as url
-> FROM epf_song
-> ORDER BY pk ASC
-> ) AS song_node
->
->
-> INTO OUTFILE '/tmp/epfitunes__nodes002.songs.tsv'
-> FIELDS TERMINATED BY '\t'
-> ENCLOSED BY ''
-> ESCAPED BY '\\'
-> LINES TERMINATED BY '\n';
Query OK, 39266773 rows affected (1 hour 35 min 17.22 sec)
SELECT
'start',
'end',
'rel_type'
UNION ALL
SELECT
start_entity.node_id AS start,
end_entity.node_id AS end,
'SONG_BY_ARTIST' AS rel_type
FROM
epf_artist_song
JOIN entity_mapping start_entity
ON (
start_entity.pk = epf_artist_song.artist_id
AND
start_entity.entity = 'artist'
)
JOIN entity_mapping end_entity
ON (
end_entity.pk = epf_artist_song.song_id
AND
end_entity.entity = 'song'
)
INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
FIELDS TERMINATED BY '\t'
ENCLOSED BY ''
ESCAPED BY '\\'
LINES TERMINATED BY '\n';
mysql> SELECT start_entity.node_id AS start, end_entity.node_id AS end, 'SONG_BY_ARTIST' AS rel_type FROM epf_artist_song JOIN entity_mapping start_entity ON ( start_entity.pk = epf_artist_song.artist_id AND start_entity.entity = 'artist' ) JOIN entity_mapping end_entity ON ( end_entity.pk = epf_artist_song.song_id AND end_entity.entity = 'song' ) LIMIT 10;
+-------+---------+----------------+
| start | end | rel_type |
+-------+---------+----------------+
| 1662 | 5775431 | SONG_BY_ARTIST |
| 1662 | 5775432 | SONG_BY_ARTIST |
| 1662 | 5775433 | SONG_BY_ARTIST |
| 1662 | 5775434 | SONG_BY_ARTIST |
| 1662 | 5775435 | SONG_BY_ARTIST |
| 1662 | 5775436 | SONG_BY_ARTIST |
| 1662 | 5775437 | SONG_BY_ARTIST |
| 1662 | 5775438 | SONG_BY_ARTIST |
| 1662 | 5775439 | SONG_BY_ARTIST |
| 1662 | 5775440 | SONG_BY_ARTIST |
+-------+---------+----------------+
10 rows in set (0.00 sec)
mysql> SELECT * FROM entity_mapping LIMIT 10;
+--------+-------+---------+
| entity | pk | node_id |
+--------+-------+---------+
| artist | 10323 | 1 |
| artist | 10416 | 2 |
| artist | 10432 | 3 |
| artist | 10436 | 4 |
| artist | 10440 | 5 |
| artist | 10453 | 6 |
| artist | 10459 | 7 |
| artist | 10469 | 8 |
| artist | 10475 | 9 |
| artist | 10479 | 10 |
+--------+-------+---------+
10 rows in set (0.00 sec)
mysql> SELECT
-> 'start',
-> 'end',
-> 'rel_type'
->
-> UNION ALL
->
-> SELECT
-> start_entity.node_id AS start,
-> end_entity.node_id AS end,
-> 'SONG_BY_ARTIST' AS rel_type
-> FROM
-> epf_artist_song
->
-> JOIN entity_mapping start_entity
-> ON (
-> start_entity.pk = epf_artist_song.artist_id
-> AND
-> start_entity.entity = 'artist'
-> )
->
-> JOIN entity_mapping end_entity
-> ON (
-> end_entity.pk = epf_artist_song.song_id
-> AND
-> end_entity.entity = 'song'
-> )
->
-> INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
-> FIELDS TERMINATED BY '\t'
-> ENCLOSED BY ''
-> ESCAPED BY '\\'
-> LINES TERMINATED BY '\n';
Query OK, 46997430 rows affected (13 min 0.06 sec)
mysql>
mysql> SELECT * FROM entity_mapping LIMIT 10;
+--------+-------+---------+
| entity | pk | node_id |
+--------+-------+---------+
| artist | 10323 | 0 |
| artist | 10416 | 1 |
| artist | 10432 | 2 |
| artist | 10436 | 3 |
| artist | 10440 | 4 |
| artist | 10453 | 5 |
| artist | 10459 | 6 |
| artist | 10469 | 7 |
| artist | 10475 | 8 |
| artist | 10479 | 9 |
+--------+-------+---------+
10 rows in set (0.00 sec)
mysql> SELECT
-> 'start',
-> 'end',
-> 'rel_type'
->
-> UNION ALL
->
-> SELECT
-> start_entity.node_id AS start,
-> end_entity.node_id AS end,
-> 'SONG_BY_ARTIST' AS rel_type
-> FROM
-> epf_artist_song
->
-> JOIN entity_mapping start_entity
-> ON (
-> start_entity.pk = epf_artist_song.artist_id
-> AND
-> start_entity.entity = 'artist'
-> )
->
-> JOIN entity_mapping end_entity
-> ON (
-> end_entity.pk = epf_artist_song.song_id
-> AND
-> end_entity.entity = 'song'
-> )
->
-> INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
-> FIELDS TERMINATED BY '\t'
-> ENCLOSED BY ''
-> ESCAPED BY '\\'
-> LINES TERMINATED BY '\n';
Query OK, 46997430 rows affected (11 min 55.20 sec)
user@machine:~/paul/dev/batch-import$ java -server -Xmx10G -jar ./target/batch-import-jar-with-dependencies.jar /home/paul/itunes-neo4j.db /tmp/epfitunes__nodes001.artists.tsv,/tmp/epfitunes__nodes002.songs.tsv /tmp/epfitunes__rels001.all.tsv
Usage: Importer data/dir nodes.csv relationships.csv [node_index node-index-name fulltext|exact nodes_index.csv rel_index rel-index-name fulltext|exact rels_index.csv ....]
Using: Importer /home/paul/itunes-neo4j.db /tmp/epfitunes__nodes001.artists.tsv,/tmp/epfitunes__nodes002.songs.tsv /tmp/epfitunes__rels001.all.tsv
Using Existing Configuration File
.........................................................
Importing 5775430 Nodes took 23 seconds
.................................................................................................... 30953 ms for 10000000
.................................................................................................... 35585 ms for 10000000
.................................................................................................... 28064 ms for 10000000
............................................................................................
Importing 39266772 Nodes took 124 seconds
.................................................................................................... 7871 ms for 10000000
.................................................................................................... 10273 ms for 10000000
.................................................................................................... 8200 ms for 10000000
.................................................................................................... 33701 ms for 10000000
.....................................................................
Importing 46997429 Relationships took 128 seconds
Total import time: 453 seconds
user@machine:~/paul/dev/batch-import$