Skip to content

Importing iTunes EPF Relational into Neo4j

Paul Tremberth edited this page Dec 13, 2013 · 4 revisions

Resources:

Tests using the "artist" table

Using artist_type as a label

mysql> SELECT 'artist_id:long', 'name:string', 'is_actual_artist:boolean', 'view_url:string', 'artist_type:label'
    -> UNION ALL
    -> SELECT artist_id, CONCAT('"', REPLACE(epf_artist.name, '"', '""'), '"'), is_actual_artist, view_url, epf_artist_type.name
    -> FROM epf_artist
    -> INNER JOIN epf_artist_type
    ->     ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
    -> INTO OUTFILE '/tmp/epfitunes__artists.tsv'
    -> FIELDS TERMINATED BY '\t' ENCLOSED BY '' ESCAPED BY '\\'
    -> LINES TERMINATED BY '\n';
Query OK, 5775431 rows affected (33.64 sec)

DROP TABLE IF EXISTS entity_mapping;
CREATE TABLE entity_mapping AS
(
    SELECT
        kind AS entity,
        pk,
        @curRow := @curRow + 1 AS node_id
    FROM
    (
        (
            SELECT
              'artist' as kind,
              epf_artist.artist_id AS "pk"
            FROM
              epf_artist
            ORDER BY pk ASC
        )

        UNION

        (
            SELECT
              'song' as kind,
              epf_song.song_id AS "pk"
            FROM
              epf_song
            ORDER BY pk ASC
        )
    ) AS entity_union

    JOIN (SELECT @curRow := -1) r
);

CREATE INDEX idx_entity_mapping ON entity_mapping(entity, pk);

mysql> DROP TABLE IF EXISTS entity_mapping;
Query OK, 0 rows affected (23.03 sec)

mysql> CREATE TABLE entity_mapping AS
    -> (
    ->     SELECT
    ->         kind AS entity,
    ->         pk,
    ->         @curRow := @curRow + 1 AS node_id
    ->     FROM
    ->     (
    ->         (
    ->             SELECT
    ->               'artist' as kind,
    ->               epf_artist.artist_id AS "pk"
    ->             FROM
    ->               epf_artist
    ->             ORDER BY pk ASC
    ->         )
    ->
    ->         UNION
    ->
    ->         (
    ->             SELECT
    ->               'song' as kind,
    ->               epf_song.song_id AS "pk"
    ->             FROM
    ->               epf_song
    ->             ORDER BY pk ASC
    ->         )
    ->     ) AS entity_union
    ->
    ->     JOIN (SELECT @curRow := -1) r
    -> );
Query OK, 45042202 rows affected (37 min 58.11 sec)
Records: 45042202  Duplicates: 0  Warnings: 0

SELECT
    'l:label',
    'pk:long',
    'name:string',
    'url:string'

UNION ALL

SELECT * FROM (
    SELECT
        epf_artist_type.name as kind,
        epf_artist.artist_id AS pk,
        epf_artist.name AS name,
        epf_artist.view_url as url
    FROM epf_artist
    INNER JOIN epf_artist_type
    ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
    ORDER BY pk ASC
) AS artist_node

INTO OUTFILE '/tmp/epfitunes__nodes001.artists.tsv'
FIELDS TERMINATED BY '\t'
ENCLOSED BY ''
ESCAPED BY '\\'
LINES TERMINATED BY '\n';
mysql> SELECT
    ->     'l:label',
    ->     'pk:long',
    ->     'name:string',
    ->     'url:string'
    ->
    -> UNION ALL
    ->
    -> SELECT * FROM (
    ->     SELECT
    ->         epf_artist_type.name as kind,
    ->         epf_artist.artist_id AS pk,
    ->         epf_artist.name AS name,
    ->         epf_artist.view_url as url
    ->     FROM epf_artist
    ->     INNER JOIN epf_artist_type
    ->     ON epf_artist.artist_type_id=epf_artist_type.artist_type_id
    ->     ORDER BY pk ASC
    -> ) AS artist_node
    ->
    -> INTO OUTFILE '/tmp/epfitunes__nodes001.artists.tsv'
    -> FIELDS TERMINATED BY '\t'
    -> ENCLOSED BY ''
    -> ESCAPED BY '\\'
    -> LINES TERMINATED BY '\n';
Query OK, 5775431 rows affected (24 min 58.37 sec)

mysql> SELECT
    ->     'l:label',
    ->     'pk:long',
    ->     'name:string',
    ->     'url:string'
    ->
    -> UNION ALL
    ->
    -> SELECT * FROM (
    ->     SELECT
    ->         'Song' as kind,
    ->         epf_song.song_id AS pk,
    ->         epf_song.name AS name,
    ->         epf_song.view_url as url
    ->     FROM epf_song
    ->     ORDER BY pk ASC
    -> ) AS song_node
    ->
    ->
    -> INTO OUTFILE '/tmp/epfitunes__nodes002.songs.tsv'
    -> FIELDS TERMINATED BY '\t'
    -> ENCLOSED BY ''
    -> ESCAPED BY '\\'
    -> LINES TERMINATED BY '\n';
Query OK, 39266773 rows affected (1 hour 35 min 17.22 sec)

SELECT
    'start',
    'end',
    'rel_type'

UNION ALL

SELECT
    start_entity.node_id AS start,
    end_entity.node_id AS end,
    'SONG_BY_ARTIST' AS rel_type
FROM
    epf_artist_song

JOIN entity_mapping start_entity
    ON (
        start_entity.pk = epf_artist_song.artist_id
        AND
        start_entity.entity = 'artist'
    )

JOIN entity_mapping end_entity
    ON (
        end_entity.pk = epf_artist_song.song_id
        AND
        end_entity.entity = 'song'
    )

INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
FIELDS TERMINATED BY '\t'
ENCLOSED BY ''
ESCAPED BY '\\'
LINES TERMINATED BY '\n';

mysql> SELECT     start_entity.node_id AS start,     end_entity.node_id AS end,     'SONG_BY_ARTIST' AS rel_type FROM     epf_artist_song  JOIN entity_mapping start_entity     ON (         start_entity.pk = epf_artist_song.artist_id         AND         start_entity.entity = 'artist'     )  JOIN entity_mapping end_entity     ON (         end_entity.pk = epf_artist_song.song_id         AND         end_entity.entity = 'song'     ) LIMIT 10;
+-------+---------+----------------+
| start | end     | rel_type       |
+-------+---------+----------------+
|  1662 | 5775431 | SONG_BY_ARTIST |
|  1662 | 5775432 | SONG_BY_ARTIST |
|  1662 | 5775433 | SONG_BY_ARTIST |
|  1662 | 5775434 | SONG_BY_ARTIST |
|  1662 | 5775435 | SONG_BY_ARTIST |
|  1662 | 5775436 | SONG_BY_ARTIST |
|  1662 | 5775437 | SONG_BY_ARTIST |
|  1662 | 5775438 | SONG_BY_ARTIST |
|  1662 | 5775439 | SONG_BY_ARTIST |
|  1662 | 5775440 | SONG_BY_ARTIST |
+-------+---------+----------------+
10 rows in set (0.00 sec)

mysql> SELECT * FROM entity_mapping LIMIT 10;
+--------+-------+---------+
| entity | pk    | node_id |
+--------+-------+---------+
| artist | 10323 |       1 |
| artist | 10416 |       2 |
| artist | 10432 |       3 |
| artist | 10436 |       4 |
| artist | 10440 |       5 |
| artist | 10453 |       6 |
| artist | 10459 |       7 |
| artist | 10469 |       8 |
| artist | 10475 |       9 |
| artist | 10479 |      10 |
+--------+-------+---------+
10 rows in set (0.00 sec)

mysql> SELECT
    ->     'start',
    ->     'end',
    ->     'rel_type'
    ->
    -> UNION ALL
    ->
    -> SELECT
    ->     start_entity.node_id AS start,
    ->     end_entity.node_id AS end,
    ->     'SONG_BY_ARTIST' AS rel_type
    -> FROM
    ->     epf_artist_song
    ->
    -> JOIN entity_mapping start_entity
    ->     ON (
    ->         start_entity.pk = epf_artist_song.artist_id
    ->         AND
    ->         start_entity.entity = 'artist'
    ->     )
    ->
    -> JOIN entity_mapping end_entity
    ->     ON (
    ->         end_entity.pk = epf_artist_song.song_id
    ->         AND
    ->         end_entity.entity = 'song'
    ->     )
    ->
    -> INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
    -> FIELDS TERMINATED BY '\t'
    -> ENCLOSED BY ''
    -> ESCAPED BY '\\'
    -> LINES TERMINATED BY '\n';
Query OK, 46997430 rows affected (13 min 0.06 sec)

mysql>

mysql> SELECT * FROM entity_mapping LIMIT 10;
+--------+-------+---------+
| entity | pk    | node_id |
+--------+-------+---------+
| artist | 10323 |       0 |
| artist | 10416 |       1 |
| artist | 10432 |       2 |
| artist | 10436 |       3 |
| artist | 10440 |       4 |
| artist | 10453 |       5 |
| artist | 10459 |       6 |
| artist | 10469 |       7 |
| artist | 10475 |       8 |
| artist | 10479 |       9 |
+--------+-------+---------+
10 rows in set (0.00 sec)

mysql> SELECT
    ->     'start',
    ->     'end',
    ->     'rel_type'
    ->
    -> UNION ALL
    ->
    -> SELECT
    ->     start_entity.node_id AS start,
    ->     end_entity.node_id AS end,
    ->     'SONG_BY_ARTIST' AS rel_type
    -> FROM
    ->     epf_artist_song
    ->
    -> JOIN entity_mapping start_entity
    ->     ON (
    ->         start_entity.pk = epf_artist_song.artist_id
    ->         AND
    ->         start_entity.entity = 'artist'
    ->     )
    ->
    -> JOIN entity_mapping end_entity
    ->     ON (
    ->         end_entity.pk = epf_artist_song.song_id
    ->         AND
    ->         end_entity.entity = 'song'
    ->     )
    ->
    -> INTO OUTFILE '/tmp/epfitunes__rels001.all.tsv'
    -> FIELDS TERMINATED BY '\t'
    -> ENCLOSED BY ''
    -> ESCAPED BY '\\'
    -> LINES TERMINATED BY '\n';
Query OK, 46997430 rows affected (11 min 55.20 sec)


user@machine:~/paul/dev/batch-import$ java -server -Xmx10G -jar ./target/batch-import-jar-with-dependencies.jar /home/paul/itunes-neo4j.db /tmp/epfitunes__nodes001.artists.tsv,/tmp/epfitunes__nodes002.songs.tsv /tmp/epfitunes__rels001.all.tsv
Usage: Importer data/dir nodes.csv relationships.csv [node_index node-index-name fulltext|exact nodes_index.csv rel_index rel-index-name fulltext|exact rels_index.csv ....]
Using: Importer /home/paul/itunes-neo4j.db /tmp/epfitunes__nodes001.artists.tsv,/tmp/epfitunes__nodes002.songs.tsv /tmp/epfitunes__rels001.all.tsv

Using Existing Configuration File
.........................................................
Importing 5775430 Nodes took 23 seconds 
.................................................................................................... 30953 ms for 10000000
.................................................................................................... 35585 ms for 10000000
.................................................................................................... 28064 ms for 10000000
............................................................................................
Importing 39266772 Nodes took 124 seconds 
.................................................................................................... 7871 ms for 10000000
.................................................................................................... 10273 ms for 10000000
.................................................................................................... 8200 ms for 10000000
.................................................................................................... 33701 ms for 10000000
.....................................................................
Importing 46997429 Relationships took 128 seconds 

Total import time: 453 seconds 
user@machine:~/paul/dev/batch-import$