-
Notifications
You must be signed in to change notification settings - Fork 148
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #176 from twitter/cross_partitioning
Graph file splitter (deleted old and resubmit new pull request)
- Loading branch information
Showing
9 changed files
with
244 additions
and
6 deletions.
There are no files selected for viewing
99 changes: 99 additions & 0 deletions
99
...vary-core/src/main/scala/com/twitter/cassovary/graph/distributed/GraphFilesSplitter.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* Copyright 2015 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
package com.twitter.cassovary.graph.distributed | ||
|
||
import com.twitter.cassovary.graph.NodeIdEdgesMaxId | ||
import com.twitter.cassovary.util.BoundedFuturePool | ||
import com.twitter.cassovary.util.io.GraphReaderFromDirectory | ||
import com.twitter.logging.Logger | ||
import com.twitter.util.{Await, Future, FuturePool} | ||
import java.io._ | ||
|
||
/**
 * Splits a graph read by `graphReaderFromDirectory` into multiple subgraphs, each
 * in a separate subdirectory of `outputDir`, named "instance_i" for partition numbered i.
 * Every such subdirectory receives one output file per input part, named after the
 * index of that input part. Splitting is done as per `partitioner`.
 *
 * @param outputDir directory under which the per-instance subdirectories are created
 * @param partitioner maps each input node to (instance number, node) pairs and supplies
 *                    the number of instances
 * @param graphReaderFromDirectory source of the input parts; also serializes nodes back
 *                                 to text via `reverseParseNode`
 */
class GraphFilesSplitter[T](outputDir: String, partitioner: Partitioner,
    graphReaderFromDirectory: GraphReaderFromDirectory[T]) {

  private val futurePool = new BoundedFuturePool(FuturePool.unboundedPool,
    graphReaderFromDirectory.parallelismLimit)
  private val log = Logger.get("graphFilesSplitter")

  /**
   * Splits all input parts, one future per part, and blocks until every part has
   * been written. A failure in any part is rethrown by the underlying `Await.result`.
   */
  def splitGraph(): Unit = {
    // there are many parts of the original input graph
    val inputParts = graphReaderFromDirectory.iterableSeq

    // instanceWriters is a 2-D array indexed by input part# and instance#.
    // The part count comes from `inputParts` so the reader is asked for its parts only once.
    val instanceWriters = setupPerInstanceSubdirectories(partitioner.numInstances,
      inputParts.length)
    val futures = Future.collect(inputParts.indices map { i =>
      split(inputParts(i).iterator, instanceWriters(i))
    })
    Await.result(futures)
  }

  /**
   * Ensures that `dirName` exists as a directory, creating it (and any missing
   * parents) when needed.
   *
   * @throws FileNotFoundException if the path exists but is not a directory, or if
   *                               the directory cannot be created
   */
  private def mkDirHelper(dirName: String): Unit = {
    val dir = new File(dirName)
    if (dir.isDirectory) {
      log.info("Directory %s already exists.", dir)
    } else if (dir.exists()) {
      // Fail here with a clear message instead of later, when files are opened inside it.
      throw new FileNotFoundException(s"$dir exists but is not a directory")
    } else if (dir.mkdirs() || dir.isDirectory) {
      // The isDirectory re-check tolerates another thread/process creating it concurrently.
      log.debug("Made new directory %s", dir)
    } else {
      throw new FileNotFoundException("Unable to create new directory " + dir)
    }
  }

  /**
   * Opens `fileName` for writing as UTF-8 text, creating the file or truncating an
   * existing one.
   *
   * @throws IOException if the file cannot be opened; the original exception is kept
   *                     as the cause
   */
  private def getBufferedWriter(fileName: String): BufferedWriter = {
    try {
      // FileOutputStream creates the file itself, so no separate createNewFile() is needed.
      new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(fileName)), "utf-8"))
    } catch {
      case ex: IOException =>
        throw new IOException("Unable to open " + fileName + " for writing: " + ex, ex)
    }
  }

  // @return an array of arrays. The right index is of subgraph instance number and
  // left index is of input seq number.
  private def setupPerInstanceSubdirectories(numInstances: Int,
      numInputParts: Int): Array[Array[BufferedWriter]] = {
    mkDirHelper(outputDir)
    val instanceWriters = Array.ofDim[BufferedWriter](numInputParts, numInstances)
    (0 until numInstances) foreach { i =>
      val subDirName = outputDir + "/instance_" + i
      mkDirHelper(subDirName)
      (0 until numInputParts) foreach { j =>
        instanceWriters(j)(i) = getBufferedWriter(subDirName + "/" + j)
      }
    }
    instanceWriters
  }

  /**
   * Writes every node of one input part to the writers of the instances chosen by
   * `partitioner`, then closes all of this part's writers.
   *
   * The writers are closed even when reading or writing fails, so file handles are
   * not leaked. The first failure encountered (write/flush first, then close) is
   * rethrown and therefore fails the returned future.
   *
   * @param it nodes of a single input part
   * @param instanceWriters writers of this input part, indexed by instance number
   */
  private def split(it: Iterator[NodeIdEdgesMaxId],
      instanceWriters: Array[BufferedWriter]): Future[Unit] = futurePool {
    val writeResult = scala.util.Try {
      it foreach { origNode =>
        partitioner.map(origNode) foreach { case (instance, node) =>
          instanceWriters(instance).write(graphReaderFromDirectory.reverseParseNode(node))
        }
      }
      instanceWriters foreach { _.flush() }
    }
    // Attempt to close every writer, regardless of earlier failures.
    val closeFailures: List[Throwable] = instanceWriters.toList flatMap { writer =>
      scala.util.Try(writer.close()).failed.toOption
    }
    (writeResult.failed.toOption.toList ++ closeFailures).headOption foreach { ex => throw ex }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 45 additions & 0 deletions
45
...-core/src/test/scala/com/twitter/cassovary/graph/distributed/GraphFilesSplitterSpec.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* | ||
* Copyright 2015 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
package com.twitter.cassovary.graph.distributed | ||
|
||
import java.io.File | ||
import java.util.concurrent.Executors | ||
|
||
import com.twitter.cassovary.util.io.AdjacencyListGraphReader | ||
import com.twitter.common.util.FileUtils | ||
import org.scalatest.{Matchers, WordSpec} | ||
|
||
/**
 * Checks that `GraphFilesSplitter` creates one "instance_i" subdirectory per instance
 * and, inside each of them, one file per input part of the toy graph.
 */
class GraphFilesSplitterSpec extends WordSpec with Matchers {
  val inputGraphDir = "cassovary-core/src/test/resources/graphs"
  val reader = AdjacencyListGraphReader.forIntIds(inputGraphDir, "toy_6nodes_adj")
  val tmpDir = "/tmp/test_graph_splitter"
  val numInstances = 2
  val partitioner = new HashSourceMapper(numInstances, i => i % numInstances)
  val splitter = new GraphFilesSplitter[Int](tmpDir, partitioner, reader)

  "splitter" should {
    "make appropriate output files and directories" in {
      val tmpd = new File(tmpDir)
      try {
        splitter.splitGraph()
        val subdirs = tmpd.list()
        val expectedSubDirs = (0 until numInstances).map(i => "instance_" + i).toList.sorted
        subdirs.toList.sorted shouldEqual expectedSubDirs
        val expectedFiles = (0 until reader.iterableSeq.length).map(_.toString).toList.sorted
        subdirs foreach { s =>
          val files = new File(tmpDir + "/" + s).list()
          files.toList.sorted shouldEqual expectedFiles
        }
      } finally {
        // Always clean up, even when an assertion fails, so that stale output from
        // one run cannot influence the next one.
        if (tmpd.exists()) {
          FileUtils.forceDeletePath(tmpd)
        }
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 37 additions & 0 deletions
37
cassovary-examples/src/main/scala/cross-partitioning/CrossPartitioning.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
* Copyright 2015 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
|
||
import com.twitter.app.Flags | ||
import com.twitter.cassovary.graph.distributed.{GraphFilesSplitter, HashSourceAndDestMapper} | ||
import com.twitter.cassovary.util.io.AdjacencyListGraphReader | ||
|
||
/**
 * Example command-line application that splits the graph found in the input directory
 * into `n` subgraphs written under the output directory, using
 * `HashSourceAndDestMapper` as the partitioner.
 *
 * Flags: `-n` number of instances, `-i` input graph directory, `-o` output subgraphs
 * directory, `-h` print usage and exit without splitting.
 */
object CrossPartitioning extends App {

  val flags = new Flags("Cross Partitioning")
  val numInstances = flags("n", 10, "Number of instances/shards")
  val inputGraphDir = flags("i", "/tmp/input-graph", "Input graph directory")
  val subgraphsDir = flags("o", "/tmp/output-graph", "Output subgraphs directory")
  val helpFlag = flags("h", false, "Print usage")
  flags.parseArgs(args)

  // NOTE(review): the file-name argument "toy_6nodes_adj" is hard-coded and the third
  // argument is passed as null; confirm against the reader's signature whether a flag
  // and a default value would be more appropriate.
  // Lazy so that nothing is constructed when only usage is requested.
  lazy val reader = AdjacencyListGraphReader.forIntIds(inputGraphDir(), "toy_6nodes_adj", null)

  /** Identity function handed to the partitioner as its node hash function. */
  def hashNodeFn(i: Int): Int = i

  lazy val partitioner = new HashSourceAndDestMapper(numInstances(), hashNodeFn)
  lazy val splitter = new GraphFilesSplitter[Int](subgraphsDir(), partitioner, reader)

  if (helpFlag()) {
    // Honor the declared "Print usage" flag instead of silently ignoring it.
    println(flags.usage)
  } else {
    println(s"Now splitting graph in ${inputGraphDir()} into ${numInstances()} subgraphs.")
    splitter.splitGraph()
    println("Split is complete.")
  }
}
17 changes: 17 additions & 0 deletions
17
cassovary-examples/src/main/scala/cross-partitioning/create_info_all_instances.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash
#
# Builds two summary files under <OUTPUT_GRAPH_DIR>/all_instances from the text
# files whose names start with <INPUT_GRAPH_FILES_PREFIX>:
#   outdegrees.txt - every input line that contains two characters separated by a
#                    single space (presumably the "<id> <outdegree>" header lines of
#                    an adjacency-list graph - confirm against the input format)
#   indegrees.txt  - for every input line consisting only of digits, "<id> <count>",
#                    where <count> is the number of times that line occurs
#
# Usage: create_info_all_instances.sh INPUT_GRAPH_FILES_PREFIX OUTPUT_GRAPH_DIR

if [ "$#" -lt 2 ]; then
  # Without both arguments the paths below would silently resolve against "/" and "*".
  echo "Usage: $0 INPUT_GRAPH_FILES_PREFIX OUTPUT_GRAPH_DIR" >&2
  exit 1
fi

INPUT_GRAPH_FILES_PREFIX=$1
OUTPUT_GRAPH_DIR=$2

ALL_INSTANCES_SUBDIR="$OUTPUT_GRAPH_DIR/all_instances"
OUTDEGREES="$ALL_INSTANCES_SUBDIR/outdegrees.txt"
INDEGREES="$ALL_INSTANCES_SUBDIR/indegrees.txt"

# Variables are quoted throughout so that paths containing spaces keep working;
# the trailing * stays outside the quotes so the shell still expands the prefix glob.
mkdir -p "$ALL_INSTANCES_SUBDIR" || exit 1
echo "Creating Outdegrees file in $OUTDEGREES ..."
grep -h '. .' "$INPUT_GRAPH_FILES_PREFIX"* > "$OUTDEGREES"

echo "Creating Indegrees file in $INDEGREES ..."
# uniq -c emits "<count> <id>"; the perl one-liner swaps the columns to "<id> <count>".
grep -h '^[0-9][0-9]*$' "$INPUT_GRAPH_FILES_PREFIX"* | sort -S2G | uniq -c | perl -lane 'print $F[1]," ", $F[0]' > "$INDEGREES"
echo Done everything.
|