forked from amplab/graphx
Merge pull request amplab#68 from mosharaf/master
Faster and stable/reliable broadcast

HttpBroadcast is noticeably slow, but the alternatives (TreeBroadcast or BitTorrentBroadcast) are notoriously unreliable. Their main problem is that they try to manage the memory for the pieces of a broadcast themselves. Right now the BroadcastManager does not know which machines the tasks reading a broadcast variable are running on, or when they have finished. Consequently, we try to guess and often guess wrong, which blows up memory usage and kills or hangs jobs.

This very simple implementation solves the problem by not trying to manage the intermediate pieces at all; instead, it offloads that duty to the BlockManager, which is quite good at juggling blocks. Otherwise, it is very similar to the BitTorrentBroadcast implementation (without the fancy optimizations), and it runs much faster than the HttpBroadcast we have right now.

I've been using this for another project for the last couple of weeks, and just today did some benchmarking against the Http one. The following shows the improvements for increasing broadcast sizes on cold runs. Each line represents a different number of receivers.

![fix-bc-first](https://f.cloud.github.com/assets/232966/1349342/ffa149e4-36e7-11e3-9fa6-c74555829356.png)

After the first broadcast is over, i.e., after the JVM is warmed up and, for HttpBroadcast, the server is already running (I think), the following are the improvements for warm runs.

![fix-bc-succ](https://f.cloud.github.com/assets/232966/1349352/5a948bae-36e8-11e3-98ce-34f19ebd33e0.jpg)

The curves are not as nice as the cold runs, but the improvements are obvious, especially for larger broadcasts and more receivers.

Depending on how this goes, we should deprecate and/or remove the old TreeBroadcast and BitTorrentBroadcast implementations, and hopefully SPARK-889 will no longer be necessary.
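For context, here is a minimal sketch of how a job might opt into the new mechanism. It assumes the Spark 0.8-era convention of selecting the broadcast implementation through the `spark.broadcast.factory` system property; the `spark.broadcast.blockSize` property appears in the file below, while the data sizes and names used here are purely illustrative.

```scala
import org.apache.spark.SparkContext

object TorrentBroadcastExample {
  def main(args: Array[String]) {
    // Select the BlockManager-backed broadcast instead of the default HttpBroadcast.
    // (Property name assumed from the Spark 0.8-era BroadcastFactory convention.)
    System.setProperty("spark.broadcast.factory",
      "org.apache.spark.broadcast.TorrentBroadcastFactory")
    // Piece size in KB; the driver blockifies the value into 4 MB pieces by default.
    System.setProperty("spark.broadcast.blockSize", "4096")

    val sc = new SparkContext("local[4]", "torrent-broadcast-demo")

    // A large lookup table we want on every worker exactly once.
    val table = (0 until 100000).map(i => (i, i.toString)).toMap
    val bcTable = sc.broadcast(table)

    // Tasks read bcTable.value; the pieces are fetched from the BlockManager,
    // and the merged copy is cached locally for later tasks on the same worker.
    val hits = sc.parallelize(0 until 1000, 8)
      .map(i => bcTable.value.getOrElse(i, "missing"))
      .count()

    println("looked up " + hits + " keys")
    sc.stop()
  }
}
```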
Showing 7 changed files with 328 additions and 12 deletions.
core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala (247 additions, 0 deletions)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.broadcast

import java.io._

import scala.math
import scala.util.Random

import org.apache.spark._
import org.apache.spark.storage.{BroadcastBlockId, BroadcastHelperBlockId, StorageLevel}
import org.apache.spark.util.Utils


private[spark] class TorrentBroadcast[T](@transient var value_ : T, isLocal: Boolean, id: Long)
  extends Broadcast[T](id) with Logging with Serializable {

  def value = value_

  def broadcastId = BroadcastBlockId(id)

  TorrentBroadcast.synchronized {
    SparkEnv.get.blockManager.putSingle(broadcastId, value_, StorageLevel.MEMORY_AND_DISK, false)
  }

  @transient var arrayOfBlocks: Array[TorrentBlock] = null
  @transient var totalBlocks = -1
  @transient var totalBytes = -1
  @transient var hasBlocks = 0

  if (!isLocal) {
    sendBroadcast()
  }

  def sendBroadcast() {
    var tInfo = TorrentBroadcast.blockifyObject(value_)

    totalBlocks = tInfo.totalBlocks
    totalBytes = tInfo.totalBytes
    hasBlocks = tInfo.totalBlocks

    // Store meta-info
    val metaId = BroadcastHelperBlockId(broadcastId, "meta")
    val metaInfo = TorrentInfo(null, totalBlocks, totalBytes)
    TorrentBroadcast.synchronized {
      SparkEnv.get.blockManager.putSingle(
        metaId, metaInfo, StorageLevel.MEMORY_AND_DISK, true)
    }

    // Store individual pieces
    for (i <- 0 until totalBlocks) {
      val pieceId = BroadcastHelperBlockId(broadcastId, "piece" + i)
      TorrentBroadcast.synchronized {
        SparkEnv.get.blockManager.putSingle(
          pieceId, tInfo.arrayOfBlocks(i), StorageLevel.MEMORY_AND_DISK, true)
      }
    }
  }

  // Called by JVM when deserializing an object
  private def readObject(in: ObjectInputStream) {
    in.defaultReadObject()
    TorrentBroadcast.synchronized {
      SparkEnv.get.blockManager.getSingle(broadcastId) match {
        case Some(x) =>
          value_ = x.asInstanceOf[T]

        case None =>
          val start = System.nanoTime
          logInfo("Started reading broadcast variable " + id)

          // Initialize @transient variables that will receive garbage values from the master.
          resetWorkerVariables()

          if (receiveBroadcast(id)) {
            value_ = TorrentBroadcast.unBlockifyObject[T](arrayOfBlocks, totalBytes, totalBlocks)

            // Store the merged copy in cache so that the next worker doesn't need to rebuild it.
            // This creates a tradeoff between memory usage and latency.
            // Storing copy doubles the memory footprint; not storing doubles deserialization cost.
            SparkEnv.get.blockManager.putSingle(
              broadcastId, value_, StorageLevel.MEMORY_AND_DISK, false)

            // Remove arrayOfBlocks from memory once value_ is on local cache
            resetWorkerVariables()
          } else {
            logError("Reading broadcast variable " + id + " failed")
          }

          val time = (System.nanoTime - start) / 1e9
          logInfo("Reading broadcast variable " + id + " took " + time + " s")
      }
    }
  }

  private def resetWorkerVariables() {
    arrayOfBlocks = null
    totalBytes = -1
    totalBlocks = -1
    hasBlocks = 0
  }

  def receiveBroadcast(variableID: Long): Boolean = {
    // Receive meta-info
    val metaId = BroadcastHelperBlockId(broadcastId, "meta")
    var attemptId = 10
    while (attemptId > 0 && totalBlocks == -1) {
      TorrentBroadcast.synchronized {
        SparkEnv.get.blockManager.getSingle(metaId) match {
          case Some(x) =>
            val tInfo = x.asInstanceOf[TorrentInfo]
            totalBlocks = tInfo.totalBlocks
            totalBytes = tInfo.totalBytes
            arrayOfBlocks = new Array[TorrentBlock](totalBlocks)
            hasBlocks = 0

          case None =>
            Thread.sleep(500)
        }
      }
      attemptId -= 1
    }
    if (totalBlocks == -1) {
      return false
    }

    // Receive actual blocks
    val recvOrder = new Random().shuffle(Array.iterate(0, totalBlocks)(_ + 1).toList)
    for (pid <- recvOrder) {
      val pieceId = BroadcastHelperBlockId(broadcastId, "piece" + pid)
      TorrentBroadcast.synchronized {
        SparkEnv.get.blockManager.getSingle(pieceId) match {
          case Some(x) =>
            arrayOfBlocks(pid) = x.asInstanceOf[TorrentBlock]
            hasBlocks += 1
            SparkEnv.get.blockManager.putSingle(
              pieceId, arrayOfBlocks(pid), StorageLevel.MEMORY_AND_DISK, true)

          case None =>
            throw new SparkException("Failed to get " + pieceId + " of " + broadcastId)
        }
      }
    }

    (hasBlocks == totalBlocks)
  }

}

private object TorrentBroadcast
  extends Logging {

  private var initialized = false

  def initialize(_isDriver: Boolean) {
    synchronized {
      if (!initialized) {
        initialized = true
      }
    }
  }

  def stop() {
    initialized = false
  }

  val BLOCK_SIZE = System.getProperty("spark.broadcast.blockSize", "4096").toInt * 1024

  def blockifyObject[T](obj: T): TorrentInfo = {
    val byteArray = Utils.serialize[T](obj)
    val bais = new ByteArrayInputStream(byteArray)

    var blockNum = (byteArray.length / BLOCK_SIZE)
    if (byteArray.length % BLOCK_SIZE != 0)
      blockNum += 1

    var retVal = new Array[TorrentBlock](blockNum)
    var blockID = 0

    for (i <- 0 until (byteArray.length, BLOCK_SIZE)) {
      val thisBlockSize = math.min(BLOCK_SIZE, byteArray.length - i)
      var tempByteArray = new Array[Byte](thisBlockSize)
      val hasRead = bais.read(tempByteArray, 0, thisBlockSize)

      retVal(blockID) = new TorrentBlock(blockID, tempByteArray)
      blockID += 1
    }
    bais.close()

    var tInfo = TorrentInfo(retVal, blockNum, byteArray.length)
    tInfo.hasBlocks = blockNum

    return tInfo
  }

  def unBlockifyObject[T](arrayOfBlocks: Array[TorrentBlock],
                          totalBytes: Int,
                          totalBlocks: Int): T = {
    var retByteArray = new Array[Byte](totalBytes)
    for (i <- 0 until totalBlocks) {
      System.arraycopy(arrayOfBlocks(i).byteArray, 0, retByteArray,
        i * BLOCK_SIZE, arrayOfBlocks(i).byteArray.length)
    }
    Utils.deserialize[T](retByteArray, Thread.currentThread.getContextClassLoader)
  }

}

private[spark] case class TorrentBlock(
    blockID: Int,
    byteArray: Array[Byte])
  extends Serializable

private[spark] case class TorrentInfo(
    @transient arrayOfBlocks: Array[TorrentBlock],
    totalBlocks: Int,
    totalBytes: Int)
  extends Serializable {

  @transient var hasBlocks = 0
}

private[spark] class TorrentBroadcastFactory
  extends BroadcastFactory {

  def initialize(isDriver: Boolean) { TorrentBroadcast.initialize(isDriver) }

  def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
    new TorrentBroadcast[T](value_, isLocal, id)

  def stop() { TorrentBroadcast.stop() }
}
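As a quick sanity check on the chunking above, the sketch below mirrors the arithmetic in `blockifyObject`: the serialized payload is cut into ceil(length / BLOCK_SIZE) pieces, and each piece is stored under its own `BroadcastHelperBlockId(broadcastId, "piece" + i)`. The 100 MB payload size is an illustrative number, not something from the commit.

```scala
object PieceCountSketch {
  // Mirrors TorrentBroadcast.BLOCK_SIZE: the property value is in KB, converted to bytes.
  val blockSizeKb = System.getProperty("spark.broadcast.blockSize", "4096").toInt
  val BLOCK_SIZE = blockSizeKb * 1024

  def numPieces(serializedLength: Int): Int = {
    // Same rounding as blockifyObject: one extra piece for any remainder.
    var blockNum = serializedLength / BLOCK_SIZE
    if (serializedLength % BLOCK_SIZE != 0) blockNum += 1
    blockNum
  }

  def main(args: Array[String]) {
    // e.g. a 100 MB serialized value with the default 4 MB pieces -> 25 pieces
    val hundredMb = 100 * 1024 * 1024
    println(numPieces(hundredMb) + " pieces for " + hundredMb + " bytes")
  }
}
```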