-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add JDBC partition strategy with bucket-based implementation #236
- Loading branch information
Showing
9 changed files
with
203 additions
and
136 deletions.
There are no files selected for viewing
49 changes: 49 additions & 0 deletions
49
eel-components/src/main/scala/io/eels/component/jdbc/BucketPartitionStrategy.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.{Connection, PreparedStatement} | ||
|
||
import io.eels.Part | ||
|
||
/**
  * Partitions a JDBC query into `numberOfPartitions` contiguous buckets over the values of an
  * integer column, producing one part per bucket.
  *
  * @param columnName         column whose value decides which bucket a row belongs to
  * @param numberOfPartitions number of buckets (and therefore parts) to generate
  * @param min                smallest column value covered, inclusive
  * @param max                largest column value covered, inclusive
  */
case class BucketPartitionStrategy(columnName: String,
                                   numberOfPartitions: Int,
                                   min: Int,
                                   max: Int) extends JdbcPartitionStrategy {

  /**
    * Splits the inclusive interval [min, max] into `numberOfPartitions` consecutive ranges.
    *
    * Every returned range is half-open: `start` is the first value of the bucket and `end` is
    * the first value of the following bucket, so adjacent ranges never share a value.
    */
  def ranges: Seq[Range] = {

    // +1 because both min and max belong to the interval
    val span = max - min + 1
    val gap = span / numberOfPartitions
    // values left over after an even split; the first `surplus` buckets take one extra each
    val surplus = span % numberOfPartitions

    List.tabulate(numberOfPartitions) { k =>
      val start = min + k * gap + Math.min(k, surplus)
      val end = min + ((k + 1) * gap) + Math.min(k + 1, surplus)
      Range(start, end)
    }
  }

  /**
    * Builds one [[JdbcPart]] per bucket by wrapping `query` in a filter on `columnName`.
    *
    * @return one part for each range returned by `ranges`, in the same order
    */
  override def parts(connFn: () => Connection,
                     query: String,
                     bindFn: (PreparedStatement) => Unit,
                     fetchSize: Int,
                     dialect: JdbcDialect): Seq[Part] = {

    ranges.map { range =>

      // range.end is exclusive (it is the next bucket's start), so the upper bound has to be a
      // strict comparison; `<=` would return boundary rows from two adjacent parts.
      val partitionedQuery =
        s"""|SELECT *
            |FROM (
            | SELECT *
            | FROM ( $query )
            |)
            |WHERE ${range.start} <= $columnName AND $columnName < ${range.end}
            |""".stripMargin

      new JdbcPart(connFn, partitionedQuery, bindFn, fetchSize, dialect)
    }
  }
}
|
||
|
||
|
29 changes: 29 additions & 0 deletions
29
eel-components/src/main/scala/io/eels/component/jdbc/HashPartitionStrategy.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.{Connection, PreparedStatement} | ||
|
||
import io.eels.Part | ||
|
||
/**
  * Partitions a JDBC query by evaluating a SQL expression per row and assigning the row to the
  * part whose index equals the expression's value.
  *
  * @param hashExpression     SQL expression spliced into the generated query as `eel_hash_col`
  * @param numberOfPartitions number of parts to generate; part indexes run from 0 until this value
  */
case class HashPartitionStrategy(hashExpression: String,
                                 numberOfPartitions: Int) extends JdbcPartitionStrategy {

  /**
    * Wraps `query` so that only rows whose `eel_hash_col` equals `partNum` are returned.
    *
    * @param partNum index of the part the generated query selects
    * @param query   the original SQL query to wrap
    */
  def partitionedQuery(partNum: Int, query: String): String = {
    val wrapped =
      s"""|SELECT *
          |FROM (
          | SELECT eel_tmp.*, $hashExpression AS eel_hash_col
          | FROM ( $query ) eel_tmp
          |)
          |WHERE eel_hash_col = $partNum
          |"""
    wrapped.stripMargin
  }

  /** Creates one [[JdbcPart]] per part index, each running its own filtered query. */
  override def parts(connFn: () => Connection,
                     query: String,
                     bindFn: (PreparedStatement) => Unit,
                     fetchSize: Int,
                     dialect: JdbcDialect): Seq[Part] =
    (0 until numberOfPartitions).map { partNum =>
      val sql = partitionedQuery(partNum, query)
      new JdbcPart(connFn, sql, bindFn, fetchSize, dialect)
    }
}
53 changes: 53 additions & 0 deletions
53
eel-components/src/main/scala/io/eels/component/jdbc/JdbcPart.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.{Connection, PreparedStatement} | ||
|
||
import com.sksamuel.exts.metrics.Timed | ||
import io.eels.{CloseableIterator, Part, Row} | ||
|
||
import scala.util.Try | ||
|
||
/**
  * A [[Part]] that runs a single JDBC query and exposes its result set as batches of rows.
  *
  * @param connFn    supplies the connection the query runs on; invoked on each call to `iterator()`
  * @param query     SQL text handed to `prepareStatement`
  * @param bindFn    callback applied to the prepared statement before it is executed
  * @param fetchSize used both as the JDBC fetch size and as the number of rows per emitted batch
  * @param dialect   passed to `schemaFor` together with the result set to derive the schema
  */
class JdbcPart(connFn: () => Connection,
               query: String,
               bindFn: (PreparedStatement) => Unit = stmt => (),
               fetchSize: Int = 100,
               dialect: JdbcDialect
              ) extends Part with Timed with JdbcPrimitives {

  /**
    * Opens a connection, executes the query immediately and returns an iterator over the rows,
    * grouped into batches of at most `fetchSize` (the final batch may be smaller).
    */
  override def iterator(): CloseableIterator[Seq[Row]] = new CloseableIterator[Seq[Row]] {

    private val conn = connFn()
    private val stmt = conn.prepareStatement(query)
    stmt.setFetchSize(fetchSize)
    bindFn(stmt)

    private val rs = timed(s"Executing query $query") {
      stmt.executeQuery()
    }

    private val schema = schemaFor(dialect, rs)

    // Best-effort cleanup: each resource is released independently so that a failure closing
    // one does not prevent the others from being closed. The statement is closed explicitly
    // rather than relying on the connection to release it.
    override def close(): Unit = {
      Try { super.close() }
      Try { rs.close() }
      Try { stmt.close() }
      Try { conn.close() }
    }

    override val iterator: Iterator[Seq[Row]] = new Iterator[Row] {

      // true when the cursor has already been advanced onto a row that next() has not consumed
      private var _hasnext = false

      override def hasNext: Boolean = _hasnext || {
        _hasnext = rs.next()
        _hasnext
      }

      override def next(): Row = {
        // advance the cursor if the caller did not call hasNext first, and fail as the
        // Iterator contract requires once the result set is exhausted
        if (!hasNext) throw new NoSuchElementException("No more rows in result set")
        _hasnext = false
        val values = schema.fieldNames().map(name => rs.getObject(name))
        Row(schema, values)
      }

    }.grouped(fetchSize).withPartial(true)
  }
}
13 changes: 13 additions & 0 deletions
13
eel-components/src/main/scala/io/eels/component/jdbc/JdbcPartitionStrategy.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.{Connection, PreparedStatement} | ||
|
||
import io.eels.Part | ||
|
||
/**
  * Strategy for turning one JDBC query into the parts that will read it.
  */
trait JdbcPartitionStrategy {

  /**
    * Produces the parts for the given query.
    *
    * @param connFn    supplies a JDBC connection
    * @param query     the SQL query to be read
    * @param bindFn    callback that receives a prepared statement
    * @param fetchSize fetch size setting handed on to the parts
    * @param dialect   JDBC dialect handed on to the parts
    * @return the parts to read
    */
  def parts(
    connFn: () => Connection,
    query: String,
    bindFn: PreparedStatement => Unit,
    fetchSize: Int,
    dialect: JdbcDialect
  ): Seq[Part]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 0 additions & 46 deletions
46
eel-components/src/main/scala/io/eels/component/jdbc/ResultsetPart.scala
This file was deleted.
Oops, something went wrong.
15 changes: 15 additions & 0 deletions
15
eel-components/src/main/scala/io/eels/component/jdbc/SinglePartitionStrategy.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.{Connection, PreparedStatement} | ||
|
||
import io.eels.Part | ||
|
||
/**
  * Partition strategy that performs no partitioning: the query is read by exactly one part.
  */
case object SinglePartitionStrategy extends JdbcPartitionStrategy {

  /** Wraps the unmodified `query` in a single [[JdbcPart]]. */
  override def parts(connFn: () => Connection,
                     query: String,
                     bindFn: (PreparedStatement) => Unit,
                     fetchSize: Int,
                     dialect: JdbcDialect): List[Part] = {
    val onlyPart = new JdbcPart(connFn, query, bindFn, fetchSize, dialect)
    onlyPart :: Nil
  }
}
36 changes: 36 additions & 0 deletions
36
eel-components/src/test/scala/io/eels/component/jdbc/BucketPartitionTest.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package io.eels.component.jdbc | ||
|
||
import java.sql.DriverManager | ||
|
||
import org.scalatest.{Matchers, WordSpec} | ||
|
||
import scala.util.Random | ||
|
||
/**
  * Tests for [[BucketPartitionStrategy]]: range generation, and reading through a JdbcSource
  * backed by an in-memory H2 table holding 20 random integers.
  */
class BucketPartitionTest extends WordSpec with Matchers {

  private val conn = DriverManager.getConnection("jdbc:h2:mem:bucket_test")
  conn.createStatement().executeUpdate("create table mytable (a integer)")
  (0 until 20).foreach { _ =>
    conn.createStatement().executeUpdate(s"insert into mytable (a) values (${Random.nextInt(10000)})")
  }

  // builds the expected list of inclusive ranges from (first, last) pairs
  private def inclusive(bounds: (Int, Int)*): List[Range] =
    bounds.toList.map { case (first, last) => Range.inclusive(first, last) }

  // source over the test table split into four buckets covering 0 to 10000
  private def bucketedSource =
    JdbcSource(() => conn, "select * from mytable")
      .withPartitionStrategy(BucketPartitionStrategy("a", 4, 0, 10000))

  "BucketPartitionStrategy" should {
    "generate evenly spaced ranges" in {
      BucketPartitionStrategy("a", 10, 2, 29).ranges shouldBe inclusive(
        (2, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19), (20, 22), (23, 25), (26, 27), (28, 29)
      )
      BucketPartitionStrategy("a", 2, 2, 30).ranges shouldBe inclusive((2, 16), (17, 30))
      BucketPartitionStrategy("a", 1, 4, 5).ranges shouldBe inclusive((4, 5))
      BucketPartitionStrategy("a", 1, 4, 4).ranges shouldBe inclusive((4, 4))
      BucketPartitionStrategy("a", 6, 1, 29).ranges shouldBe inclusive(
        (1, 5), (6, 10), (11, 15), (16, 20), (21, 25), (26, 29)
      )
    }
    "return correct number of ranges" in {
      bucketedSource.parts().size shouldBe 4
    }
    "return full and non overlapping data" in {
      bucketedSource.toFrame().collect().size shouldBe 20
    }
  }
}
32 changes: 0 additions & 32 deletions
32
eel-components/src/test/scala/io/eels/component/jdbc/ResultsetPartTest.scala
This file was deleted.
Oops, something went wrong.