diff --git a/gradle.properties b/gradle.properties index 94b2cbe..e3a119e 100644 --- a/gradle.properties +++ b/gradle.properties @@ -35,6 +35,7 @@ flinkMlVersion = 2.3.0 flinkStatefunVersion = 3.2.0 groovy3Version = 3.0.21 groovy4Version = 4.0.21 +groovy5Version = 5.0.0-alpha-8 igniteVersion = 2.16.0 igniteMlVersion = 2.15.0 jacksonVersion = 2.17.1 diff --git a/subprojects/WhiskeySpark/build.gradle b/subprojects/WhiskeySpark/build.gradle index 0f04b89..0faad04 100644 --- a/subprojects/WhiskeySpark/build.gradle +++ b/subprojects/WhiskeySpark/build.gradle @@ -31,11 +31,10 @@ tasks.named('run').configure { } dependencies { - implementation "org.apache.groovy:groovy:$groovy4Version" + implementation "org.apache.groovy:groovy:$groovy5Version" implementation "org.apache.spark:spark-mllib_$sparkVariant:$sparkVersion" implementation "org.apache.spark:spark-sql_$sparkVariant:$sparkVersion" implementation "com.fasterxml.jackson:jackson-bom:$jacksonVersion" - runtimeOnly "org.apache.spark:spark-core_$sparkVariant:$sparkVersion" } tasks.register('versionInfo') { diff --git a/subprojects/WhiskeySpark/src/main/groovy/WhiskeySpark.groovy b/subprojects/WhiskeySpark/src/main/groovy/WhiskeySpark.groovy index 871173b..2c1fb8b 100644 --- a/subprojects/WhiskeySpark/src/main/groovy/WhiskeySpark.groovy +++ b/subprojects/WhiskeySpark/src/main/groovy/WhiskeySpark.groovy @@ -13,44 +13,50 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -//@Grab("com.fasterxml.jackson.module:jackson-module-scala_2.11:2.11.1;transitive=false") -//@Grab('org.apache.spark:spark-sql_2.11:2.4.7') -//@Grab('org.apache.spark:spark-mllib_2.11:2.4.7') -//@GrabExclude("commons-codec:commons-codec:1.10") -//@GrabExclude("javax.xml.stream:stax-api:1.0-2") -//@Grab("commons-io:commons-io:2.10.0") - import org.apache.spark.ml.clustering.KMeans import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.Row import static org.apache.spark.sql.SparkSession.builder static main(args) { - def spark = builder().config('spark.master', 'local[8]').appName('Whiskey').orCreate - spark.sparkContext().logLevel = 'WARN' - def file = WhiskeySpark.classLoader.getResource('whiskey.csv').file - int k = 5 - Dataset rows = spark.read().format('com.databricks.spark.csv') - .options('header': 'true', 'inferSchema': 'true').load(file) - //def colNames = rows.columns().toList().minus(extras).parallelStream().toArray(String[]::new) + var spark = builder().config('spark.master', 'local[8]').appName('Whiskey').orCreate + spark.sparkContext().logLevel = 'WARN' // quieten logging once we've started + var file = WhiskeySpark.classLoader.getResource('whiskey.csv').file + + var rows = spark.read().format('com.databricks.spark.csv') + .options(header: 'true', inferSchema: 'true').load(file) String[] colNames = rows.columns().toList() - ['RowID', 'Distillery'] - def assembler = new VectorAssembler(inputCols: colNames, outputCol: 'features') - Dataset dataset = assembler.transform(rows) - def clusterer = new KMeans(k: k, seed: 1L) - def model = clusterer.fit(dataset) + var assembler = new VectorAssembler(inputCols: colNames, outputCol: 'features') + var dataset = assembler.transform(rows) + var kmeans = new KMeans(k: 5, seed: 1L) + var model = kmeans.fit(dataset) println '\nCluster centers:' model.clusterCenters().each { println it.values().collect { sprintf '%.2f', it }.join(', ') } println() - spark.sparkContext().logLevel = 'INFO' + var result = model.transform(dataset) + var clusters = result.toLocalIterator().collect { row -> + [row.getAs('prediction'), row.getAs('Distillery')] + }.groupBy { it[0] }.collectValues { it*.get(1) } + clusters.each { k, v -> println "Cluster$k: ${v.join(', ')}"} + println() + spark.sparkContext().logLevel = 'INFO' // logging back to normal spark.stop() } /* +24/05/26 10:55:38 INFO SparkContext: Running Spark version 3.5.1 +... Cluster centers: -1.73, 2.35, 1.58, 0.81, 0.19, 1.15, 1.42, 0.81, 1.23, 1.77, 1.23, 1.31 -2.00, 1.00, 3.00, 0.00, 0.00, 0.00, 3.00, 1.00, 0.00, 2.00, 2.00, 2.00 -2.86, 2.38, 1.52, 0.05, 0.00, 1.95, 1.76, 2.05, 1.81, 2.05, 2.19, 1.71 -1.53, 2.38, 1.06, 0.16, 0.03, 1.09, 1.00, 0.50, 1.53, 1.75, 2.13, 2.28 -3.67, 1.50, 3.67, 3.33, 0.67, 0.17, 1.67, 0.50, 1.17, 1.33, 1.17, 0.17 - */ +2.89, 2.42, 1.53, 0.05, 0.00, 1.84, 1.58, 2.11, 2.11, 2.11, 2.26, 1.58 +1.45, 2.35, 1.06, 0.26, 0.06, 0.84, 1.13, 0.45, 1.26, 1.65, 2.19, 2.10 +1.83, 3.17, 1.00, 0.33, 0.17, 1.00, 0.67, 0.83, 0.83, 1.50, 0.50, 1.50 +3.00, 1.50, 3.00, 2.80, 0.50, 0.30, 1.40, 0.50, 1.50, 1.50, 1.30, 0.50 +1.85, 2.20, 1.70, 0.40, 0.10, 1.85, 1.80, 1.00, 1.35, 2.00, 1.40, 1.85 + +Cluster0: Aberfeldy, Aberlour, Auchroisk, Balmenach, BenNevis, Benrinnes, BlairAthol, Dailuaine, Dalmore, Edradour, Glendronach, Glendullan, Glenfarclas, Glenrothes, Longmorn, Macallan, Mortlach, RoyalLochnagar, Strathisla +Cluster1: AnCnoc, Auchentoshan, Aultmore, Balblair, Benriach, Bladnoch, Bunnahabhain, Cardhu, Craigganmore, Dufftown, GlenElgin, GlenGrant, GlenKeith, GlenMoray, Glenallachie, Glenfiddich, Glengoyne, Glenkinchie, Glenlossie, Glenmorangie, Linkwood, Loch Lomond, Mannochmore, RoyalBrackla, Speyside, Strathmill, Tamdhu, Tamnavulin, Teaninich, Tobermory, Tullibardine +Cluster3: Ardbeg, Caol Ila, Clynelish, GlenScotia, Isle of Jura, Lagavulin, Laphroig, Oban, OldPulteney, Talisker +Cluster4: Ardmore, Belvenie, Benromach, Bowmore, Bruichladdich, Craigallechie, Dalwhinnie, Deanston, GlenGarioch, GlenOrd, Glenlivet, Glenturret, Highland Park, Inchgower, Knochando, OldFettercairn, Scapa, Springbank, Tomatin, Tomintoul +Cluster2: ArranIsleOf, GlenDeveronMacduff, GlenSpey, Miltonduff, Speyburn, Tomore +... +24/05/26 10:55:51 INFO SparkContext: Successfully stopped SparkContext +*/