pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the "spark" module: a Spring Boot application
  that depends on Apache Spark core and SQL (Scala 2.12 builds).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Versions of the Spring Boot artifacts below are inherited from this parent. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/>
    </parent>

    <groupId>com.sp</groupId>
    <artifactId>spark</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spark</name>
    <description>spark</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Apache Spark; the _2.12 suffix is the Scala binary version and must match across Spark artifacts. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.1.1</version>
        </dependency>

        <!--
          NOTE(review): Janino is pinned explicitly, presumably to override the
          version managed by the Spring Boot parent so that it stays compatible
          with Spark SQL code generation. Confirm before changing.
        -->
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>janino</artifactId>
            <version>3.0.8</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <!-- Lombok is excluded from what the plugin packages. -->
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
SparkApplication.java
package com. sp. spark ; import org. apache. spark. SparkConf ;
import org. apache. spark. sql. Dataset ;
import org. apache. spark. sql. Row ;
import org. apache. spark. sql. SQLContext ;
import org. apache. spark. sql. SaveMode ;
import org. apache. spark. sql. SparkSession ;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Command-line entry point that reads a CSV file with a local Spark session
 * and writes its rows back out as gzip-compressed Parquet.
 */
@SpringBootApplication
public class SparkApplication {

    /**
     * Reads {@code D://test.csv} and writes it as Parquet to {@code D://parquet},
     * replacing any output that already exists there.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("PTableCol1k")
                .setMaster("local[*]");

        SparkSession sparkSession = SparkSession.builder()
                .config(sparkConf)
                .getOrCreate();
        try {
            // Session-level setting: Parquet files written by this session use gzip.
            sparkSession.conf().set("spark.sql.parquet.compression.codec", "gzip");

            Dataset<Row> csvRows = sparkSession.read().csv("D://test.csv");

            // mode(...) only configures the DataFrameWriter it is called on, and
            // each write() call creates a fresh writer. The save mode must
            // therefore be chained onto the same writer that performs the
            // save, otherwise the default mode applies and the job fails when
            // the target directory already exists.
            csvRows.write()
                    .mode(SaveMode.Overwrite)
                    .parquet("D://parquet");
        } finally {
            // Always stop the session so the local Spark context is shut down,
            // even when the read or write throws.
            sparkSession.stop();
        }
    }
}