{
 "cells": [
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "SAM001b - Query Storage Pool from SQL Server Master Pool (2 of 3) - Convert data to parquet\n",
    "===========================================================================================\n",
    "\n",
    "Description\n",
    "-----------\n",
    "\n",
    "In this second part of a three-part tutorial, use Spark to convert a .csv file\n",
    "into a parquet file.\n",
    "\n",
    "### Convert CSV to Parquet using the PySpark kernel\n",
    "\n",
    "First open the .csv file and convert it to a data frame object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "# `spark` is not defined in this notebook; it is expected to be provided by the PySpark kernel.\n",
    "# No `header` option is set on the reader, so toDF() supplies the ten column names positionally.\n",
    "results = (\n",
    "    spark.read\n",
    "    .option(\"inferSchema\", \"true\")\n",
    "    .csv('/tmp/clickstream_data/datasampleCS.csv')\n",
    "    .toDF(\"NumberID\", \"Name\", \"Name2\", \"Price\", \"Discount\", \"Money\", \"Money2\", \"Company\", \"Type\", \"Space\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "Verify the schema using the following command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "results.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "View the first 20 rows of this data using the following command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "results.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "Convert the .csv data to a parquet file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "# Turn off the Hadoop output committer's \"mark successful jobs\" flag so that no\n",
    "# _SUCCESS marker file is written into the output directory.\n",
    "# `sc` is not defined in this notebook; it is expected to be provided by the PySpark kernel.\n",
    "sc._jsc.hadoopConfiguration().set(\"mapreduce.fileoutputcommitter.marksuccessfuljobs\", \"false\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "# \"overwrite\" replaces any existing output at this path, so the cell can be re-run.\n",
    "results.write.mode(\"overwrite\").parquet('/tmp/clickstream_data_parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "Verify the parquet file using the following commands."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "result_parquet = spark.read.parquet('/tmp/clickstream_data_parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "result_parquet.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"tags": []},
   "outputs": [],
   "source": [
    "print(\"Notebook execution is complete.\")"
   ]
  }
 ],
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {
   "name": "pysparkkernel",
   "display_name": "PySpark"
  },
  "pansop": {
   "related": "",
   "test": {
    "strategy": "",
    "types": null,
    "disable": {
     "reason": "",
     "workitems": null,
     "types": null
    }
   },
   "target": {
    "current": "",
    "final": ""
   },
   "internal": {
    "parameters": null,
    "symlink": false
   },
   "timeout": "0"
  },
  "language_info": {
   "codemirror_mode": "{ Name: \"\", Version: \"\"}",
   "file_extension": "",
   "mimetype": "",
   "name": "",
   "nbconvert_exporter": "",
   "pygments_lexer": "",
   "version": ""
  },
  "widgets": []
 }
}