Establish a SparkContext for the remaining tests
import pyspark
# Run locally, using all available cores
sc = pyspark.SparkContext('local[*]')
# Distribute a local range as an RDD, then sample five elements without replacement
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)
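takeSample is an action, so it returns an ordinary Python list to the driver. For a reproducible sample it also accepts an optional seed; the value 42 below is an arbitrary choice:
# Same five elements on every run, thanks to the fixed seed
rdd.takeSample(False, 5, seed=42)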
# Test processing of a text file, based on the Spark quick start guide
textFile = sc.textFile("README.md")
Use a Spark action to count the lines in the file
textFile.count()
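Another quick action to try is first(), which returns the first line of the file:
# First line of README.md
textFile.first()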
linesWithSpark = textFile.filter(lambda line: "Spark" in line)
print("Lines with spark: %s" % linesWithSpark.count())
# Find the number of words in the longest line
mostWordsInLine = textFile.map(lambda line: len(line.split())).reduce(lambda a, b: a if (a > b) else b)
mostWordsInLine
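The reduce lambda above simply re-implements a maximum; Python's built-in max does the same job and reads more clearly:
# Equivalent, using the built-in max
textFile.map(lambda line: len(line.split())).reduce(max)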
Count the occurrences of each word in README.md
# Map each word to a count of one, then sum the counts per word
wordCounts = (textFile.flatMap(lambda line: line.split())
                      .map(lambda word: (word, 1))
                      .reduceByKey(lambda a, b: a + b))
wordCounts.collect()
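collect() pulls the entire result set back to the driver, which is fine for a small README but risky for large data. A safer pattern for a quick look is takeOrdered, which returns only the top results; the sketch below takes the ten most frequent words (the cutoff of ten is an arbitrary choice, and the key sorts by descending count):
# Ten most frequent words, sorted by descending count
wordCounts.takeOrdered(10, key=lambda pair: -pair[1])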