# Example: turn every element of an RDD into an (element, 1) pair with map().
# `sc` is expected to be an existing SparkContext provided by the caller/shell.
rdd = sc.parallelize(["b", "a", "c"])
# Map each element to a pair, collect the pairs into a list, and sort them so
# the displayed result is in key order regardless of the input order.
sorted(rdd.map(lambda x: (x, 1)).collect())  # [('a', 1), ('b', 1), ('c', 1)]