代码:
def find_collinear(rdd):
    """Group point pairs that share the same slope.

    Args:
        rdd: An RDD whose elements are pairs ``(p1, p2)`` of 2-D points, in
            the form accepted by ``find_slope``.

    Returns:
        An RDD in which each element is a list of the input pairs whose
        slope (as computed by ``find_slope``) is equal.

    NOTE(review): the grouping key is the slope alone, so pairs lying on
    distinct parallel lines end up in the same group -- confirm this is the
    intended notion of "collinear" for the caller.
    """
    # Key every pair by its slope; find_slope returns ((p1, slope), p2).
    keyed = rdd.map(lambda pair: (find_slope(pair)[0][1], pair))
    # Materialize each group's iterable as a plain list.
    grouped = keyed.groupByKey().mapValues(list)
    # Drop the slope key and keep only the grouped pairs.
    return grouped.values()
def find_slope(x):
    """Attach the slope of a point pair to its first point.

    Args:
        x: A pair ``(p1, p2)`` of 2-D points, each indexable as ``(x, y)``.

    Returns:
        ``((p1, slope), p2)`` where ``slope`` is ``(y2 - y1) / (x2 - x1)``,
        or the string ``"inf"`` when both points have the same x coordinate.
    """
    p1, p2 = x[0], x[1]
    if p1[0] == p2[0]:
        # Same x coordinate (vertical line or identical points): the
        # division below would raise ZeroDivisionError, so use a sentinel.
        slope = "inf"
    else:
        slope = (p2[1] - p1[1]) / (p2[0] - p1[0])
    return ((p1, slope), p2)
# Sample points used to exercise find_collinear.
points = [(4, 2), (2, 1), (-3, 4), (6, 3)]
# Every ordered pair of distinct sample points, first point varying slowest.
test_rdd = sc.parallelize(
    [(first, second) for first in points for second in points if first != second]
)
temp1 = find_collinear(test_rdd).collect()
实际输出:
[[((4, 2), (2, 1)), ((4, 2), (6, 3)),
((2, 1), (4, 2)), ((2, 1), (6, 3)),
((6, 3), (4, 2)), ((6, 3), (2, 1))],
[((4, 2), (-3, 4)), ((-3, 4), (4, 2))],
[((2, 1), (-3, 4)), ((-3, 4), (2, 1))],
[((-3, 4), (6, 3)), ((6, 3), (-3, 4))]
]
期望输出:
[((6, 3), (4, 2), (2, 1)), ((4, 2), (-3, 4)), ((-3, 4), (2, 1)), ((6, 3), (-3, 4))]
如何修改代码，才能得到上面的期望输出，而不是当前的实际输出？