I have this code; it generates 2,000,000 points uniformly distributed in a bounding box and then partitions the points based on some criteria.
import numpy as np
from draw import draw
import time

# Column indices into the (N, 2) points array.
X = 0
Y = 1
N = 2000000

# N points uniformly distributed in the square [-10, 10] x [-10, 10].
points = np.random.uniform(-10, 10, (N, 2))

# perf_counter() is the right clock for benchmarking: monotonic and
# high-resolution, unlike time.time().
start = time.perf_counter()

# Extreme points along each axis; these span the quadrilateral whose
# "outside" regions we extract below.
max_x_index = np.argmax(points[:, X])
max_y_index = np.argmax(points[:, Y])
min_x_index = np.argmin(points[:, X])
min_y_index = np.argmin(points[:, Y])
p_right = points[max_x_index]
p_top = points[max_y_index]
p_left = points[min_x_index]
p_bottom = points[min_y_index]

# For each corner region, keep the points strictly outside the edge joining
# two extreme points: the line through p_a and p_b is solved for x as a
# function of y, and each point's x is compared against it (vectorized
# boolean-mask indexing).
top_right = points[
    points[:, X] > ((points[:, Y] - p_top[Y]) / (p_right[Y] - p_top[Y])) * (p_right[X] - p_top[X]) + p_top[X]]
top_left = points[
    points[:, X] < ((points[:, Y] - p_top[Y]) / (p_left[Y] - p_top[Y])) * (p_left[X] - p_top[X]) + p_top[X]]
bottom_right = points[
    points[:, X] > ((points[:, Y] - p_bottom[Y]) / (p_right[Y] - p_bottom[Y])) * (p_right[X] - p_bottom[X]) + p_bottom[X]]
bottom_left = points[
    points[:, X] < ((points[:, Y] - p_bottom[Y]) / (p_left[Y] - p_bottom[Y])) * (p_left[X] - p_bottom[X]) + p_bottom[X]]

end = time.perf_counter()
print(end - start)
The output is usually about 0.09 seconds. The most time-consuming section is the last four computations, which produce top_right, top_left, bottom_right, and bottom_left. I rewrote the code as follows:
import numpy as np
from draw import draw
import time
import multiprocessing

# Column indices into the (N, 2) points array.
X = 0
Y = 1
N = 2000000


def _outside_edge(pts, p_a, p_b, greater):
    """Return the rows of pts strictly outside the edge from p_a to p_b.

    The line through p_a and p_b is solved for x as a function of y; rows
    are kept where their x is greater (greater=True) or less (greater=False)
    than the edge's x at their y.
    """
    edge_x = ((pts[:, Y] - p_a[Y]) / (p_b[Y] - p_a[Y])) * (p_b[X] - p_a[X]) + p_a[X]
    mask = pts[:, X] > edge_x if greater else pts[:, X] < edge_x
    return pts[mask]


# Worker functions. All state is passed explicitly as arguments instead of
# being read from module globals: under the "spawn" start method (the default
# on Windows and macOS) a child process re-imports this module and would NOT
# see globals assigned in the parent's main block.
def set_top_right(pts, p_top, p_right, out):
    """Append to out the points outside the p_top -> p_right edge."""
    out.extend(_outside_edge(pts, p_top, p_right, greater=True))


def set_top_left(pts, p_top, p_left, out):
    """Append to out the points outside the p_top -> p_left edge."""
    out.extend(_outside_edge(pts, p_top, p_left, greater=False))


def set_bottom_right(pts, p_bottom, p_right, out):
    """Append to out the points outside the p_bottom -> p_right edge."""
    out.extend(_outside_edge(pts, p_bottom, p_right, greater=True))


def set_bottom_left(pts, p_bottom, p_left, out):
    """Append to out the points outside the p_bottom -> p_left edge."""
    out.extend(_outside_edge(pts, p_bottom, p_left, greater=False))


# The __main__ guard is REQUIRED for multiprocessing: without it, each
# spawned child re-executes the module top level, re-creating the Manager
# and re-launching processes (crash or fork bomb on spawn platforms).
if __name__ == "__main__":
    points = np.random.uniform(-10, 10, (N, 2))

    # Manager-backed lists are shared across processes, but every element
    # appended to them is pickled/unpickled — significant overhead for
    # large ndarrays.
    manager = multiprocessing.Manager()
    top_right = manager.list()
    top_left = manager.list()
    bottom_right = manager.list()
    bottom_left = manager.list()

    # perf_counter() is the right clock for benchmarking: monotonic and
    # high-resolution, unlike time.time().
    start = time.perf_counter()

    # Extreme points along each axis.
    p_right = points[np.argmax(points[:, X])]
    p_top = points[np.argmax(points[:, Y])]
    p_left = points[np.argmin(points[:, X])]
    p_bottom = points[np.argmin(points[:, Y])]

    # NOTE(review): each child receives its own copy of `points`
    # (pickled under spawn), which adds to the startup cost being measured.
    p1 = multiprocessing.Process(target=set_top_right, args=(points, p_top, p_right, top_right))
    p2 = multiprocessing.Process(target=set_top_left, args=(points, p_top, p_left, top_left))
    p3 = multiprocessing.Process(target=set_bottom_right, args=(points, p_bottom, p_right, bottom_right))
    p4 = multiprocessing.Process(target=set_bottom_left, args=(points, p_bottom, p_left, bottom_left))
    for p in (p1, p2, p3, p4):
        p.start()
    for p in (p1, p2, p3, p4):
        p.join()

    end = time.perf_counter()
    print(end - start)
and surprisingly it became worse: about 0.15 seconds. I'm fairly new to Python, but as far as I can tell both approaches run the same computation and there are no I/O operations. My laptop has an 11th-generation Intel Core i5, and I expected each core to take one of the processes and make the whole thing faster. So why is this slower?
The four calls `p1.start()`, `p2.start()`, `p3.start()`, `p4.start()` take 0.14 seconds. How long do `set_top_right`, `set_top_left`, `set_bottom_right`, and `set_bottom_left` themselves take? If each is only ~0.01–0.02 seconds, then most of what you are measuring is process-startup overhead.

You are extending a `manager.list` with a NumPy `ndarray`. That has a ton of serialization overhead. Honestly, kudos to the shared-memory implementation for only being slightly slower. Vectorized `numpy` is often orders of magnitude faster than working with a `list` — especially a `manager.list`, which requires serialization and deserialization for every transfer. Note that you are also still copying `points` and all the other state into each subprocess. Using a `manager.list` is unlikely to beat serial numpy.