Sculpt: OMP -> BLI_task, step II.

This time, the code of all the tools themselves. Not much to say, except that we can also get rid of the ugly OMP cache pre-processing that was needed for multires smoothing. Together with the previous commit, we get about a 5% average speedup on stroke execution (though this varies a lot: up to 30% speedup in rare cases, and in even rarer cases some odd massive slowdowns...). Tech note: we may want to add a feature similar to OMP's 'guided' schedule to our BLI_task threaded loop; I suspect its absence could explain the random massive slowdowns of the new code (very rare, but annoying...).
2016-01-03 23:54:44 +01:00
parent da49ee30b0
commit 2e77c413ab
1 changed files with 790 additions and 658 deletions
--- a/source/blender/editors/sculpt_paint/sculpt.c
+++ b/source/blender/editors/sculpt_paint/sculpt.c
@@ -200,11 +200,6 @@ typedef struct StrokeCache {
 	float clip_tolerance[3];
 	float initial_mouse[2];

-	/* Pre-allocated temporary storage used during smoothing */
-	int num_threads, init_num_threads;
-	float (**tmpgrid_co)[3], (**tmprow_co)[3];
-	float **tmpgrid_mask, **tmprow_mask;
-
 	/* Variants */
 	float radius;
 	float radius_squared;
@@ -1528,36 +1523,71 @@ static float bmesh_neighbor_average_mask(BMVert *v, const int cd_vert_mask_offse
 	}
 }

-static void do_mesh_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *node, float bstrength, int smooth_mask)
+typedef struct SculptDoBrushData {
+	Sculpt *sd;
+	Object *ob;
+	Brush *brush;
+    PBVHNode **nodes;
+
+	/* Data specific to some brushes. */
+	/* Note: even only one or two of those are used at a time, keeping them separated, names help figuring out
+	 *       what it is, and memory overhead is ridiculous anyway... */
+	SculptProjectVector *spvc;
+	float flippedbstrength;
+	float angle;
+	float *offset;
+	float *grab_delta;
+	float *cono;
+	float *area_no;
+	float *area_no_sp;
+	float *area_co;
+	float (*mat)[4];
+	float strength;
+	bool smooth_mask;
+
+	ThreadMutex mutex;
+} SculptDoBrushData;
+
+/* Note: the actual cache is stored in the memory allocated right after this struct... */
+typedef struct SculptDoBrushSmoothGridDataChunk {
+	size_t tmpgrid_size;
+} SculptDoBrushSmoothGridDataChunk;
+
+static void do_smooth_brush_mesh_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
 {
-	Brush *brush = BKE_paint_brush(&sd->paint);
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Sculpt *sd = data->sd;
+	Brush *brush = data->brush;
+	const bool smooth_mask = data->smooth_mask;
+	float bstrength = data->strength;
+
 	PBVHVertexIter vd;
 	SculptBrushTest test;
-	
+
 	CLAMP(bstrength, 0.0f, 1.0f);

 	sculpt_brush_test_init(ss, &test);

-	BKE_pbvh_vertex_iter_begin(ss->pbvh, node, vd, PBVH_ITER_UNIQUE)
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
 	{
 		if (sculpt_brush_test(&test, vd.co)) {
-			const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-			                                            vd.no, vd.fno,
-			                                            smooth_mask ? 0 : (vd.mask ? *vd.mask : 0.0f));
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno,
+			                       smooth_mask ? 0.0f : (vd.mask ? *vd.mask : 0.0f));
 			if (smooth_mask) {
 				float val = neighbor_average_mask(ss, vd.vert_indices[vd.i]) - *vd.mask;
 				val *= fade * bstrength;
 				*vd.mask += val;
-				CLAMP(*vd.mask, 0, 1);
+				CLAMP(*vd.mask, 0.0f, 1.0f);
 			}
 			else {
 				float avg[3], val[3];

 				neighbor_average(ss, avg, vd.vert_indices[vd.i]);
 				sub_v3_v3v3(val, avg, vd.co);
-				mul_v3_fl(val, fade);

-				add_v3_v3(val, vd.co);
+				madd_v3_v3v3fl(val, vd.co, val, fade);

 				sculpt_clip(sd, ss, vd.co, val);
 			}
@@ -1569,36 +1599,40 @@ static void do_mesh_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *node,
 	BKE_pbvh_vertex_iter_end;
 }

-static void do_bmesh_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *node, float bstrength, int smooth_mask)
+static void do_smooth_brush_bmesh_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
 {
-	Brush *brush = BKE_paint_brush(&sd->paint);
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Sculpt *sd = data->sd;
+	Brush *brush = data->brush;
+	const bool smooth_mask = data->smooth_mask;
+	float bstrength = data->strength;
+
 	PBVHVertexIter vd;
 	SculptBrushTest test;
-	
+
 	CLAMP(bstrength, 0.0f, 1.0f);

 	sculpt_brush_test_init(ss, &test);

-	BKE_pbvh_vertex_iter_begin(ss->pbvh, node, vd, PBVH_ITER_UNIQUE)
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
 	{
 		if (sculpt_brush_test(&test, vd.co)) {
-			const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-			                                            vd.no, vd.fno,
-			                                            smooth_mask ? 0 : *vd.mask);
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, smooth_mask ? 0.0f : *vd.mask);
 			if (smooth_mask) {
 				float val = bmesh_neighbor_average_mask(vd.bm_vert, vd.cd_vert_mask_offset) - *vd.mask;
 				val *= fade * bstrength;
 				*vd.mask += val;
-				CLAMP(*vd.mask, 0, 1);
+				CLAMP(*vd.mask, 0.0f, 1.0f);
 			}
 			else {
 				float avg[3], val[3];

 				bmesh_neighbor_average(avg, vd.bm_vert);
 				sub_v3_v3v3(val, avg, vd.co);
-				mul_v3_fl(val, fade);

-				add_v3_v3(val, vd.co);
+				madd_v3_v3v3fl(val, vd.co, val, fade);

 				sculpt_clip(sd, ss, vd.co, val);
 			}
@@ -1610,74 +1644,77 @@ static void do_bmesh_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *node,
 	BKE_pbvh_vertex_iter_end;
 }

-static void do_multires_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *node,
-                                     float bstrength, int smooth_mask)
+static void do_smooth_brush_multires_task_cb(void *userdata, void *userdata_chunk, int n)
 {
-	Brush *brush = BKE_paint_brush(&sd->paint);
+	SculptDoBrushData *data = userdata;
+	SculptDoBrushSmoothGridDataChunk *data_chunk = userdata_chunk;
+	SculptSession *ss = data->ob->sculpt;
+	Sculpt *sd = data->sd;
+	Brush *brush = data->brush;
+	const bool smooth_mask = data->smooth_mask;
+	float bstrength = data->strength;
+
 	SculptBrushTest test;
-	CCGElem **griddata, *data;
+	CCGElem **griddata, *gddata;
 	CCGKey key;
-	float (*tmpgrid_co)[3], (*tmprow_co)[3];
-	float *tmpgrid_mask, *tmprow_mask;
-	int v1, v2, v3, v4;
-	int thread_num;
+
+	float (*tmpgrid_co)[3] = NULL;
+	float tmprow_co[2][3];
+	float *tmpgrid_mask = NULL;
+	float tmprow_mask[2];
+
 	BLI_bitmap * const *grid_hidden;
-	int *grid_indices, totgrid, gridsize, i, x, y;
+	int *grid_indices, totgrid, gridsize;
+	int i, x, y;

 	sculpt_brush_test_init(ss, &test);

 	CLAMP(bstrength, 0.0f, 1.0f);

-	BKE_pbvh_node_get_grids(ss->pbvh, node, &grid_indices, &totgrid,
-	                        NULL, &gridsize, &griddata);
+	BKE_pbvh_node_get_grids(ss->pbvh, data->nodes[n], &grid_indices, &totgrid, NULL, &gridsize, &griddata);
 	BKE_pbvh_get_grid_key(ss->pbvh, &key);

 	grid_hidden = BKE_pbvh_grid_hidden(ss->pbvh);

-#ifdef _OPENMP
-	thread_num = omp_get_thread_num();
-#else
-	thread_num = 0;
-#endif
-	tmpgrid_co = ss->cache->tmpgrid_co[thread_num];
-	tmprow_co = ss->cache->tmprow_co[thread_num];
-	tmpgrid_mask = ss->cache->tmpgrid_mask[thread_num];
-	tmprow_mask = ss->cache->tmprow_mask[thread_num];
+	if (smooth_mask)
+		tmpgrid_mask = (void *)(data_chunk + 1);
+	else
+		tmpgrid_co = (void *)(data_chunk + 1);

-	for (i = 0; i < totgrid; ++i) {
+	for (i = 0; i < totgrid; i++) {
 		int gi = grid_indices[i];
 		const BLI_bitmap *gh = grid_hidden[gi];
-		data = griddata[gi];
+		gddata = griddata[gi];

 		if (smooth_mask)
-			memset(tmpgrid_mask, 0, sizeof(float) * gridsize * gridsize);
+			memset(tmpgrid_mask, 0, data_chunk->tmpgrid_size);
 		else
-			memset(tmpgrid_co, 0, sizeof(float) * 3 * gridsize * gridsize);
+			memset(tmpgrid_co, 0, data_chunk->tmpgrid_size);

 		for (y = 0; y < gridsize - 1; y++) {
-			v1 = y * gridsize;
+			const int v = y * gridsize;
 			if (smooth_mask) {
-				tmprow_mask[0] = (*CCG_elem_offset_mask(&key, data, v1) +
-				                  *CCG_elem_offset_mask(&key, data, v1 + gridsize));
+				tmprow_mask[0] = (*CCG_elem_offset_mask(&key, gddata, v) +
+				                  *CCG_elem_offset_mask(&key, gddata, v + gridsize));
 			}
 			else {
 				add_v3_v3v3(tmprow_co[0],
-				            CCG_elem_offset_co(&key, data, v1),
-				            CCG_elem_offset_co(&key, data, v1 + gridsize));
+				            CCG_elem_offset_co(&key, gddata, v),
+				            CCG_elem_offset_co(&key, gddata, v + gridsize));
 			}

 			for (x = 0; x < gridsize - 1; x++) {
-				v1 = x + y * gridsize;
-				v2 = v1 + 1;
-				v3 = v1 + gridsize;
-				v4 = v3 + 1;
+				const int v1 = x + y * gridsize;
+				const int v2 = v1 + 1;
+				const int v3 = v1 + gridsize;
+				const int v4 = v3 + 1;

 				if (smooth_mask) {
 					float tmp;

-					tmprow_mask[x + 1] = (*CCG_elem_offset_mask(&key, data, v2) +
-					                      *CCG_elem_offset_mask(&key, data, v4));
-					tmp = tmprow_mask[x + 1] + tmprow_mask[x];
+					tmprow_mask[(x + 1) % 2] = (*CCG_elem_offset_mask(&key, gddata, v2) +
+					                            *CCG_elem_offset_mask(&key, gddata, v4));
+					tmp = tmprow_mask[(x + 1) % 2] + tmprow_mask[x % 2];

 					tmpgrid_mask[v1] += tmp;
 					tmpgrid_mask[v2] += tmp;
@@ -1687,10 +1724,10 @@ static void do_multires_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *no
 				else {
 					float tmp[3];

-					add_v3_v3v3(tmprow_co[x + 1],
-					            CCG_elem_offset_co(&key, data, v2),
-					            CCG_elem_offset_co(&key, data, v4));
-					add_v3_v3v3(tmp, tmprow_co[x + 1], tmprow_co[x]);
+					add_v3_v3v3(tmprow_co[(x + 1) % 2],
+					            CCG_elem_offset_co(&key, gddata, v2),
+					            CCG_elem_offset_co(&key, gddata, v4));
+					add_v3_v3v3(tmp, tmprow_co[(x + 1) % 2], tmprow_co[x % 2]);

 					add_v3_v3(tmpgrid_co[v1], tmp);
 					add_v3_v3(tmpgrid_co[v2], tmp);
@@ -1701,49 +1738,43 @@ static void do_multires_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *no
 		}

 		/* blend with existing coordinates */
-		for (y = 0; y < gridsize; ++y) {
-			for (x = 0; x < gridsize; ++x) {
+		for (y = 0; y < gridsize; y++) {
+			for (x = 0; x < gridsize; x++) {
 				float *co;
 				const float *fno;
 				float *mask;
-				int index;
+				const int index = y * gridsize + x;

 				if (gh) {
-					if (BLI_BITMAP_TEST(gh, y * gridsize + x))
+					if (BLI_BITMAP_TEST(gh, index))
 						continue;
 				}

-				index = x + y * gridsize;
-				co = CCG_elem_offset_co(&key, data, index);
-				fno = CCG_elem_offset_no(&key, data, index);
-				mask = CCG_elem_offset_mask(&key, data, index);
+				co = CCG_elem_offset_co(&key, gddata, index);
+				fno = CCG_elem_offset_no(&key, gddata, index);
+				mask = CCG_elem_offset_mask(&key, gddata, index);

 				if (sculpt_brush_test(&test, co)) {
-					const float strength_mask = (smooth_mask ? 0 : *mask);
-					const float fade = bstrength * tex_strength(ss, brush, co, test.dist,
-					                                            NULL, fno, strength_mask);
-					float n = 1.0f / 16.0f;
-					
+					const float strength_mask = (smooth_mask ? 0.0f : *mask);
+					const float fade = bstrength * tex_strength(ss, brush, co, test.dist, NULL, fno, strength_mask);
+					float f = 1.0f / 16.0f;
+
 					if (x == 0 || x == gridsize - 1)
-						n *= 2;
-					
+						f *= 2.0f;
+
 					if (y == 0 || y == gridsize - 1)
-						n *= 2;
-					
+						f *= 2.0f;
+
 					if (smooth_mask) {
-						*mask += ((tmpgrid_mask[x + y * gridsize] * n) - *mask) * fade;
+						*mask += ((tmpgrid_mask[index] * f) - *mask) * fade;
 					}
 					else {
-						float *avg, val[3];
-
-						avg = tmpgrid_co[x + y * gridsize];
-
-						mul_v3_fl(avg, n);
+						float *avg = tmpgrid_co[index];
+						float val[3];

+						mul_v3_fl(avg, f);
 						sub_v3_v3v3(val, avg, co);
-						mul_v3_fl(val, fade);
-
-						add_v3_v3(val, co);
+						madd_v3_v3v3fl(val, co, val, fade);

 						sculpt_clip(sd, ss, co, val);
 					}
@@ -1753,17 +1784,19 @@ static void do_multires_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *no
 	}
 }

-static void smooth(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode,
-                   float bstrength, int smooth_mask)
+static void smooth(
+        Sculpt *sd, Object *ob, PBVHNode **nodes, const int totnode, float bstrength, const bool smooth_mask)
 {
 	SculptSession *ss = ob->sculpt;
+	Brush *brush = BKE_paint_brush(&sd->paint);
+
 	const int max_iterations = 4;
 	const float fract = 1.0f / max_iterations;
 	PBVHType type = BKE_pbvh_type(ss->pbvh);
-	int iteration, n, count;
+	int iteration, count;
 	float last;

-	CLAMP(bstrength, 0, 1);
+	CLAMP(bstrength, 0.0f, 1.0f);

 	count = (int)(bstrength * max_iterations);
 	last  = max_iterations * (bstrength - count * fract);
@@ -1774,23 +1807,44 @@ static void smooth(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode,
 	}

 	for (iteration = 0; iteration <= count; ++iteration) {
-		float strength = (iteration != count) ? 1.0f : last;
+		const float strength = (iteration != count) ? 1.0f : last;

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-		for (n = 0; n < totnode; n++) {
-			switch (type) {
-				case PBVH_GRIDS:
-					do_multires_smooth_brush(sd, ss, nodes[n], strength,
-					                         smooth_mask);
-					break;
-				case PBVH_FACES:
-					do_mesh_smooth_brush(sd, ss, nodes[n], strength,
-					                     smooth_mask);
-					break;
-				case PBVH_BMESH:
-					do_bmesh_smooth_brush(sd, ss, nodes[n], strength, smooth_mask);
-					break;
+		SculptDoBrushData data = {
+		    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+		    .smooth_mask = smooth_mask, .strength = strength,
+		};
+
+		switch (type) {
+			case PBVH_GRIDS:
+			{
+				int gridsize;
+				size_t size;
+				SculptDoBrushSmoothGridDataChunk *data_chunk;
+
+				BKE_pbvh_node_get_grids(ss->pbvh, NULL, NULL, NULL, NULL, &gridsize, NULL);
+				size = (size_t)gridsize;
+				size = sizeof(float) * size * size * (smooth_mask ? 1 : 3);
+				data_chunk = MEM_mallocN(sizeof(*data_chunk) + size, __func__);
+				data_chunk->tmpgrid_size = size;
+				size += sizeof(*data_chunk);
+
+				BLI_task_parallel_range_ex(
+				            0, totnode, &data, data_chunk, size, do_smooth_brush_multires_task_cb,
+				            ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+
+				MEM_freeN(data_chunk);
+				break;
 			}
+			case PBVH_FACES:
+				BLI_task_parallel_range_ex(
+				            0, totnode, &data, NULL, 0, do_smooth_brush_mesh_task_cb,
+				            ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+				break;
+			case PBVH_BMESH:
+				BLI_task_parallel_range_ex(
+				            0, totnode, &data, NULL, 0, do_smooth_brush_bmesh_task_cb,
+				            ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+				break;
 		}

 		if (ss->multires)
@@ -1804,36 +1858,44 @@ static void do_smooth_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod
 	smooth(sd, ob, nodes, totnode, ss->cache->bstrength, false);
 }

+static void do_mask_brush_draw_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float bstrength = ss->cache->bstrength;
+
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			const float fade = tex_strength(ss, brush, vd.co, test.dist, vd.no, vd.fno, 0.0f);
+
+			(*vd.mask) += fade * bstrength;
+			CLAMP(*vd.mask, 0, 1);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
+		}
+		BKE_pbvh_vertex_iter_end;
+	}
+}
+
 static void do_mask_brush_draw(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
-	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
-	int n;

 	/* threaded loop over nodes */
-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	};

-		sculpt_brush_test_init(ss, &test);
-
-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				float fade = tex_strength(ss, brush, vd.co, test.dist,
-				                          vd.no, vd.fno, 0);
-
-				(*vd.mask) += fade * bstrength;
-				CLAMP(*vd.mask, 0, 1);
-
-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
-			BKE_pbvh_vertex_iter_end;
-		}
-	}
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_mask_brush_draw_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
 }

 static void do_mask_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -1851,13 +1913,42 @@ static void do_mask_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 	}
 }

+static void do_draw_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *offset = data->offset;
+
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			/* offset vertex */
+			const float fade = tex_strength(ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+			mul_v3_v3fl(proxy[vd.i], offset, fade);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
+		}
+	}
+	BKE_pbvh_vertex_iter_end;
+}
+
 static void do_draw_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
 	float offset[3];
-	float bstrength = ss->cache->bstrength;
-	int n;
+	const float bstrength = ss->cache->bstrength;

 	/* offset with as much as possible factored in already */
 	mul_v3_v3fl(offset, ss->cache->sculpt_normal_symm, ss->cache->radius);
@@ -1865,31 +1956,57 @@ static void do_draw_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 	mul_v3_fl(offset, bstrength);

 	/* threaded loop over nodes */
-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .offset = offset,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_draw_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_crease_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	SculptProjectVector *spvc = data->spvc;
+	const float flippedbstrength = data->flippedbstrength;
+	const float *offset = data->offset;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				/* offset vertex */
-				float fade = tex_strength(ss, brush, vd.co, test.dist, vd.no,
-				                          vd.fno, vd.mask ? *vd.mask : 0.0f);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];

-				mul_v3_v3fl(proxy[vd.i], offset, fade);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			/* offset vertex */
+			const float fade = tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+			float val1[3];
+			float val2[3];
+
+			/* first we pinch */
+			sub_v3_v3v3(val1, test.location, vd.co);
+			mul_v3_fl(val1, fade * flippedbstrength);
+
+			sculpt_project_v3(spvc, val1, val1);
+
+			/* then we draw */
+			mul_v3_v3fl(val2, offset, fade);
+
+			add_v3_v3v3(proxy[vd.i], val1, val2);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_crease_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -1901,7 +2018,6 @@ static void do_crease_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod
 	float bstrength = ss->cache->bstrength;
 	float flippedbstrength, crease_correction;
 	float brush_alpha;
-	int n;

 	SculptProjectVector spvc;

@@ -1926,86 +2042,100 @@ static void do_crease_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod
 	sculpt_project_v3_cache_init(&spvc, ss->cache->sculpt_normal_symm);

 	/* threaded loop over nodes */
-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .spvc = &spvc, .offset = offset, .flippedbstrength = flippedbstrength,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_crease_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_pinch_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				/* offset vertex */
-				const float fade = tex_strength(ss, brush, vd.co, test.dist,
-				                                vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
-				float val1[3];
-				float val2[3];
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-				/* first we pinch */
-				sub_v3_v3v3(val1, test.location, vd.co);
-				mul_v3_fl(val1, fade * flippedbstrength);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-				sculpt_project_v3(&spvc, val1, val1);
+	sculpt_brush_test_init(ss, &test);

-				/* then we draw */
-				mul_v3_v3fl(val2, offset, fade);
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+			float val[3];

-				add_v3_v3v3(proxy[vd.i], val1, val2);
+			sub_v3_v3v3(val, test.location, vd.co);
+			mul_v3_v3fl(proxy[vd.i], val, fade);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_pinch_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
-	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
-	int n;

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_pinch_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_grab_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *grab_delta = data->grab_delta;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist, vd.no,
-				                                      vd.fno, vd.mask ? *vd.mask : 0.0f);
-				float val[3];
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	SculptOrigVertData orig_data;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-				sub_v3_v3v3(val, test.location, vd.co);
-				mul_v3_v3fl(proxy[vd.i], val, fade);
+	sculpt_orig_vert_data_init(&orig_data, data->ob, data->nodes[n]);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		sculpt_orig_vert_data_update(&orig_data, &vd);
+
+		if (sculpt_brush_test(&test, orig_data.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, orig_data.co, test.dist, orig_data.no, NULL, vd.mask ? *vd.mask : 0.0f);
+
+			mul_v3_v3fl(proxy[vd.i], grab_delta, fade);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_grab_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
 	float grab_delta[3];
-	int n;
 	float len;

 	copy_v3_v3(grab_delta, ss->cache->grab_delta_symmetry);
@@ -2018,87 +2148,104 @@ static void do_grab_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 		add_v3_v3(grab_delta, ss->cache->sculpt_normal_symm);
 	}

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		SculptOrigVertData orig_data;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .grab_delta = grab_delta,
+	};

-		sculpt_orig_vert_data_init(&orig_data, ob, nodes[n]);
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_grab_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+static void do_nudge_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *cono = data->cono;

-		sculpt_brush_test_init(ss, &test);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			sculpt_orig_vert_data_update(&orig_data, &vd);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-			if (sculpt_brush_test(&test, orig_data.co)) {
-				const float fade = bstrength * tex_strength(ss, brush,
-				                                            orig_data.co,
-				                                            test.dist,
-				                                            orig_data.no,
-				                                            NULL, vd.mask ? *vd.mask : 0.0f);
+	sculpt_brush_test_init(ss, &test);

-				mul_v3_v3fl(proxy[vd.i], grab_delta, fade);
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
+			                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+			mul_v3_v3fl(proxy[vd.i], cono, fade);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_nudge_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
 	float grab_delta[3];
 	float tmp[3], cono[3];
-	int n;

 	copy_v3_v3(grab_delta, ss->cache->grab_delta_symmetry);

 	cross_v3_v3v3(tmp, ss->cache->sculpt_normal_symm, grab_delta);
 	cross_v3_v3v3(cono, tmp, ss->cache->sculpt_normal_symm);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .cono = cono,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_nudge_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_snake_hook_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *grab_delta = data->grab_delta;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-				                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-				mul_v3_v3fl(proxy[vd.i], cono, fade);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+			mul_v3_v3fl(proxy[vd.i], grab_delta, fade);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_snake_hook_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
+	const float bstrength = ss->cache->bstrength;
 	float grab_delta[3];
-	int n;
 	float len;

 	copy_v3_v3(grab_delta, ss->cache->grab_delta_symmetry);
@@ -2114,234 +2261,260 @@ static void do_snake_hook_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int to
 		add_v3_v3(grab_delta, ss->cache->sculpt_normal_symm);
 	}

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .grab_delta = grab_delta,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_snake_hook_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_thumb_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *cono = data->cono;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-				                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	SculptOrigVertData orig_data;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-				mul_v3_v3fl(proxy[vd.i], grab_delta, fade);
+	sculpt_orig_vert_data_init(&orig_data, data->ob, data->nodes[n]);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		sculpt_orig_vert_data_update(&orig_data, &vd);
+
+		if (sculpt_brush_test(&test, orig_data.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, orig_data.co, test.dist, orig_data.no, NULL, vd.mask ? *vd.mask : 0.0f);
+
+			mul_v3_v3fl(proxy[vd.i], cono, fade);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_thumb_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
 	float grab_delta[3];
 	float tmp[3], cono[3];
-	int n;

 	copy_v3_v3(grab_delta, ss->cache->grab_delta_symmetry);

 	cross_v3_v3v3(tmp, ss->cache->sculpt_normal_symm, grab_delta);
 	cross_v3_v3v3(cono, tmp, ss->cache->sculpt_normal_symm);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		SculptOrigVertData orig_data;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .cono = cono,
+	};

-		sculpt_orig_vert_data_init(&orig_data, ob, nodes[n]);
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_thumb_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+static void do_rotate_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float angle = data->angle;

-		sculpt_brush_test_init(ss, &test);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	SculptOrigVertData orig_data;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			sculpt_orig_vert_data_update(&orig_data, &vd);
+	sculpt_orig_vert_data_init(&orig_data, data->ob, data->nodes[n]);

-			if (sculpt_brush_test(&test, orig_data.co)) {
-				const float fade = bstrength * tex_strength(ss, brush,
-				                                            orig_data.co,
-				                                            test.dist,
-				                                            orig_data.no,
-				                                            NULL, vd.mask ? *vd.mask : 0.0f);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-				mul_v3_v3fl(proxy[vd.i], cono, fade);
+	sculpt_brush_test_init(ss, &test);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		sculpt_orig_vert_data_update(&orig_data, &vd);
+
+		if (sculpt_brush_test(&test, orig_data.co)) {
+			float vec[3], rot[3][3];
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, orig_data.co, test.dist, orig_data.no, NULL, vd.mask ? *vd.mask : 0.0f);
+
+			sub_v3_v3v3(vec, orig_data.co, ss->cache->location);
+			axis_angle_normalized_to_mat3(rot, ss->cache->sculpt_normal_symm, angle * fade);
+			mul_v3_m3v3(proxy[vd.i], rot, vec);
+			add_v3_v3(proxy[vd.i], ss->cache->location);
+			sub_v3_v3(proxy[vd.i], orig_data.co);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_rotate_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
-	int n;
+
 	static const int flip[8] = { 1, -1, -1, 1, -1, 1, 1, -1 };
-	float angle = ss->cache->vertex_rotation * flip[ss->cache->mirror_symmetry_pass];
+	const float angle = ss->cache->vertex_rotation * flip[ss->cache->mirror_symmetry_pass];

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		SculptOrigVertData orig_data;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .angle = angle,
+	};

-		sculpt_orig_vert_data_init(&orig_data, ob, nodes[n]);
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_rotate_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+static void do_layer_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Sculpt *sd = data->sd;
+	Brush *brush = data->brush;
+	const float *offset = data->offset;

-		sculpt_brush_test_init(ss, &test);
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	SculptOrigVertData orig_data;
+	float *layer_disp;
+	const float bstrength = ss->cache->bstrength;
+	const float lim = (bstrength < 0) ? -data->brush->height : data->brush->height;
+	/* XXX: layer brush needs conversion to proxy, but it's more complicated */
+	/* proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co; */

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			sculpt_orig_vert_data_update(&orig_data, &vd);
+	sculpt_orig_vert_data_init(&orig_data, data->ob, data->nodes[n]);

-			if (sculpt_brush_test(&test, orig_data.co)) {
-				float vec[3], rot[3][3];
-				const float fade = bstrength * tex_strength(ss, brush,
-				                                            orig_data.co,
-				                                            test.dist,
-				                                            orig_data.no,
-				                                            NULL, vd.mask ? *vd.mask : 0.0f);
+	/* Why does this have to be thread-protected? */
+	BLI_mutex_lock(&data->mutex);
+	layer_disp = BKE_pbvh_node_layer_disp_get(ss->pbvh, data->nodes[n]);
+	BLI_mutex_unlock(&data->mutex);

-				sub_v3_v3v3(vec, orig_data.co, ss->cache->location);
-				axis_angle_normalized_to_mat3(rot, ss->cache->sculpt_normal_symm, angle * fade);
-				mul_v3_m3v3(proxy[vd.i], rot, vec);
-				add_v3_v3(proxy[vd.i], ss->cache->location);
-				sub_v3_v3(proxy[vd.i], orig_data.co);
+	sculpt_brush_test_init(ss, &test);

-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		sculpt_orig_vert_data_update(&orig_data, &vd);
+
+		if (sculpt_brush_test(&test, orig_data.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+			float *disp = &layer_disp[vd.i];
+			float val[3];
+
+			*disp += fade;
+
+			/* Don't let the displacement go past the limit */
+			if ((lim < 0.0f && *disp < lim) || (lim >= 0.0f && *disp > lim))
+				*disp = lim;
+
+			mul_v3_v3fl(val, offset, *disp);
+
+			if (!ss->multires && !ss->bm && ss->layer_co && (brush->flag & BRUSH_PERSISTENT)) {
+				int index = vd.vert_indices[vd.i];
+
+				/* persistent base */
+				add_v3_v3(val, ss->layer_co[index]);
 			}
+			else {
+				add_v3_v3(val, orig_data.co);
+			}
+
+			sculpt_clip(sd, ss, vd.co, val);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_layer_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
 	float offset[3];
-	float lim = brush->height;
-	int n;
-
-	if (bstrength < 0)
-		lim = -lim;

 	mul_v3_v3v3(offset, ss->cache->scale, ss->cache->sculpt_normal_symm);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		SculptOrigVertData orig_data;
-		float *layer_disp;
-		/* XXX: layer brush needs conversion to proxy but its more complicated */
-		/* proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co; */
-		
-		sculpt_orig_vert_data_init(&orig_data, ob, nodes[n]);
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .offset = offset,
+	};
+	BLI_mutex_init(&data.mutex);

-#pragma omp critical
-		{
-			layer_disp = BKE_pbvh_node_layer_disp_get(ss->pbvh, nodes[n]);
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_layer_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+
+	BLI_mutex_end(&data.mutex);
+}
+
+static void do_inflate_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;
+
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test(&test, vd.co)) {
+			const float fade = bstrength * tex_strength(
+			                       ss, brush, vd.co, test.dist, vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+			float val[3];
+
+			if (vd.fno)
+				copy_v3_v3(val, vd.fno);
+			else
+				normal_short_to_float_v3(val, vd.no);
+
+			mul_v3_fl(val, fade * ss->cache->radius);
+			mul_v3_v3v3(proxy[vd.i], val, ss->cache->scale);
+
+			if (vd.mvert)
+				vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 		}
-		
-		sculpt_brush_test_init(ss, &test);
-
-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			sculpt_orig_vert_data_update(&orig_data, &vd);
-
-			if (sculpt_brush_test(&test, orig_data.co)) {
-				const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-				                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
-				float *disp = &layer_disp[vd.i];
-				float val[3];
-
-				*disp += fade;
-
-				/* Don't let the displacement go past the limit */
-				if ((lim < 0 && *disp < lim) || (lim >= 0 && *disp > lim))
-					*disp = lim;
-
-				mul_v3_v3fl(val, offset, *disp);
-
-				if (!ss->multires && !ss->bm && ss->layer_co && (brush->flag & BRUSH_PERSISTENT)) {
-					int index = vd.vert_indices[vd.i];
-
-					/* persistent base */
-					add_v3_v3(val, ss->layer_co[index]);
-				}
-				else {
-					add_v3_v3(val, orig_data.co);
-				}
-
-				sculpt_clip(sd, ss, vd.co, val);
-
-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
-		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_inflate_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
-	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);
-	float bstrength = ss->cache->bstrength;
-	int n;

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
-
-		sculpt_brush_test_init(ss, &test);
-
-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test(&test, vd.co)) {
-				const float fade = bstrength * tex_strength(ss, brush, vd.co, test.dist,
-				                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
-				float val[3];
-
-				if (vd.fno) copy_v3_v3(val, vd.fno);
-				else normal_short_to_float_v3(val, vd.no);
-				
-				mul_v3_fl(val, fade * ss->cache->radius);
-				mul_v3_v3v3(proxy[vd.i], val, ss->cache->scale);
-
-				if (vd.mvert)
-					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-			}
-		}
-		BKE_pbvh_vertex_iter_end;
-	}
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_inflate_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
 }

 static void calc_sculpt_plane(
@@ -2469,23 +2642,59 @@ static float get_offset(Sculpt *sd, SculptSession *ss)
 	return rv;
 }

+static void do_flatten_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *area_no = data->area_no;
+	const float *area_co = data->area_co;
+
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;
+
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test_sq(&test, vd.co)) {
+			float intr[3];
+			float val[3];
+
+			point_plane_project(intr, vd.co, area_no, area_co);
+
+			sub_v3_v3v3(val, intr, vd.co);
+
+			if (plane_trim(ss->cache, brush, val)) {
+				const float fade = bstrength * tex_strength(
+				                       ss, brush, vd.co, sqrtf(test.dist), vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+				mul_v3_v3fl(proxy[vd.i], val, fade);
+
+				if (vd.mvert)
+					vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
+			}
+		}
+	}
+	BKE_pbvh_vertex_iter_end;
+}
+
 static void do_flatten_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 {
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);

-	float bstrength = ss->cache->bstrength;
 	const float radius = ss->cache->radius;

 	float area_no[3];
 	float area_co[3];

 	float offset = get_offset(sd, ss);
-
 	float displace;
-
-	int n;
-
 	float temp[3];

 	calc_sculpt_plane(sd, ob, nodes, totnode, area_no, area_co);
@@ -2496,19 +2705,37 @@ static void do_flatten_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totno
 	mul_v3_fl(temp, displace);
 	add_v3_v3(area_co, temp);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .area_no = area_no, .area_co = area_co,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_flatten_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_clay_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *area_no = data->area_no;
+	const float *area_co = data->area_co;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test_sq(&test, vd.co)) {
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const bool flip = (ss->cache->bstrength < 0);
+	const float bstrength = flip ? -ss->cache->bstrength : ss->cache->bstrength;
+
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;
+
+	sculpt_brush_test_init(ss, &test);
+
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test_sq(&test, vd.co)) {
+			if (plane_point_side_flip(vd.co, area_no, area_co, flip)) {
 				float intr[3];
 				float val[3];

@@ -2517,8 +2744,10 @@ static void do_flatten_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totno
 				sub_v3_v3v3(val, intr, vd.co);

 				if (plane_trim(ss->cache, brush, val)) {
-					const float fade = bstrength * tex_strength(ss, brush, vd.co, sqrtf(test.dist),
-					                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+					/* note, the normal from the vertices is ignored,
+					 * causes glitch with planes, see: T44390 */
+					const float fade = bstrength * tex_strength(
+					                       ss, brush, vd.co, sqrtf(test.dist), vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);

 					mul_v3_v3fl(proxy[vd.i], val, fade);

@@ -2527,8 +2756,8 @@ static void do_flatten_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totno
 				}
 			}
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_clay_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -2536,30 +2765,18 @@ static void do_clay_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);

-	float bstrength = ss->cache->bstrength;
-	float radius    = ss->cache->radius;
-	float offset    = get_offset(sd, ss);
-	
+	const bool flip = (ss->cache->bstrength < 0);
+	const float radius = flip ? -ss->cache->radius : ss->cache->radius;
+
+	float offset = get_offset(sd, ss);
 	float displace;

 	float area_no[3];
 	float area_co[3];
-
-	int n;
-
 	float temp[3];

-	bool flip;
-
 	calc_sculpt_plane(sd, ob, nodes, totnode, area_no, area_co);

-	flip = bstrength < 0;
-
-	if (flip) {
-		bstrength = -bstrength;
-		radius    = -radius;
-	}
-
 	displace = radius * (0.25f + offset);

 	mul_v3_v3v3(temp, area_no, ss->cache->scale);
@@ -2568,43 +2785,61 @@ static void do_clay_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)

 	/* add_v3_v3v3(p, ss->cache->location, area_no); */

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .area_no = area_no, .area_co = area_co,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_clay_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_clay_strips_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	float (*mat)[4] = data->mat;
+	const float *area_no_sp = data->area_no_sp;
+	const float *area_co = data->area_co;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test_sq(&test, vd.co)) {
-				if (plane_point_side_flip(vd.co, area_no, area_co, flip)) {
-					float intr[3];
-					float val[3];
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const bool flip = (ss->cache->bstrength < 0);
+	const float bstrength = flip ? -ss->cache->bstrength : ss->cache->bstrength;

-					point_plane_project(intr, vd.co, area_no, area_co);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-					sub_v3_v3v3(val, intr, vd.co);
+	sculpt_brush_test_init(ss, &test);

-					if (plane_trim(ss->cache, brush, val)) {
-						/* note, the normal from the vertices is ignored,
-						 * causes glitch with planes, see: T44390 */
-						const float fade = bstrength * tex_strength(ss, brush, vd.co, sqrtf(test.dist),
-						                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test_cube(&test, vd.co, mat)) {
+			if (plane_point_side_flip(vd.co, area_no_sp, area_co, flip)) {
+				float intr[3];
+				float val[3];

-						mul_v3_v3fl(proxy[vd.i], val, fade);
+				point_plane_project(intr, vd.co, area_no_sp, area_co);

-						if (vd.mvert)
-							vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-					}
+				sub_v3_v3v3(val, intr, vd.co);
+
+				if (plane_trim(ss->cache, brush, val)) {
+					/* note, the normal from the vertices is ignored,
+					 * causes glitch with planes, see: T44390 */
+					const float fade = bstrength * tex_strength(
+					                       ss, brush, vd.co, ss->cache->radius * test.dist,
+					                       vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+					mul_v3_v3fl(proxy[vd.i], val, fade);
+
+					if (vd.mvert)
+						vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 				}
 			}
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_clay_strips_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -2612,25 +2847,20 @@ static void do_clay_strips_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int t
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);

-	float bstrength = ss->cache->bstrength;
-	float radius    = ss->cache->radius;
-	float offset    = get_offset(sd, ss);
-	
-	float displace;
+	const bool flip = (ss->cache->bstrength < 0);
+	const float radius    = flip ? -ss->cache->radius : ss->cache->radius;
+	const float offset    = get_offset(sd, ss);
+	const float displace  = radius * (0.25f + offset);;

 	float area_no_sp[3];  /* the sculpt-plane normal (whatever its set to) */
 	float area_no[3];     /* geometry normal */
 	float area_co[3];

-	int n;
-
 	float temp[3];
 	float mat[4][4];
 	float scale[4][4];
 	float tmat[4][4];

-	bool flip;
-
 	calc_sculpt_plane(sd, ob, nodes, totnode, area_no_sp, area_co);

 	if (brush->sculpt_plane != SCULPT_DISP_DIR_AREA || (brush->flag & BRUSH_ORIGINAL_NORMAL))
@@ -2642,15 +2872,6 @@ static void do_clay_strips_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int t
 	if (ss->cache->first_time)
 		return;

-	flip = bstrength < 0;
-
-	if (flip) {
-		bstrength = -bstrength;
-		radius    = -radius;
-	}
-
-	displace = radius * (0.25f + offset);
-
 	mul_v3_v3v3(temp, area_no_sp, ss->cache->scale);
 	mul_v3_fl(temp, displace);
 	add_v3_v3(area_co, temp);
@@ -2671,44 +2892,57 @@ static void do_clay_strips_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int t
 	mul_m4_m4m4(tmat, mat, scale);
 	invert_m4_m4(mat, tmat);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .area_no_sp = area_no_sp, .area_co = area_co, .mat = mat,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_clay_strips_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_fill_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *area_no = data->area_no;
+	const float *area_co = data->area_co;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test_cube(&test, vd.co, mat)) {
-				if (plane_point_side_flip(vd.co, area_no_sp, area_co, flip)) {
-					float intr[3];
-					float val[3];
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-					point_plane_project(intr, vd.co, area_no_sp, area_co);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-					sub_v3_v3v3(val, intr, vd.co);
+	sculpt_brush_test_init(ss, &test);

-					if (plane_trim(ss->cache, brush, val)) {
-						/* note, the normal from the vertices is ignored,
-						 * causes glitch with planes, see: T44390 */
-						const float fade = bstrength * tex_strength(ss, brush, vd.co,
-						                                            ss->cache->radius * test.dist,
-						                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test_sq(&test, vd.co)) {
+			if (plane_point_side(vd.co, area_no, area_co)) {
+				float intr[3];
+				float val[3];

-						mul_v3_v3fl(proxy[vd.i], val, fade);
+				point_plane_project(intr, vd.co, area_no, area_co);

-						if (vd.mvert)
-							vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-					}
+				sub_v3_v3v3(val, intr, vd.co);
+
+				if (plane_trim(ss->cache, brush, val)) {
+					const float fade = bstrength * tex_strength(
+					                       ss, brush, vd.co, sqrtf(test.dist),
+					                       vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+					mul_v3_v3fl(proxy[vd.i], val, fade);
+
+					if (vd.mvert)
+						vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 				}
 			}
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_fill_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -2716,7 +2950,6 @@ static void do_fill_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);

-	float bstrength = ss->cache->bstrength;
 	const float radius = ss->cache->radius;

 	float area_no[3];
@@ -2725,8 +2958,6 @@ static void do_fill_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)

 	float displace;

-	int n;
-
 	float temp[3];

 	calc_sculpt_plane(sd, ob, nodes, totnode, area_no, area_co);
@@ -2737,42 +2968,56 @@ static void do_fill_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
 	mul_v3_fl(temp, displace);
 	add_v3_v3(area_co, temp);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .area_no = area_no, .area_co = area_co,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_fill_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
+}

-		sculpt_brush_test_init(ss, &test);
+static void do_scrape_brush_task_cb(void *userdata, void *UNUSED(userdata_chunk), int n)
+{
+	SculptDoBrushData *data = userdata;
+	SculptSession *ss = data->ob->sculpt;
+	Brush *brush = data->brush;
+	const float *area_no = data->area_no;
+	const float *area_co = data->area_co;

-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test_sq(&test, vd.co)) {
-				if (plane_point_side(vd.co, area_no, area_co)) {
-					float intr[3];
-					float val[3];
+	PBVHVertexIter vd;
+	SculptBrushTest test;
+	float (*proxy)[3];
+	const float bstrength = ss->cache->bstrength;

-					point_plane_project(intr, vd.co, area_no, area_co);
+	proxy = BKE_pbvh_node_add_proxy(ss->pbvh, data->nodes[n])->co;

-					sub_v3_v3v3(val, intr, vd.co);
+	sculpt_brush_test_init(ss, &test);

-					if (plane_trim(ss->cache, brush, val)) {
-						const float fade = bstrength * tex_strength(ss, brush, vd.co,
-						                                            sqrtf(test.dist),
-						                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+	BKE_pbvh_vertex_iter_begin(ss->pbvh, data->nodes[n], vd, PBVH_ITER_UNIQUE)
+	{
+		if (sculpt_brush_test_sq(&test, vd.co)) {
+			if (!plane_point_side(vd.co, area_no, area_co)) {
+				float intr[3];
+				float val[3];

-						mul_v3_v3fl(proxy[vd.i], val, fade);
+				point_plane_project(intr, vd.co, area_no, area_co);

-						if (vd.mvert)
-							vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-					}
+				sub_v3_v3v3(val, intr, vd.co);
+
+				if (plane_trim(ss->cache, brush, val)) {
+					const float fade = bstrength * tex_strength(
+					                       ss, brush, vd.co, sqrtf(test.dist), vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
+
+					mul_v3_v3fl(proxy[vd.i], val, fade);
+
+					if (vd.mvert)
+						vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
 				}
 			}
 		}
-		BKE_pbvh_vertex_iter_end;
 	}
+	BKE_pbvh_vertex_iter_end;
 }

 static void do_scrape_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode)
@@ -2780,7 +3025,6 @@ static void do_scrape_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod
 	SculptSession *ss = ob->sculpt;
 	Brush *brush = BKE_paint_brush(&sd->paint);

-	float bstrength = ss->cache->bstrength;
 	const float radius = ss->cache->radius;

 	float area_no[3];
@@ -2789,8 +3033,6 @@ static void do_scrape_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod

 	float displace;

-	int n;
-
 	float temp[3];

 	calc_sculpt_plane(sd, ob, nodes, totnode, area_no, area_co);
@@ -2801,42 +3043,13 @@ static void do_scrape_brush(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnod
 	mul_v3_fl(temp, displace);
 	add_v3_v3(area_co, temp);

-#pragma omp parallel for schedule(guided) if ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT)
-	for (n = 0; n < totnode; n++) {
-		PBVHVertexIter vd;
-		SculptBrushTest test;
-		float (*proxy)[3];
+	SculptDoBrushData data = {
+	    .sd = sd, .ob = ob, .brush = brush, .nodes = nodes,
+	    .area_no = area_no, .area_co = area_co,
+	};

-		proxy = BKE_pbvh_node_add_proxy(ss->pbvh, nodes[n])->co;
-
-		sculpt_brush_test_init(ss, &test);
-
-		BKE_pbvh_vertex_iter_begin(ss->pbvh, nodes[n], vd, PBVH_ITER_UNIQUE)
-		{
-			if (sculpt_brush_test_sq(&test, vd.co)) {
-				if (!plane_point_side(vd.co, area_no, area_co)) {
-					float intr[3];
-					float val[3];
-
-					point_plane_project(intr, vd.co, area_no, area_co);
-
-					sub_v3_v3v3(val, intr, vd.co);
-
-					if (plane_trim(ss->cache, brush, val)) {
-						const float fade = bstrength * tex_strength(ss, brush, vd.co,
-						                                            sqrtf(test.dist),
-						                                            vd.no, vd.fno, vd.mask ? *vd.mask : 0.0f);
-
-						mul_v3_v3fl(proxy[vd.i], val, fade);
-
-						if (vd.mvert)
-							vd.mvert->flag |= ME_VERT_PBVH_UPDATE;
-					}
-				}
-			}
-		}
-		BKE_pbvh_vertex_iter_end;
-	}
+	BLI_task_parallel_range_ex(0, totnode, &data, NULL, 0, do_scrape_brush_task_cb,
+	                           ((sd->flags & SCULPT_USE_OPENMP) && totnode > SCULPT_OMP_LIMIT), false);
 }

 static void do_gravity(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode, float bstrength)
@@ -3564,83 +3777,6 @@ static void sculpt_init_mirror_clipping(Object *ob, SculptSession *ss)
 	}
 }

-static void sculpt_omp_start(Sculpt *sd, SculptSession *ss)
-{
-	StrokeCache *cache = ss->cache;
-
-#ifdef _OPENMP
-
-#if defined(__APPLE__)
-	cache->init_num_threads = BLI_system_thread_count();
-#else
-	cache->init_num_threads = omp_get_max_threads();
-#endif
-	/* If using OpenMP then create a number of threads two times the
-	 * number of processor cores.
-	 * Justification: Empirically I've found that two threads per
-	 * processor gives higher throughput. */
-	if (sd->flags & SCULPT_USE_OPENMP) {
-#if defined(__APPLE__)
-		cache->num_threads = system_physical_thread_count();
-#else
-		cache->num_threads = 2 * omp_get_num_procs();
-#endif
-	}
-	else {
-		cache->num_threads = 1;
-	}
-	omp_set_num_threads(cache->num_threads);
-#else
-	(void)sd;
-	cache->num_threads = 1;
-#endif
-	if (ss->multires) {
-		int i, gridsize, array_mem_size;
-		BKE_pbvh_node_get_grids(ss->pbvh, NULL, NULL, NULL, NULL,
-		                        &gridsize, NULL);
-
-		array_mem_size = cache->num_threads * sizeof(void *);
-
-		cache->tmpgrid_co = MEM_mallocN(array_mem_size, "tmpgrid_co array");
-		cache->tmprow_co = MEM_mallocN(array_mem_size, "tmprow_co array");
-		cache->tmpgrid_mask = MEM_mallocN(array_mem_size, "tmpgrid_mask array");
-		cache->tmprow_mask = MEM_mallocN(array_mem_size, "tmprow_mask array");
-
-		for (i = 0; i < cache->num_threads; i++) {
-			const size_t row_size = sizeof(float) * gridsize;
-			const size_t co_row_size = 3 * row_size;
-
-			cache->tmprow_co[i] = MEM_mallocN(co_row_size, "tmprow_co");
-			cache->tmpgrid_co[i] = MEM_mallocN(co_row_size * gridsize, "tmpgrid_co");
-			cache->tmprow_mask[i] = MEM_mallocN(row_size, "tmprow_mask");
-			cache->tmpgrid_mask[i] = MEM_mallocN(row_size * gridsize, "tmpgrid_mask");
-		}
-	}
-}
-
-static void sculpt_omp_done(SculptSession *ss)
-{
-#ifdef _OPENMP
-	omp_set_num_threads(ss->cache->init_num_threads);
-#endif
-
-	if (ss->multires) {
-		int i;
-
-		for (i = 0; i < ss->cache->num_threads; i++) {
-			MEM_freeN(ss->cache->tmpgrid_co[i]);
-			MEM_freeN(ss->cache->tmprow_co[i]);
-			MEM_freeN(ss->cache->tmpgrid_mask[i]);
-			MEM_freeN(ss->cache->tmprow_mask[i]);
-		}
-
-		MEM_freeN(ss->cache->tmpgrid_co);
-		MEM_freeN(ss->cache->tmprow_co);
-		MEM_freeN(ss->cache->tmpgrid_mask);
-		MEM_freeN(ss->cache->tmprow_mask);
-	}
-}
-
 /* Initialize the stroke cache invariants from operator properties */
 static void sculpt_update_cache_invariants(bContext *C, Sculpt *sd, SculptSession *ss, wmOperator *op, const float mouse[2])
 {
@@ -3801,8 +3937,6 @@ static void sculpt_update_cache_invariants(bContext *C, Sculpt *sd, SculptSessio
 		cache->dial = BLI_dial_initialize(cache->initial_mouse, PIXEL_INPUT_THRESHHOLD);
 		
 #undef PIXEL_INPUT_THRESHHOLD
-	
-	sculpt_omp_start(sd, ss);
 }

 static void sculpt_update_brush_delta(UnifiedPaintSettings *ups, Object *ob, Brush *brush)
@@ -4344,8 +4478,6 @@ static void sculpt_stroke_done(const bContext *C, struct PaintStroke *UNUSED(str
 	SculptSession *ss = ob->sculpt;
 	Sculpt *sd = CTX_data_tool_settings(C)->sculpt;

-	sculpt_omp_done(ss);
-
 	/* Finished */
 	if (ss->cache) {
 		UnifiedPaintSettings *ups = &CTX_data_tool_settings(C)->unified_paint_settings;