An experiment in threading
Introduction
I was working on a toy simulation problem for a small contest: if two points are chosen uniformly at random inside a unit square and taken as the endpoints of a diameter of a circle, what is the probability that the circle falls entirely inside the square? I was not able to solve this theoretically, so I thought: let's have code solve it.
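The test itself is straightforward to state, and it is exactly the condition the code below checks: if the two points are $(x_1, y_1)$ and $(x_2, y_2)$, the circle has center $(x_c, y_c) = \left(\tfrac{x_1 + x_2}{2}, \tfrac{y_1 + y_2}{2}\right)$ and radius $r = \tfrac{1}{2}\sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2}$, and the circle fits inside the unit square exactly when the radius does not exceed the distance from the center to the nearest side:

$$r \le \min(x_c,\ 1 - x_c,\ y_c,\ 1 - y_c).$$

Here's the code my teammate and I came up with initially to estimate this probability by random sampling.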
```c
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <limits.h>

static inline float min(float a, float b)
{
    return a < b ? a : b;
}

int main()
{
    uint64_t nPasses = 200000000ul;
    uint64_t pass = 0;
    srand(time(NULL));
    for (int i = 0; i < nPasses; i++) // Mistake in this line: int i, needs to be uint64_t
    {
        float x1 = (float)rand() / (float)RAND_MAX;
        float y1 = (float)rand() / (float)RAND_MAX;
        float x2 = (float)rand() / (float)RAND_MAX;
        float y2 = (float)rand() / (float)RAND_MAX;
        float xc = (x1 + x2) / 2.0;
        float yc = (y1 + y2) / 2.0;
        float x3 = min(xc, (1 - xc));
        float y3 = min(yc, (1 - yc));
        float r = 0.5 * (sqrtf((x2 - x1)*(x2 - x1) + (y2 - y1)*(y2 - y1)));
        if (r <= min(x3, y3))
        {
            pass++;
        }
    }
    printf("Pass = %lu\nFail = %lu\nRatio = %lf\n", pass, nPasses - pass, (double)pass / (double)nPasses);
    return 0;
}
```
And to run it:
```
$ gcc qwiz1.c -o qwiz1 -lm
$ time ./qwiz1
Pass = 104729480
Fail = 95270520
Ratio = 0.523647

real    0m11.677s
user    0m11.671s
sys     0m0.000s
```
So this runs in fairly OK time, but I didn't get the required fifth-decimal-place certainty: on different runs I was getting different results in the fourth and fifth decimals. The solution was to run it 20000000000UL times. That got my code into an infinite loop, the mistake of course being that the for loop's iteration variable was declared as an int, which can never count up to 20 billion. Even after fixing that, I still had to Ctrl-C the run after some time because it was taking too long.
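A minimal sketch of the bug, assuming the usual platform where int is 32 bits (this is an illustration, not part of the contest code):

```c
#include <stdint.h>

void broken(uint64_t nPasses) // e.g. nPasses = 20000000000
{
    // In `i < nPasses`, i is converted to uint64_t for the comparison,
    // but as an int it overflows at INT_MAX (2147483647), which is
    // undefined behaviour, long before it could ever reach 20 billion.
    for (int i = 0; i < nPasses; i++) { /* work */ }
}

void fixed(uint64_t nPasses)
{
    // The counter has the same 64-bit type as the bound, so it simply
    // counts all the way up.
    for (uint64_t i = 0; i < nPasses; i++) { /* work */ }
}
```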
Using threads to parallelize
I started with a simple initial reimplementation using pthread. Every worker runs all the way to nPasses, so our total number of iterations equals nWorkers * nPasses. Beginner-level question: can you spot the problem with this code?
```c
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <limits.h>
#include <pthread.h>

static inline float min(float a, float b)
{
    return a < b ? a : b;
}

uint64_t nPasses = 20000ul;
uint64_t pass = 0;

void *worker(void *arg)
{
    for (uint64_t i = 0; i < nPasses; i++)
    {
        float x1 = (float)rand() / (float)RAND_MAX;
        float y1 = (float)rand() / (float)RAND_MAX;
        float x2 = (float)rand() / (float)RAND_MAX;
        float y2 = (float)rand() / (float)RAND_MAX;
        float xc = (x1 + x2) / 2.0;
        float yc = (y1 + y2) / 2.0;
        float x3 = min(xc, (1 - xc));
        float y3 = min(yc, (1 - yc));
        float r = 0.5 * (sqrtf((x2 - x1)*(x2 - x1) + (y2 - y1)*(y2 - y1)));
        if (r <= min(x3, y3))
        {
            pass++;
        }
    }
    return NULL;
}

int main()
{
    srand(time(NULL));
    const int nWorkers = 10;
    pthread_t t[nWorkers];
    for (int i = 0; i < nWorkers; i++)
    {
        pthread_create(&t[i], NULL, worker, NULL);
    }
    for (int i = 0; i < nWorkers; i++)
    {
        pthread_join(t[i], NULL);
    }
    printf("Pass = %lu\nFail = %lu\nRatio = %lf\n", pass, (nPasses * nWorkers) - pass, (double)pass / (double)(nPasses * nWorkers));
    return 0;
}
```
```
$ gcc qwiz2.c -o qwiz2 -lpthread -lm
$ time ./qwiz2
Pass = 104654
Fail = 95346
Ratio = 0.523270

real    0m0.086s
user    0m0.040s
sys     0m0.491s

$ time ./qwiz2
Pass = 103978
Fail = 96022
Ratio = 0.519890

real    0m0.082s
user    0m0.051s
sys     0m0.431s
```
You can kind of get a hint from the wild variation in the ratios that something is wrong. The reason, of course, is that there is no synchronized access to the global pass counter: concurrent increments from different threads race and some get lost. Let's add a mutex around the line where we access pass:
```c
pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; // Declare this above, near the global vars

// Rest of the code [...]
        if (r <= min(x3, y3))
        {
            pthread_mutex_lock(&m);
            pass++;
            pthread_mutex_unlock(&m);
        }
```
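As an aside (my addition, not something the original code used): for a lone counter like this, a C11 atomic is a lighter-weight alternative to a mutex; a minimal sketch:

```c
#include <stdatomic.h>
#include <stdint.h>

// _Atomic makes each increment a single atomic read-modify-write,
// so no updates are lost and no mutex is needed.
static _Atomic uint64_t pass;

static void count_hit(void)
{
    atomic_fetch_add(&pass, 1); // replaces lock / increment / unlock
}
```

Either way, with access to pass synchronized, the timings now look like this: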
```
$ time ./qwiz2
Pass = 105113
Fail = 94887
Ratio = 0.525565

real    0m0.129s
user    0m0.103s
sys     0m0.749s

$ time ./qwiz2
Pass = 104783
Fail = 95217
Ratio = 0.523915

real    0m0.097s
user    0m0.064s
sys     0m0.561s
```
Immediately we can see that the results are more consistent. However, because of the mutex, our time has gone up from about 0.08s to 0.12s. And there's a more fundamental problem with how we've treated the parallelization: we shouldn't create shared state where we can avoid it. Here the shared variable is pass, so let's have each worker count its own passes and let the main thread collect them at the end.
```c
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <limits.h>
#include <pthread.h>

static inline float min(float a, float b)
{
    return a < b ? a : b;
}

struct work
{
    uint64_t nPasses;
    uint64_t pass;
    uint64_t done;
};

void *worker(void *arg)
{
    struct work *w = (struct work *) arg;
    uint64_t nPasses = w->nPasses;
    uint64_t pass = 0; // local counter: no sharing during the loop
    for (uint64_t i = 0; i < nPasses; i++)
    {
        float x1 = (float)rand() / (float)RAND_MAX;
        float y1 = (float)rand() / (float)RAND_MAX;
        float x2 = (float)rand() / (float)RAND_MAX;
        float y2 = (float)rand() / (float)RAND_MAX;
        float xc = (x1 + x2) / 2.0;
        float yc = (y1 + y2) / 2.0;
        float x3 = min(xc, (1 - xc));
        float y3 = min(yc, (1 - yc));
        float r = 0.5 * (sqrtf((x2 - x1)*(x2 - x1) + (y2 - y1)*(y2 - y1)));
        if (r <= min(x3, y3))
        {
            pass++;
        }
    }
    w->pass = pass; // report back to the main thread once, at the end
    return NULL;
}

int main()
{
    srand(time(NULL));
    const int threadpool_size = 10;
    pthread_t t[threadpool_size];
    struct work w[threadpool_size];
    uint64_t pass = 0;
    uint64_t nPasses = 200000ull;
    for (int i = 0; i < threadpool_size; i++)
    {
        w[i].nPasses = nPasses / threadpool_size;
        w[i].pass = 0;
        w[i].done = 0;
        pthread_create(&t[i], NULL, worker, &w[i]);
    }
    for (int i = 0; i < threadpool_size; i++)
    {
        pthread_join(t[i], NULL);
    }
    for (int i = 0; i < threadpool_size; i++)
    {
        pass += w[i].pass;
    }
    printf("Pass = %lu\nFail = %lu\nRatio = %lf\n", pass, nPasses - pass, (double)pass / (double)nPasses);
    return 0;
}
```
```
$ time ./qwiz3
Pass = 104458
Fail = 95542
Ratio = 0.522290

real    0m0.113s
user    0m0.101s
sys     0m0.574s
```
Hmm, we expected the time to drop drastically, since we've avoided the mutex. However, it's not behaving as we'd assumed. With a little bit of digging, we find the reason: the function rand is not thread-safe. It keeps shared internal state (which glibc guards with an internal lock), so calls from all threads end up serializing on it. The alternative is the reentrant rand_r function, which takes its state as a caller-supplied seed variable.
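As another aside (again my addition, not part of the original experiment): rand_r is a fairly weak generator, so another common pattern is to give each thread its own small PRNG, for example Marsaglia's xorshift64; a minimal sketch:

```c
#include <stdint.h>

// Marsaglia xorshift64: each thread owns one state word, so there is
// no shared state at all. Seed each thread with a distinct nonzero value.
static inline uint64_t xorshift64(uint64_t *state)
{
    uint64_t x = *state;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    return *state = x;
}

// Uniform float in [0, 1): use the top 24 bits, which fit a float's mantissa.
static inline float rand_float(uint64_t *state)
{
    return (xorshift64(state) >> 40) * (1.0f / 16777216.0f);
}
```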
In fact, if we run the code above with one thread versus several, the single-threaded version actually outperforms the multi-threaded one. So, with rand_r, here's the final retooled version:
```c
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <limits.h>
#include <pthread.h>
#include <unistd.h>
#include <string.h>

static inline float min(float a, float b)
{
    return a < b ? a : b;
}

struct work
{
    uint64_t nPasses;
    uint64_t pass;
    uint64_t done;
    unsigned int seed; // rand_r() wants an unsigned int, one per thread
};

void *worker(void *arg)
{
    struct work *w = (struct work *) arg;
    uint64_t nPasses = w->nPasses;
    uint64_t pass = 0;
    // Work in 10 chunks so progress (w->done) is published only
    // occasionally instead of on every single iteration.
    uint64_t chunk = nPasses / 10;
    for (int i = 0; i < 10; i++)
    {
        // The last chunk also takes the remainder when 10 doesn't divide nPasses.
        uint64_t limit = (i == 9) ? nPasses - 9 * chunk : chunk;
        for (uint64_t j = 0; j < limit; j++)
        {
            float x1 = (float)rand_r(&w->seed) / (float)RAND_MAX;
            float y1 = (float)rand_r(&w->seed) / (float)RAND_MAX;
            float x2 = (float)rand_r(&w->seed) / (float)RAND_MAX;
            float y2 = (float)rand_r(&w->seed) / (float)RAND_MAX;
            float xc = (x1 + x2) / 2.0;
            float yc = (y1 + y2) / 2.0;
            float x3 = min(xc, (1 - xc));
            float y3 = min(yc, (1 - yc));
            float r = 0.5 * (sqrtf((x2 - x1)*(x2 - x1) + (y2 - y1)*(y2 - y1)));
            if (r <= min(x3, y3))
            {
                pass++;
            }
        }
        w->done += limit;
        w->pass = pass;
    }
    return NULL;
}

int main(int argc, char **argv)
{
    uint64_t pass = 0;
    uint64_t nPasses = 20000000ull;
    int threadpool_size = 1;
    if ((argc > 1) && (argc % 2))
    {
        int a = 1;
        while (a < argc)
        {
            if (0 == strcmp(argv[a], "-n"))
            {
                nPasses = atoll(argv[a + 1]);
            }
            else if (0 == strcmp(argv[a], "-w"))
            {
                threadpool_size = atoi(argv[a + 1]);
            }
            a += 2;
        }
    }
    pthread_t t[threadpool_size];
    struct work w[threadpool_size];
    for (int i = 0; i < threadpool_size; i++)
    {
        w[i].nPasses = nPasses / threadpool_size;
        if (i == 0)
        {
            w[i].nPasses += nPasses % threadpool_size; // first worker takes any remainder
        }
        w[i].pass = 0;
        w[i].done = 0;
        w[i].seed = 1337 * (i + 1); // fixed per-thread seed; no shared rand() state
        pthread_create(&t[i], NULL, worker, &(w[i]));
    }
    uint64_t done = 0;
    while (done < nPasses)
    {
        done = 0;
        printf("\r[");
        for (int i = 0; i < threadpool_size; i++)
        {
            // Racy read of w[i].done while workers write it: harmless in
            // practice for a progress display, but see the note at the end.
            done += w[i].done;
            printf("%3.2f%% ", 100.0 * ((double)w[i].done / (double)w[i].nPasses));
        }
        printf("]");
        fflush(stdout);
        usleep(10000);
    }
    printf("\n\n");
    for (int i = 0; i < threadpool_size; i++)
    {
        pthread_join(t[i], NULL);
    }
    for (int i = 0; i < threadpool_size; i++)
    {
        pass += w[i].pass;
    }
    printf("Pass = %lu\nFail = %lu\nRatio = %lf\n", pass, nPasses - pass, (double)pass / (double)nPasses);
    return 0;
}
```
With this, we now get much better time performance, shown here with two different inputs, and we can clearly see that adding threads actually improves performance instead of killing it:
```
$ time ./qwizard -n 20000000 -w 10
[100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% ]

Pass = 10473624
Fail = 9526376
Ratio = 0.523681

real    0m0.366s
user    0m3.030s
sys     0m0.106s

$ time ./qwizard -n 200000000 -w 10
[100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% 100.00% ]

Pass = 104715088
Fail = 95284912
Ratio = 0.523575

real    0m1.714s
user    0m16.266s
sys     0m0.000s

$ time ./qwizard -n 200000000 -w 1
[100.00% ]

Pass = 104719524
Fail = 95280476
Ratio = 0.523598

real    0m9.006s
user    0m8.951s
sys     0m0.050s
```
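A closing note on something the final version still glosses over (my observation, not benchmarked here): the struct work entries sit side by side in one array, so a worker publishing w->done can invalidate the cache line holding its neighbours' counters (false sharing), and the main thread's polling read of w[i].done is formally a data race. A sketch of a stricter layout, assuming the common 64-byte cache line:

```c
#include <stdatomic.h>
#include <stdint.h>

struct work
{
    // _Alignas(64) pads each entry to its own cache line, so one
    // worker's writes don't keep evicting its neighbours' counters.
    _Alignas(64) uint64_t nPasses;
    uint64_t pass;
    _Atomic uint64_t done; // polled by the main thread while workers run
    unsigned int seed;
};

// Worker side: publish progress without a data race.
static void publish_progress(struct work *w, uint64_t completed)
{
    atomic_store_explicit(&w->done, completed, memory_order_relaxed);
}

// Main-thread side: poll a worker's progress.
static uint64_t read_progress(struct work *w)
{
    return atomic_load_explicit(&w->done, memory_order_relaxed);
}
```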