gpu_mat_mul.py
from __future__ import division
from numba import cuda
import numpy
import math


# CUDA kernel
@cuda.jit
def matmul(A, B, C):
    """Perform matrix multiplication of C = A * B."""
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp

# Host code
# Initialize the data arrays
A = numpy.full((2400, 1200), 3, numpy.float64)  # matrix containing all 3's
B = numpy.full((1200, 2200), 4, numpy.float64)  # matrix containing all 4's

# Copy the arrays to the device
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# Allocate memory on the device for the result
C_global_mem = cuda.device_array((2400, 2200))

# Configure the blocks: one thread computes one element of C,
# so the grid must cover all of C's rows and columns
threadsperblock = (30, 30)
blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Start the kernel
matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)

# Copy the result back to the host
C = C_global_mem.copy_to_host()
print(C.shape)
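
# Optional sanity check (a minimal sketch, not part of the original script):
# multiplying a (2400, 1200) matrix of all 3's by a (1200, 2200) matrix of all 4's
# should give a matrix where every entry equals 3 * 4 * 1200 = 14400.
expected = numpy.full((2400, 2200), 3 * 4 * 1200, numpy.float64)
assert numpy.allclose(C, expected), "GPU result does not match the expected value"
print(C[0, 0])  # expected: 14400.0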