CUDA_Resize/resize.py at master · royinx/CUDA_Resize · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# pylint: disable=line-too-long, invalid-name, too-many-locals, raising-bad-type, c-extension-no-member, redefined-outer-name
import cv2
import cupy as cp
import numpy as np
from line_profiler import LineProfiler

# Load the CUDA source text; the relative path is resolved against the
# current working directory.
with open('lib_cuResize.cu', 'r', encoding="utf-8") as reader:
    kernel_source = reader.read()

# Wrap the source in a CuPy raw module and look up the resize entry point.
module = cp.RawModule(code=kernel_source)
cuResizeKer = module.get_function("cuResize")

# Shared line profiler instance, used as the ``@profile`` decorator below.
profile = LineProfiler()

@profile
def cuda_resize(inputs: cp.ndarray, # src: (N,H,W,C)
                shape: tuple, # (dst_h, dst_w)
                out: cp.ndarray=None, # dst: (N,dst_h,dst_w,C)
                pad: bool=True):
    """
    Resize a batch of 3-channel images on the GPU with the ``cuResize`` kernel.

    With ``pad=True`` the aspect ratio is kept: the images are scaled to fit
    inside ``shape`` and pasted, centred, onto a zero-filled canvas of exactly
    ``shape``. With ``pad=False`` the images are stretched to ``shape``.

    Args:
        inputs: source batch laid out as (N, H, W, C) with C == 3.
            NOTE(review): the kernel source is not visible here; it presumably
            expects uint8 pixels -- confirm against lib_cuResize.cu.
        shape: target size as (dst_h, dst_w).
        out: optional uint8 destination of shape (N, dst_h, dst_w, C). When
            given, the final (padded) result is written into it and it is the
            array that is returned.
        pad: keep the aspect ratio and zero-pad the borders when True.

    Returns:
        Tuple ``(resize_scale, top_pad, left_pad, result)``. ``resize_scale``
        is the dst/src factor used when padding (1 when ``pad`` is False);
        ``top_pad`` / ``left_pad`` are the border offsets of the image inside
        ``result``; ``result`` is a uint8 array of shape (N, dst_h, dst_w, C).

    Raises:
        ValueError: if ``shape`` is not a 2-tuple, or ``out`` has the wrong
            dtype or shape.
        AssertionError: if ``inputs`` does not have exactly 3 channels.

    Launch layout (to optimise with shared memory):
    block = (1024, )  # 1024 threads per block , to loop a row for dst row, with MAX_WIDTH 7680 (8K)
    grid = (dst_h,N)  #
    """
    out_dtype = cp.uint8

    # Check the length before unpacking so a bad shape yields this message
    # rather than a generic "too many values to unpack".
    if len(shape) != 2:
        raise ValueError("cuda resize target shape must be (h,w)")
    dst_h, dst_w = shape

    N, src_h, src_w, C = inputs.shape
    assert C == 3, "resize kernel only accept 3 channel tensors."

    full_shape = (N, dst_h, dst_w, C)
    # `out is not None` instead of truthiness: bool() of a multi-element
    # array raises, which made the `out` argument unusable.
    if out is not None:
        if out.dtype != out_dtype:
            raise ValueError(f"out must have dtype uint8, got {out.dtype}")
        if tuple(out.shape) != full_shape:
            raise ValueError(f"out must have shape {full_shape}, got {tuple(out.shape)}")

    resize_scale = 1
    left_pad = 0
    top_pad = 0
    if pad:
        if src_h / src_w > dst_h / dst_w:
            # Source is relatively taller: height fills the target, pad left/right.
            resize_scale = dst_h / src_h
            ker_h = dst_h
            # max(1, ...) avoids a zero-sized grid for extreme aspect ratios.
            ker_w = max(1, int(src_w * resize_scale))
            left_pad = (dst_w - ker_w) // 2
        else:
            # Source is relatively wider: width fills the target, pad top/bottom.
            resize_scale = dst_w / src_w
            ker_h = max(1, int(src_h * resize_scale))
            ker_w = dst_w
            top_pad = (dst_h - ker_h) // 2
    else:
        ker_h = dst_h
        ker_w = dst_w

    # Final destination: zero-filled when borders will remain visible.
    if out is None:
        out = cp.zeros(full_shape, dtype=out_dtype) if pad else cp.empty(full_shape, dtype=out_dtype)
    elif pad:
        out.fill(0)

    # The raw kernel receives bare device pointers, so both buffers it touches
    # must be densely packed C-order arrays.
    inputs = cp.ascontiguousarray(inputs)
    if not pad and out.flags.c_contiguous:
        resized = out  # kernel can write straight into the destination
    else:
        resized = cp.empty((N, ker_h, ker_w, C), dtype=out_dtype)

    # define kernel configs
    block = (1024, )
    grid  = (ker_h, N)
    scale_h = cp.float32(src_h / ker_h)
    scale_w = cp.float32(src_w / ker_w)
    with cp.cuda.stream.Stream() as stream:
        # Debug trace kept from the original implementation.
        print(inputs.dtype, resized.dtype ,
              inputs.shape, resized.shape,
              src_h, src_w,
              ker_h, ker_w,
              scale_h, scale_w)

        cuResizeKer(grid, block,
                (inputs, resized,
                cp.int32(src_h), cp.int32(src_w),
                cp.int32(ker_h), cp.int32(ker_w),
                scale_h, scale_w
                )
            )

        if resized is not out:
            # One of top_pad / left_pad is always 0, so a single slice covers
            # both the letterbox and the pillarbox case.
            out[:, top_pad:top_pad + ker_h, left_pad:left_pad + ker_w, :] = resized
        stream.synchronize()

    return resize_scale, top_pad, left_pad, out


def main(input_array: cp.ndarray, resize_shape:tuple):
    """
    Copy an image batch to the GPU and resize it with aspect-preserving padding.

    Args:
        input_array: batch of images as a NumPy (host) or CuPy (device) array,
            laid out as (N, H, W, C).
        resize_shape: target size as (dst_h, dst_w).

    Returns:
        Tuple ``(output_array, [resize_scale, top_pad, left_pad])`` where
        ``output_array`` is the resized batch on the GPU and the list holds
        the scale factor and border offsets reported by ``cuda_resize``.

    Raises:
        TypeError: if ``input_array`` is neither a NumPy nor a CuPy array.
    """
    # Reject anything else explicitly: previously an unsupported type skipped
    # both copy branches and uninitialised GPU memory was resized.
    if not isinstance(input_array, (cp.ndarray, np.ndarray)):
        raise TypeError(
            f"input_array must be a numpy or cupy ndarray, got {type(input_array).__name__}")

    # copy=True keeps the original "work on a private device copy" behaviour
    # (HtoD for NumPy, DtoD for CuPy); order="C" guarantees the dense layout
    # the raw kernel needs even when the source is a strided view.
    input_array_gpu = cp.array(input_array, copy=True, order="C")

    resize_scale, top_pad, left_pad, output_array = cuda_resize(input_array_gpu,
                                                                resize_shape,
                                                                pad=True) # N,H,W,C

    return output_array, [resize_scale, top_pad, left_pad]

if __name__ == "__main__":
    # prepare data: 49 copies of the first image plus a different last frame
    batch = 50
    frame_size = (1920, 1080)  # cv2.resize takes (width, height)

    frames = []
    for path in ("trump.jpg", "rgba.png"):
        image = cv2.imread(path)
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an obscure error inside cv2.resize.
        if image is None:
            raise OSError(f"could not read image file: {path!r}")
        frames.append(cv2.resize(image, frame_size))

    img_batch = np.tile(frames[0], [batch, 1, 1, 1])
    img_batch[-1] = frames[1]

    # resize the whole batch to (dst_h, dst_w) = (320, 640) on the GPU
    output_array, _ = main(img_batch, (320, 640))
    print(output_array)

    # bring the result back to the host and dump one JPEG per batch item
    for idx, img in enumerate(cp.asnumpy(output_array)):
        cv2.imwrite(f"output_{idx}.jpg", img)