-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathresize.py
More file actions
121 lines (103 loc) · 4.29 KB
/
resize.py
File metadata and controls
121 lines (103 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# pylint: disable=line-too-long, invalid-name, too-many-locals, raising-bad-type, c-extension-no-member, redefined-outer-name
import cv2
import cupy as cp
import numpy as np
from line_profiler import LineProfiler
# Line profiler instance; used below as the ``@profile`` decorator.
profile = LineProfiler()

# Read the CUDA source from the current working directory, wrap it in a
# CuPy RawModule and look up the "cuResize" kernel entry point.
with open('lib_cuResize.cu', encoding="utf-8") as reader:
    module = cp.RawModule(code=reader.read())
cuResizeKer = module.get_function("cuResize")
@profile
def cuda_resize(inputs: cp.ndarray, # src: (N,H,W,C)
                shape: tuple, # (dst_h, dst_w)
                out: cp.ndarray=None, # dst: (N,H,W,C)
                pad: bool=True):
    """Resize a batch of 3-channel images on the GPU with the ``cuResize`` kernel.

    With ``pad=True`` the images are scaled so that they fit inside
    ``(dst_h, dst_w)`` while keeping their aspect ratio, and the result is
    centred on a zero-filled ``(N, dst_h, dst_w, C)`` canvas.  With
    ``pad=False`` the images are resized directly to ``(dst_h, dst_w)``.

    Args:
        inputs: Source batch laid out as (N, H, W, C) with C == 3.
        shape: Target size as ``(dst_h, dst_w)``.
        out: Optional uint8 destination of shape ``(N, dst_h, dst_w, C)``.
            When given, the final result is written into it and it is the
            array that is returned.
        pad: Whether to letterbox (keep aspect ratio and zero-pad).

    Returns:
        ``(resize_scale, top_pad, left_pad, result)``.  ``resize_scale`` is
        the dst/src factor used when padding (1 when ``pad`` is False);
        ``top_pad`` / ``left_pad`` are the offsets of the resized image inside
        the padded canvas (0 when unused).

    Raises:
        ValueError: If ``shape`` is not a 2-tuple, or ``out`` has the wrong
            dtype or shape.
        AssertionError: If ``inputs`` does not have exactly 3 channels.

    Notes:
        Launch configuration is ``block = (1024,)`` and ``grid = (ker_h, N)``.
        The original author's note: one block of 1024 threads loops over a
        destination row (MAX_WIDTH 7680, i.e. 8K); still to be optimised with
        shared memory.
        NOTE(review): the raw kernel is handed bare device pointers, so
        ``inputs`` (and ``out`` when ``pad`` is False) are presumably expected
        to be C-contiguous -- confirm against lib_cuResize.cu.
    """
    out_dtype = cp.uint8

    # Validate the target shape *before* unpacking it, and raise a real
    # exception (raising a plain string is itself a TypeError in Python 3).
    if len(shape) != 2:
        raise ValueError("cuda resize target shape must be (h,w)")
    dst_h, dst_w = shape

    N, src_h, src_w, C = inputs.shape
    assert C == 3 # resize kernel only accept 3 channel tensors.

    # ``if out:`` would ask for the truth value of an array, which raises for
    # any multi-element array; test against None explicitly instead.
    if out is not None:
        if out.dtype != out_dtype:
            raise ValueError(f"out must have dtype uint8, got {out.dtype}")
        if tuple(out.shape) != (N, dst_h, dst_w, C):
            raise ValueError(
                f"out must have shape {(N, dst_h, dst_w, C)}, got {tuple(out.shape)}")

    resize_scale = 1
    left_pad = 0
    top_pad = 0
    # True when the source is relatively taller than the target, i.e. the
    # height is the limiting side and the padding goes left/right.
    pad_sides = False
    if pad:
        pad_sides = src_h / src_w > dst_h / dst_w
        if pad_sides:
            resize_scale = dst_h / src_h
            ker_h = dst_h
            ker_w = int(src_w * resize_scale)
            left_pad = int((dst_w - ker_w) / 2)
        else:
            resize_scale = dst_w / src_w
            ker_h = int(src_h * resize_scale)
            ker_w = dst_w
            top_pad = int((dst_h - ker_h) / 2)
    else:
        ker_h = dst_h
        ker_w = dst_w

    # Buffer the kernel writes into.  When padding, the kernel output has the
    # (ker_h, ker_w) size, not the (dst_h, dst_w) size of ``out``, so it always
    # needs its own buffer; ``out`` then receives the padded canvas.
    if pad or out is None:
        resized = cp.empty((N, ker_h, ker_w, C), dtype=out_dtype)
    else:
        resized = out

    padded_batch = None
    if pad:
        if out is None:
            padded_batch = cp.zeros((N, dst_h, dst_w, C), dtype=out_dtype)
        else:
            padded_batch = out
            padded_batch[...] = 0  # clear any previous content of the canvas

    # define kernel configs
    block = (1024, )
    grid = (ker_h, N)
    scale_h = cp.float32(src_h / ker_h)
    scale_w = cp.float32(src_w / ker_w)
    with cp.cuda.stream.Stream() as stream:
        # Debug trace kept from the original implementation.
        print(inputs.dtype, resized.dtype,
              inputs.shape, resized.shape,
              src_h, src_w,
              ker_h, ker_w,
              scale_h, scale_w)
        cuResizeKer(grid, block,
                    (inputs, resized,
                     cp.int32(src_h), cp.int32(src_w),
                     cp.int32(ker_h), cp.int32(ker_w),
                     scale_h, scale_w
                    )
                   )
        if pad:
            # Paste the resized images into the centre of the zeroed canvas.
            if pad_sides:
                padded_batch[:, :, left_pad:left_pad + ker_w, :] = resized
            else:
                padded_batch[:, top_pad:top_pad + ker_h, :, :] = resized
        stream.synchronize()

    if pad:
        return resize_scale, top_pad, left_pad, padded_batch
    return resize_scale, top_pad, left_pad, resized
def main(input_array: cp.ndarray, resize_shape:tuple):
    """Copy a batch of images to a fresh GPU buffer and letterbox-resize it.

    Args:
        input_array: Image batch as a ``cupy.ndarray`` (device-to-device copy)
            or a ``numpy.ndarray`` (host-to-device copy), laid out as
            (N, H, W, C) as required by ``cuda_resize``.
        resize_shape: Target size as ``(dst_h, dst_w)``.

    Returns:
        ``(output_array, [resize_scale, top_pad, left_pad])`` where
        ``output_array`` is the padded result of ``cuda_resize`` (N, H, W, C).

    Raises:
        TypeError: If ``input_array`` is neither a CuPy nor a NumPy array.
    """
    if not isinstance(input_array, (cp.ndarray, np.ndarray)):
        # Previously an unsupported type skipped both copy branches and the
        # uninitialised buffer was resized; fail loudly instead.
        raise TypeError(
            "input_array must be a cupy.ndarray or numpy.ndarray, "
            f"got {type(input_array).__name__}")

    # cp.array always copies (DtoD for CuPy input, HtoD for NumPy input) and
    # order="C" yields a C-contiguous buffer.  A raw memcpy of ``nbytes`` from
    # the data pointer is only correct for contiguous sources, so sliced or
    # transposed inputs would have been copied wrongly.
    input_array_gpu = cp.array(input_array, order="C")

    resize_scale, top_pad, left_pad, output_array = cuda_resize(input_array_gpu,
                                                                resize_shape,
                                                                pad=True) # N,H,W,C
    return output_array, [resize_scale, top_pad, left_pad]
if __name__ == "__main__":
    # prepare data
    batch = 50
    frame_size = (1920, 1080)  # cv2.resize expects (width, height)

    # cv2.imread returns None instead of raising when a file is missing or
    # cannot be decoded; check explicitly so the failure names the file.
    frames = {}
    for image_path in ("trump.jpg", "rgba.png"):
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(
                f"could not read image {image_path!r} (missing or unreadable)")
        frames[image_path] = cv2.resize(image, frame_size)

    # 50 copies of the first image, with the last slot replaced by the second.
    img_batch = np.tile(frames["trump.jpg"], [batch, 1, 1, 1])
    img_batch[-1] = frames["rgba.png"]

    output_array, _ = main(img_batch, (320, 640))
    print(output_array)
    # Bring the results back to the host and dump one JPEG per batch entry.
    for idx, img in enumerate(cp.asnumpy(output_array)):
        cv2.imwrite(f"output_{idx}.jpg", img)