@@ -247,8 +247,8 @@ module Reduce =
247247 let gid = ndRange.GlobalID0
248248
249249 if gid = 0 then
250- let mutable currentKey = keys.[ gid ]
251- let mutable segmentResult = values.[ gid ]
250+ let mutable currentKey = keys.[ 0 ]
251+ let mutable segmentResult = values.[ 0 ]
252252 let mutable segmentCount = 0
253253
254254 for i in 1 .. length - 1 do
@@ -277,51 +277,39 @@ module Reduce =
277277
278278 let kernel = kernel.GetKernel()
279279
280- processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange resultLength keys values reducedValues reducedKeys))
280+ processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange keys.Length keys values reducedValues reducedKeys))
281281
282282 processor.Post( Msg.CreateRunMsg<_, _>( kernel))
283283
284+ reducedKeys, reducedValues
285+
284286 let segmentSequential ( clContext : ClContext ) workGroupSize ( reduceOp : Expr < 'a -> 'a -> 'a >) =
285287
286288 let kernel =
287- <@ fun ( ndRange : Range1D ) uniqueKeyCount ( offsets : ClArray < int >) ( keys : ClArray < int >) ( values : ClArray < 'a >) ( reducedValues : ClArray < 'a >) ( reducedKeys : ClArray < int >) ->
289+ <@ fun ( ndRange : Range1D ) uniqueKeyCount keysLength ( offsets : ClArray < int >) ( keys : ClArray < int >) ( values : ClArray < 'a >) ( reducedValues : ClArray < 'a >) ( reducedKeys : ClArray < int >) ->
288290
289291 let gid = ndRange.GlobalID0
290292
291293 if gid < uniqueKeyCount then
292294 let startPosition = offsets.[ gid]
293- let sourceKey = keys.[ startPosition]
294295
295- let mutable nextPosition = startPosition + 1 // TODO()
296- let mutable nextKey = keys.[ nextPosition]
296+ let sourceKey = keys.[ startPosition]
297297 let mutable sum = values.[ startPosition]
298298
299- while nextKey = sourceKey do
300- sum <- (% reduceOp) sum values.[ nextPosition]
299+ let mutable currentPosition = startPosition + 1
300+
301+ while currentPosition < keysLength
302+ && sourceKey = keys.[ currentPosition] do
301303
302- nextPosition <- nextPosition + 1
303- nextKey <- keys .[ nextPosition ]
304+ sum <- (% reduceOp ) sum values .[ currentPosition ]
305+ currentPosition <- currentPosition + 1
304306
305307 reducedValues.[ gid] <- sum
306308 reducedKeys.[ gid] <- sourceKey @>
307309
308310 let kernel = clContext.Compile kernel
309311
310- let getUniqueBitmap = ClArray.getUniqueBitmap clContext workGroupSize
311-
312- let prefixSum = PrefixSum.runExcludeInplace <@ (+) @> clContext workGroupSize
313-
314- let removeDuplicates = ClArray.removeDuplications clContext workGroupSize
315-
316- fun ( processor : MailboxProcessor < _ >) allocationMode ( keys : ClArray < int >) ( values : ClArray < 'a >) ->
317-
318- let bitmap = getUniqueBitmap processor DeviceOnly keys
319-
320- let resultLength = ( prefixSum processor bitmap 0 ) .ToHostAndFree processor
321-
322- let offsets = removeDuplicates processor bitmap
323-
324- bitmap.Free processor
312+ fun ( processor : MailboxProcessor < _ >) allocationMode ( resultLength : int ) ( offsets : ClArray < int >) ( keys : ClArray < int >) ( values : ClArray < 'a >) ->
325313
326314 let reducedValues = clContext.CreateClArrayWithSpecificAllocationMode( allocationMode, resultLength)
327315
@@ -331,10 +319,12 @@ module Reduce =
331319
332320 let kernel = kernel.GetKernel()
333321
334- processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange resultLength offsets keys values reducedValues reducedKeys))
322+ processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange resultLength keys.Length offsets keys values reducedValues reducedKeys))
335323
336324 processor.Post( Msg.CreateRunMsg<_, _>( kernel))
337325
326+ reducedKeys, reducedValues
327+
338328 let oneWorkGroupSegments ( clContext : ClContext ) workGroupSize ( reduceOp : Expr < 'a -> 'a -> 'a >) =
339329
340330 let kernel =
@@ -343,40 +333,39 @@ module Reduce =
343333 let lid = ndRange.GlobalID0
344334
345335 // load values to local memory (may be without it)
346- let localValues = localArray< 'a> length
336+ let localValues = localArray< 'a> workGroupSize
347337 if lid < length then localValues.[ lid] <- values.[ lid]
348338
349339 // load keys to local memory (mb without it)
350- let localKeys = localArray< int> length
340+ let localKeys = localArray< int> workGroupSize
351341 if lid < length then localKeys.[ lid] <- keys.[ lid]
352342
353343 // get unique keys bitmap
354- let localBitmap = localArray< int> length
355- (% PreparePositions.getUniqueBitmapLocal< int>) localKeys length lid localBitmap
344+ let localBitmap = localArray< int> workGroupSize
345+ localBitmap.[ lid] <- 0
346+ (% PreparePositions.getUniqueBitmapLocal< int>) localKeys workGroupSize lid localBitmap
356347
357348 // get positions from bitmap by prefix sum
358349 // ??? get bitmap by prefix sum in another kernel ???
350+ // ??? we can restrict prefix sum for 0 .. length ???
359351 (% SubSum.localIntPrefixSum) lid workGroupSize localBitmap
360- let localPositions = localBitmap
361352
362- let uniqueKeysCount = localPositions .[ length - 1 ]
353+ let uniqueKeysCount = localBitmap .[ length - 1 ]
363354
364355 if lid < uniqueKeysCount then
365356 let itemKeyId = lid + 1
366- // we can count start position by itemKeyId
367- // but loose coalesced memory read pattern
368357
369358 let startKeyIndex =
370- (% Search.Bin.lowerPosition) length itemKeyId localPositions
359+ (% Search.Bin.lowerPosition) length itemKeyId localBitmap
371360
372361 match startKeyIndex with
373362 | Some startPosition ->
374- let sourcePosition = localPositions .[ startPosition]
363+ let sourceKeyPosition = localBitmap .[ startPosition]
375364 let mutable currentSum = localValues.[ startPosition]
376365 let mutable currentIndex = startPosition + 1
377366
378367 while currentIndex < length
379- && localPositions .[ currentIndex] = sourcePosition do
368+ && localBitmap .[ currentIndex] = sourceKeyPosition do
380369
381370 currentSum <- (% reduceOp) currentSum localValues.[ currentIndex]
382371 currentIndex <- currentIndex + 1
@@ -388,6 +377,7 @@ module Reduce =
388377 let kernel = clContext.Compile kernel
389378
390379 fun ( processor : MailboxProcessor < _ >) allocationMode ( resultLength : int ) ( keys : ClArray < int >) ( values : ClArray < 'a >) ->
380+ if keys.Length > workGroupSize then failwith " The length of the value should not exceed the size of the workgroup"
391381
392382 let reducedValues = clContext.CreateClArrayWithSpecificAllocationMode( allocationMode, resultLength)
393383
@@ -397,6 +387,9 @@ module Reduce =
397387
398388 let kernel = kernel.GetKernel()
399389
400- processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange resultLength keys values reducedValues reducedKeys))
390+ processor.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange keys.Length keys values reducedValues reducedKeys))
401391
402392 processor.Post( Msg.CreateRunMsg<_, _>( kernel))
393+
394+ reducedKeys, reducedValues
395+
0 commit comments