@@ -380,3 +380,59 @@ function remove_spaces_and_split(str::String)
380380 str = replace (str, " " => " " )
381381 return split (str,' ,' )
382382end
383+
"""
    dedup_colnames_colvalues!(column_names::Vector{String}, column_values::Vector)

Deduplicate `column_names` in place, removing the paired entries of `column_values`
alongside, and give priority to non-missing values.

For every name that occurs more than once, exactly one entry survives:

- its value is the first non-missing value found among the occurrences of that name
  (or `missing` when every occurrence is `missing`);
- it sits in the slot of the *last* occurrence of that name, so the surviving entries
  keep the relative order of their last occurrences.

Both vectors are shrunk to the number of distinct names.

# Arguments
- `column_names`: the column names; duplicates are removed in place.
- `column_values`: the values paired index-by-index with `column_names`.

# Returns
`nothing`; both arguments are mutated.

# Throws
- `DimensionMismatch`: if the two vectors do not have the same length.

# Examples
```julia
column_names = ["exam_log_id", "in_date", "in_date", "out_time", "api_url", "in_time", "user_id"]
column_values = ["215c81e9-e002-402e-8482-1382a65ef1e4", "2024-03-11", missing, "2024-03-11T08:27:12.363+01:00", "exam/save-values", "2024-03-11T08:27:09.363+01:00", "b7845f35-0169-488d-b8ce-111f0f07d695"]
dedup_colnames_colvalues!(column_names, column_values)
```

leaves a single `"in_date"` entry holding `"2024-03-11"`; both vectors end up with six
elements.
"""
function dedup_colnames_colvalues!(
    column_names::Vector{String},
    column_values::Vector
)
    # The vectors are paired by index; a length mismatch would otherwise end in a
    # BoundsError or in silently discarded trailing values.
    n_names, n_values = length(column_names), length(column_values)
    if n_names != n_values
        throw(DimensionMismatch(
            "column_names has $n_names elements but column_values has $n_values"
        ))
    end

    # Slot that survives for each name: the first one met while walking backwards,
    # i.e. the last occurrence of the name.
    kept_slot = Dict{String,Int}()
    # Indices of the redundant occurrences, collected in descending order.
    dropped = Int[]

    # Walk backwards so that earlier non-missing values overwrite later ones: the
    # value left in the kept slot is the first non-missing value of that name.
    for i in reverse(eachindex(column_names))
        name = column_names[i]
        if haskey(kept_slot, name)
            value = column_values[i]
            if !ismissing(value)
                column_values[kept_slot[name]] = value
            end
            push!(dropped, i)
        else
            kept_slot[name] = i
        end
    end

    # deleteat! expects its indices sorted in ascending order.
    reverse!(dropped)
    deleteat!(column_names, dropped)
    deleteat!(column_values, dropped)

    return nothing
end
0 commit comments