diff --git a/DESCRIPTION b/DESCRIPTION index 6c7029d..9da2d12 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: geocodebr Title: Geolocalização De Endereços Brasileiros (Geocoding Brazilian Addresses) -Version: 0.6.2 +Version: 0.6.2.9000 Authors@R: c( person("Rafael H. M.", "Pereira", , "rafa.pereira.br@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2125-7465")), @@ -42,7 +42,6 @@ Imports: duckspatial (>= 1.0.0), enderecobr (>= 0.5.0), fs, - geoarrow (>= 0.4.2), glue, h3r, httr2 (>= 1.0.0), @@ -68,4 +67,4 @@ Config/testthat/edition: 3 Encoding: UTF-8 Language: pt Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 +Config/roxygen2/version: 8.0.0 diff --git a/NEWS.md b/NEWS.md index bb9dd5e..ab62d13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# geocodebr v0.6.3 dev + +## Correção de bugs (Bug fixes) + +- Bug corrigido que agora permite usuários passarem como input tabelas de +endereços com apenas alguns campos. Os campos de município e unidade da federação +continuam sendo obrigatórios. Encerra [#89](https://github.com/ipeaGIT/geocodebr/issues/89) +e [#94](https://github.com/ipeaGIT/geocodebr/issues/94) + +## Mudanças pequenas (Minor changes) + +- A função `geocode_reverso()` teve pequeno ganho de velocidade, com drástica +redução no consumo de memória. Na amostra de 1000 pontos, o uso de memória caiu +de 161MB para 95MB. + + + # geocodebr v0.6.2 ## Correção de bugs (Bug fixes) @@ -7,7 +24,7 @@ data release corrente, e ignora eventuais dados de releases antigos que estejam na pasta. [Encerra #90](https://github.com/ipeaGIT/geocodebr/issues/90) - A função `geocode()` agora retorna erro informativo quando alguma coluna na tabela de input tem nome com algum caractere não alfanumérico, como . , ? ^ - ! ~. Não -há problema com o sublinhado _, como em “name_muni”. Fecha [issue #92](https://github.com/ipeaGIT/geocodebr/issues/92) +há problema com a barra baixa _, como em “name_muni”.
Fecha [issue #92](https://github.com/ipeaGIT/geocodebr/issues/92) - Corrigido erro na função de `geocode_reverso()` que impedia usar valores muito altos de `dist_max`. [Encerra #88](https://github.com/ipeaGIT/geocodebr/issues/88) - Incluido 'Language: pt' na DESCRIPTION diff --git a/R/geocode.R b/R/geocode.R index ef24773..be44759 100644 --- a/R/geocode.R +++ b/R/geocode.R @@ -210,6 +210,30 @@ geocode_core <- function( # systime start 66666 ---------------- # timer$mark("Start") + # fix eventual missing fields in input data ------------------------------------------------------- + # geocodebr requires all address fields to be declared + # if one or more fields are not declared, we add mock columns filled with NA + + campos_endereco <- assert_and_assign_address_fields( + campos_endereco, + enderecos + ) + + # determine which columns are missing, if any + missing_cols <- campos_endereco[unlist(lapply(campos_endereco, is.null))] + + if (length(missing_cols)>=1) { + + # add NA-filled mock columns for the missing fields + data.table::setDT(enderecos) + new_colnames <- paste0(names(missing_cols), "tempgeocodebr") + enderecos[, (new_colnames) := NA_character_ ] + + # update address fields with fake columns + campos_endereco[sapply(campos_endereco, is.null)] <- as.list(new_colnames) + } + + # normalize input data ------------------------------------------------------- # standardizing the addresses table to increase the chances of finding a match # in the CNEFE data @@ -219,11 +243,6 @@ geocode_core <- function( message_standardizing_addresses() } - campos_endereco <- assert_and_assign_address_fields( - campos_endereco, - enderecos - ) - input_padrao <- enderecobr::padronizar_enderecos( enderecos = enderecos, campos_do_endereco = enderecobr::correspondencia_campos( @@ -487,6 +506,21 @@ geocode_core <- function( # drop geocodebr temp id column output_df[, tempidgeocodebr := NULL] + # # col precisao como ordered factor + # ordem_precisao <- c( + # "numero", + # "numero_aproximado", + #
"logradouro", + # "cep", + # "localidade", + # "municipio" + # ) + # output_df[, precisao := factor( + # precisao, + # levels = ordem_precisao, + # ordered = TRUE + # )] + # Disconnect from DuckDB when done duckdb::dbDisconnect(con) @@ -508,6 +542,11 @@ geocode_core <- function( # timer$mark("Add H3") } + # drop eventual mock columns with empty strings + if (length(missing_cols)>=1) { + output_df[, (new_colnames) := NULL] + } + # remove data.table class data.table::setindex(output_df, NULL) data.table::setDF(output_df) diff --git a/R/geocode_reverso.R b/R/geocode_reverso.R index b82a58d..14bbb90 100644 --- a/R/geocode_reverso.R +++ b/R/geocode_reverso.R @@ -67,9 +67,6 @@ geocode_reverso <- function( ) } - # pontos <- sf::st_transform(pontos, 4674) - - # prep input ------------------------------------------------------- # converte pontos de input para data.frame @@ -114,24 +111,8 @@ geocode_reverso <- function( # limita escopo de busca aos municipios ------------------------------------------------------- # determine potential municipalities munis <- system.file("extdata/munis_bbox_2022.parquet", package = "geocodebr") |> - arrow::open_dataset() |> - sf::st_as_sf() - - # place holder to use geoarrow becaue: - # Namespace in Imports field not imported from: 'geoarrow' - # All declared Imports should be used. 
- geoarrow::as_geoarrow_vctr("POINT (0 1)") - - # munis_path <- system.file("extdata/munis_2022.parquet", package = "geocodebr") - # - # query_register_muni <- glue::glue( - # "CREATE OR REPLACE TEMP VIEW munis AS - # SELECT *, - # geometry::GEOMETRY AS geometry - # FROM read_parquet('{munis_path}');" - # ) - # - # DBI::dbExecute(conn, query_register_muni) + duckspatial::ddbs_open_dataset() + potential_munis <- duckspatial::ddbs_join( x = pontos, @@ -185,22 +166,28 @@ geocode_reverso <- function( # ST_Point(lon, lat)::GEOMETRY('EPSG:4674') AS geom - cnefe_utm_duck <- duckspatial::ddbs_transform( + # converte cnefe para UTM + cnefe_utm_duck <- duckspatial::ddbs_transform( x = 'cnefe_tb', - y = 'EPSG:31983',conn = conn, + y = 'EPSG:31983', + conn = conn, quiet = TRUE ) - # input to UTM - input_utm_duck <- duckspatial::ddbs_transform( + # converte pontos para UTM + input_utm_duck <- duckspatial::ddbs_transform( x = pontos, y = 'EPSG:31983', + conn = conn, + name = "pontos_utm", + overwrite = T, quiet = TRUE ) - # buffers around input points + # buffer around input points buff <- duckspatial::ddbs_buffer( - x = input_utm_duck, + x = "pontos_utm", + conn = conn, distance = dist_max, quiet = TRUE ) @@ -210,30 +197,14 @@ geocode_reverso <- function( result <- duckspatial::ddbs_join( x = cnefe_utm_duck, y = buff, - join = "within", + join = "intersects", # intersects within + conn = conn, + name = "join_result", + overwrite = T, quiet = TRUE ) ) - # write to connection - duckspatial::ddbs_write_table( - conn = conn, - data = input_utm_duck, - name = "pontos_utm", - overwrite = T, - temp_view = T, - quiet = TRUE - ) - - duckspatial::ddbs_write_table( - conn = conn, - data = result, - name = "join_result", - overwrite = T, - temp_view = T, - quiet = TRUE - ) - # Get column names from both tables cols_a <- DBI::dbGetQuery(conn, "SELECT column_name FROM (DESCRIBE pontos_utm)")$column_name cols_b <- DBI::dbGetQuery(conn, "SELECT column_name FROM (DESCRIBE 
join_result)")$column_name @@ -256,7 +227,7 @@ geocode_reverso <- function( ST_Distance(a.geometry, b.geometry) AS distancia_metros, ROW_NUMBER() OVER ( PARTITION BY a.id - ORDER BY ST_Distance(a.geometry, b.geometry) + ORDER BY distancia_metros ) AS rn FROM pontos_utm AS a JOIN join_result AS b diff --git a/R/onLoad.R b/R/onLoad.R deleted file mode 100644 index 08c8fba..0000000 --- a/R/onLoad.R +++ /dev/null @@ -1,5 +0,0 @@ -.onLoad <- function(libname, pkgname){ # nocov start - - loadNamespace("geoarrow") - -} # nocov end diff --git a/cran-comments.md b/cran-comments.md index 7bcd358..6f540c5 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,21 +1,22 @@ ## R CMD check results -── R CMD check results ────────────────────────────────────────────── geocodebr 0.6.2 ──── -Duration: 2m 33.3s +── R CMD check results ───────────────────────────────────────────── geocodebr 0.6.3 ──── +Duration: 2m 39s 0 errors ✔ | 0 warnings ✔ | 0 notes ✔ -# geocodebr v0.6.2 -## Correção de bugs (Bug fixes) +# geocodebr v0.6.3 -- Fixed a bug to ensure that the package uses only cached data from the -current release and ignores any data from older releases that may be -in the folder. [Closes #90](https://github.com/ipeaGIT/geocodebr/issues/90) -- The `geocode()` function now returns an informational error when a column in the -input table has a name containing a non-alphanumeric character, such as . , ? ^ - ! ~. There -is no issue with the underscore _, as in “name_muni”. Closed [issue #92](https://github.com/ipeaGIT/geocodebr/issues/92) -- Fixed a bug in the `geocode_reverso()` function that prevented the use of very -high values for `dist_max`. [Closes #88](https://github.com/ipeaGIT/geocodebr/issues/88) -- Added ‘Language: pt’ to DESCRIPTION +## Bug fixes +- Fixed a bug that now allows users to pass address tables containing only a +subset of address fields as input. Municipality and state fields remain +mandatory. 
Closes [#89](https://github.com/ipeaGIT/geocodebr/issues/89) +and [#94](https://github.com/ipeaGIT/geocodebr/issues/94) + +## Minor changes + +- The `geocode_reverso()` function achieved a small speed improvement, along +with a substantial reduction in memory usage. In a sample of 1,000 points, +memory consumption dropped from 161MB to 95MB. diff --git a/inst/extdata/large_sample.parquet b/inst/extdata/large_sample.parquet index 7709723..70993af 100644 Binary files a/inst/extdata/large_sample.parquet and b/inst/extdata/large_sample.parquet differ diff --git a/inst/extdata/munis_bbox_2022.parquet b/inst/extdata/munis_bbox_2022.parquet index c83c2d6..3f65f94 100644 Binary files a/inst/extdata/munis_bbox_2022.parquet and b/inst/extdata/munis_bbox_2022.parquet differ diff --git a/man/definir_pasta_cache.Rd b/man/definir_pasta_cache.Rd index 68c89ab..42eb8f6 100644 --- a/man/definir_pasta_cache.Rd +++ b/man/definir_pasta_cache.Rd @@ -9,7 +9,7 @@ definir_pasta_cache(path, verboso = TRUE) \arguments{ \item{path}{Uma string. O caminho para o diretório usado para armazenar os dados em cache. Se \code{NULL}, o pacote usará um diretório versionado salvo -dentro do diretório retornado por \code{\link[tools:userdir]{tools::R_user_dir()}}.} +dentro do diretório retornado por \code{\link[tools:R_user_dir]{tools::R_user_dir()}}.} \item{verboso}{Um valor lógico. Indica se barras de progresso e mensagens devem ser exibidas durante o download dos dados do CNEFE e a geocodificação diff --git a/man/geocodebr.Rd b/man/geocodebr.Rd index 175aae4..deff2d2 100644 --- a/man/geocodebr.Rd +++ b/man/geocodebr.Rd @@ -7,7 +7,7 @@ \title{Package: geocodebr: Geolocalização De Endereços Brasileiros (Geocoding Brazilian Addresses)} \description{ -\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} +\if{html}{\figure{logo.svg}{options: style='float: right' alt='logo' width='120'}} Método simples e eficiente de geolocalizar dados no Brasil. 
O pacote é baseado em conjuntos de dados espaciais abertos de endereços brasileiros, utilizando como fonte principal o Cadastro Nacional de Endereços para Fins Estatísticos (CNEFE). O CNEFE é publicado pelo Instituto Brasileiro de Geografia e Estatística (IBGE), órgão oficial de estatísticas e geografia do Brasil. (A simple and efficient method for geolocating data in Brazil. The package is based on open spatial datasets of Brazilian addresses, primarily using the Cadastro Nacional de Endereços para Fins Estatísticos (CNEFE), published by the Instituto Brasileiro de Geografia e Estatística (IBGE), Brazil's official statistics and geography agency.) } @@ -25,6 +25,7 @@ Useful links: Authors: \itemize{ + \item Rafael H. M. Pereira \email{rafa.pereira.br@gmail.com} (\href{https://orcid.org/0000-0003-2125-7465}{ORCID}) \item Daniel Herszenhut \email{dhersz@gmail.com} (\href{https://orcid.org/0000-0001-8066-1105}{ORCID}) \item Gabriel Garcia de Almeida (\href{https://orcid.org/0009-0003-3557-7328}{ORCID}) } diff --git a/tests/tests_rafa/benchmark_20k.R b/tests/tests_rafa/benchmark_20k.R index 9d5e7fd..b467768 100644 --- a/tests/tests_rafa/benchmark_20k.R +++ b/tests/tests_rafa/benchmark_20k.R @@ -9,14 +9,16 @@ ncores <- 7 campos <- geocodebr::definir_campos( - logradouro = 'logradouro', - numero = 'numero', + # logradouro = 'logradouro', + # numero = 'numero', cep = 'cep', localidade = 'bairro', municipio = 'municipio', estado = 'uf' ) +input_df$logradouro <- NULL +input_df$numero <- NULL bench::mark(iterations = 3, a <- geocodebr::geocode( @@ -34,5 +36,8 @@ bench::mark(iterations = 3, # expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory # streetmap 0.6.0 dev 7.10s 7.26s 0.136 5.47MB 0 5 0 36.7s # laptop 0.6.0 CRAN 5.2s 5.53s 0.174 7.46MB 0 5 0 28.8s -# load 1 a <- geoc… 8.1s 8.79s 0.116 3.03MB 0 3 0 26s -# sem 1 a <- geoc… 10.3s 10.5s 0.0944 3.03MB 0 3 0 31.8s +#1 "" 8.67s 8.86s 0.113 2.04MB 0.0565 2 1 17.7s +#1 "" 8.35s 8.82s 0.115 
5.43MB 0 3 0 26.1s +#1 "NA_int" 6.52s 6.58s 0.152 4.18MB 0.0760 2 1 13.2s +#1 "NA_int" 6.54s 6.81s 0.147 1.73MB 0.0734 2 1 13.6s +#1 a <- geocodebr::ge… 7.58s 7.72s 0.124 4.18MB 0 3 0 24.1s diff --git a/tests/tests_rafa/generate_sample_data.R b/tests/tests_rafa/generate_sample_data.R index b07906a..032bd12 100644 --- a/tests/tests_rafa/generate_sample_data.R +++ b/tests/tests_rafa/generate_sample_data.R @@ -130,6 +130,9 @@ setDT(df) df[, id := 1:nrow(df)] head(df) +data.table::setindex(df, NULL) +data.table::setDF(df) + arrow::write_parquet(df, './inst/extdata/large_sample.parquet') diff --git a/tests/tests_rafa/munis_bbox.R b/tests/tests_rafa/munis_bbox.R index 29473b7..93f79a2 100644 --- a/tests/tests_rafa/munis_bbox.R +++ b/tests/tests_rafa/munis_bbox.R @@ -8,34 +8,34 @@ library(geoarrow) # Load the state polygons df <- geobr::read_municipality(year = 2022, simplified = T) -# Calculate bounding boxes of states -bounding_boxes <- df |> - st_as_sf() |> # Ensure df is an sf object - rowwise() |> # Process each polygon individually - mutate( - xmin = st_bbox(geometry)["xmin"], # Extract xmin from the bounding box - ymin = st_bbox(geometry)["ymin"], # Extract ymin from the bounding box - xmax = st_bbox(geometry)["xmax"], # Extract xmax from the bounding box - ymax = st_bbox(geometry)["ymax"] # Extract ymax from the bounding box - ) |> - ungroup() |> # Unrowwise after rowwise operations - select(code_muni, xmin, ymin, xmax, ymax) |> # Select desired columns - st_drop_geometry() - -# View the resulting bounding box data.frame -head(bounding_boxes) - -data.table::fwrite(bounding_boxes, './inst/extdata/munis_bbox.csv') - - -head(input_table) - -candidate_states <- - subset(x = bounding_boxes, - (xmin < bbox_lon_min | xmax > bbox_lon_max) & - (ymin < bbox_lat_min | ymax > bbox_lat_max) - ) - +# # Calculate bounding boxes of states +# bounding_boxes <- df |> +# st_as_sf() |> # Ensure df is an sf object +# rowwise() |> # Process each polygon individually +# mutate( +# xmin =
st_bbox(geometry)["xmin"], # Extract xmin from the bounding box +# ymin = st_bbox(geometry)["ymin"], # Extract ymin from the bounding box +# xmax = st_bbox(geometry)["xmax"], # Extract xmax from the bounding box +# ymax = st_bbox(geometry)["ymax"] # Extract ymax from the bounding box +# ) |> +# ungroup() |> # Unrowwise after rowwise operations +# select(code_muni, xmin, ymin, xmax, ymax) |> # Select desired columns +# st_drop_geometry() +# +# # View the resulting bounding box data.frame +# head(bounding_boxes) +# +# data.table::fwrite(bounding_boxes, './inst/extdata/munis_bbox.csv') +# +# +# head(input_table) +# +# candidate_states <- +# subset(x = bounding_boxes, +# (xmin < bbox_lon_min | xmax > bbox_lon_max) & +# (ymin < bbox_lat_min | ymax > bbox_lat_max) +# ) +# @@ -65,17 +65,23 @@ bounding_boxes <- bounding_boxes |> head(bounding_boxes) -# arrow::write_parquet(bounding_boxes2, "munis_bbox_2022.parquet") -arrow::write_parquet(bounding_boxes, "munis_bbox_2022.parquet", - compression='zstd', - compression_level = 7) +# # arrow::write_parquet(bounding_boxes2, "munis_bbox_2022.parquet") +# arrow::write_parquet(bounding_boxes, "munis_bbox_2022.parquet", +# compression='zstd', +# compression_level = 7) +# + +# remove the classes "tbl_df" "tbl" from an object +class(bounding_boxes) <- setdiff(class(bounding_boxes), c("tbl_df", "tbl")) + +duckspatial::ddbs_write_dataset( + data = bounding_boxes, + path = './inst/extdata/munis_bbox_2022.parquet', + crs = "EPSG:4674", + overwrite = T, + parquet_compression = "ZSTD", + quiet = TRUE +) -path <- "//storage6/usuarios/Proj_acess_oport/git_rafa/prep_roger/data/municipality/2022/municipalities_2022_simplified.parquet" -df <- arrow::open_dataset(path) |> - dplyr::select(code_muni, geometry) |> - sf::st_as_sf() -arrow::write_parquet(df, "munis_2022.parquet", - compression='zstd', - compression_level = 7) diff --git a/tests/tests_rafa/reverse_geocode_tests.R b/tests/tests_rafa/reverse_geocode_tests.R index db51853..3ce2130 100644 
--- a/tests/tests_rafa/reverse_geocode_tests.R +++ b/tests/tests_rafa/reverse_geocode_tests.R @@ -1,6 +1,7 @@ devtools::load_all('.') +# library(geocodebr) library(dplyr) -library(geoarrow) +library(sf) # input data @@ -20,21 +21,19 @@ bench::system_time( dist_max = 1000 ) ) + View(out) # ttt <- data.frame(id=1, lat=-15.814192047159876, lon=-47.90534614672923) -# reverse_geocode(df = ttt) - -# take aways -# ok reverse_geocode_filter # mais rapdido e eficiente, mas sem progress bar -# ok reverse_geocode_join # igual o _filter, mas usa join -# ok reverse_geocode_hybrid # com progress bar mas um pouco mais lento e bem mais memoria -# ok reverse_geocode_arrow # tempo igual a _hybrid, mas usa bem mais memoria -# ok filterloop # disparado o mais lento, com progress e memoria media +# ttt <- sf::st_as_sf( +# ttt, +# coords = c("lon", "lat"), +# crs = 4674 +# ) +# +# geocode_reverso(pontos = ttt) -# essa funcao pode fica muito mais rapida / eficiente se usarmos a biblioteca de -# dados espaciais do duckdb b5 <- bench::mark( current = geocode_reverso(pontos = pontos, dist_max = 1000), @@ -43,322 +42,13 @@ b5 <- bench::mark( ) b5 -# - -# # 500 pontos -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory -# -# 1 duck_filter4 1.54m 1.62m 0.0101 221.5MB 0.0423 5 21 8.28m -# 2 duck_filter_loop4 5.35m 6.88m 0.00255 14.5MB 0.00357 5 7 32.7m -# 3 hybrid4 2.41m 2.47m 0.00663 34.5MB 0.302 5 228 12.56m - -# # 1000 pontos -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory -# -# 1 duck_filt… 2.97m 2.97m 0.00560 11.3MB 0 1 0 2.97m -# 2 duck_join4 3.03m 3.03m 0.00550 11.5MB 0 1 0 3.03m -# 3 arrow4 4.27m 4.27m 0.00391 240.1MB 0.316 1 81 4.27m -# 4 hybrid4 4.24m 4.24m 0.00393 122.4MB 0.110 1 28 4.24m -# 4 filterloop 10.7m 11.27m 0.00146 19.8MB 0.00195 3 4 34.19m -# 1 current 1.92s 1.98s 0.496 247.62MB 3.17 5 32 10.08s -# 2 geocrev2 1.28s 1.33s 0.730 8.03MB 0.730 5 5 6.85s -# 3 geocrev3 1.38s 1.61s 0.626 8.29MB 0.752 
5 6 7.98s -# - - - -# aternativas da funcao de geocode reverso ----------------------------------------------------------- - -#' muni join + haversine -#' (usa spatial join para detectar munis candidatos, e depois calcular haversine na unha) -#' a diferenca dessa para a versao implementada é que a implementada calcula distancias dentro do duckspatial com ST_DIST - -geocode_reverso2 <- function( - pontos, - dist_max = 1000, - verboso = TRUE, - cache = TRUE, - n_cores = NULL -) { - # check input - checkmate::assert_class(pontos, 'sf') - checkmate::assert_number(dist_max, lower = 500, upper = 100000) # max 100 Km - checkmate::assert_logical(verboso) - checkmate::assert_logical(cache) - - # check if geometry type is POINT - if (any(sf::st_geometry_type(pontos) != 'POINT')) { - cli::cli_abort( - "Input precisa ser um sf data frame com geometria do tipo POINT." - ) - } - - epsg <- sf::st_crs(pontos)$epsg - if (epsg != 4674) { - cli::cli_abort( - "Dados de input precisam estar com sistema de coordenadas geogr\u00e1ficas SIRGAS 2000, EPSG 4674." 
- ) - } - - # prep input ------------------------------------------------------- - - # converte para data.frame - coords <- sfheaders::sf_to_df(pontos, fill = TRUE) - data.table::setDT(coords) - coords[, c('sfg_id', 'point_id') := NULL] - data.table::setnames(coords, old = c('x', 'y'), new = c('lon', 'lat')) - - # create temp id - coords[, tempidgeocodebr := 1:nrow(coords)] - - # convert max_dist to degrees - # 1 degree of latitude is always 111320 meters - margin_lat <- dist_max / 111320 - - # 1 degree of longitude is 111320 * cos(lat) - coords[, c("lat_min", "lat_max") := .(lat - margin_lat, lat + margin_lat)] - - coords[, - c("lon_min", "lon_max") := .( - lon - dist_max / 111320 * cos(lat), - lon + dist_max / 111320 * cos(lat) - ) - ] - - # get bounding box around input points - # using a range of max dist around input points - bbox_lat_min <- min(coords$lat_min) - bbox_lat_max <- max(coords$lat_max) - bbox_lon_min <- min(coords$lon_min) - bbox_lon_max <- max(coords$lon_max) - - # check if input falls within Brazil - bbox_brazil <- data.frame( - xmin = -73.99044997, - ymin = -33.75208127, - xmax = -28.83594354, - ymax = 5.27184108 - ) - - error_msg <- 'Coordenadas de input localizadas fora do bounding box do Brasil.' 
- if ( - bbox_lon_min < bbox_brazil$xmin | - bbox_lon_max > bbox_brazil$xmax | - bbox_lat_min < bbox_brazil$ymin | - bbox_lat_max > bbox_brazil$ymax - ) { - cli::cli_abort(error_msg) - } - - # download cnefe ------------------------------------------------------- - - # downloading cnefe - cnefe_dir <- geocodebr::download_cnefe( - tabela = 'municipio_logradouro_numero_cep_localidade', - verboso = verboso, - cache = cache - ) - - # creating a temporary db and register the input table data - con <- create_geocodebr_db(n_cores = n_cores) - - - # limita escopo de busca aos municipios ------------------------------------------------------- - # determine potential municipalities - munis <- system.file("extdata/munis_bbox_2022.parquet", package = "geocodebr") |> - arrow::open_dataset() |> - sf::st_as_sf() - # munis_path <- system.file("extdata/munis_2022.parquet", package = "geocodebr") - # - # query_register_muni <- glue::glue( - # "CREATE OR REPLACE TEMP VIEW munis AS - # SELECT *, - # geometry::GEOMETRY AS geometry - # FROM read_parquet('{munis_path}');" - # ) - # - # DBI::dbExecute(conn, query_register_muni) - - potential_munis <- duckspatial::ddbs_join( - x = pontos, - y = munis, - join = "within", - quiet = TRUE - ) |> - dplyr::pull(code_muni) |> - unique() - - potential_munis <- enderecobr::padronizar_municipios(potential_munis) - - # lida com munis com apostrofe no nome tipo Olho d'agua - potential_munis <- gsub("'", "''", potential_munis, fixed = TRUE) - - unique_munis <- paste(glue::glue("'{potential_munis}'"), collapse = ",") - - # build path to local file - path_to_parquet <- fs::path( - listar_pasta_cache(), - glue::glue("geocodebr_data_release_{data_release}"), - paste0("municipio_logradouro_numero_cep_localidade.parquet") - ) - - # create filtered_cnefe table, filter on the fly - cols_to_keep <- c( - "estado", - "municipio", - "logradouro", - "numero", - "cep", - "localidade", - "endereco_completo", - "lon", - "lat" - ) - cols_to_keep <- paste0(cols_to_keep, 
collapse = ", ") - - # Load CNEFE data and filter it to include only municipalities - # present in the input table, reducing the search scope - # Narrow search global scope of cnefe to bounding box - query_filter_cnefe <- glue::glue( - "CREATE TEMP VIEW filtered_cnefe AS - SELECT {cols_to_keep} - FROM read_parquet('{path_to_parquet}') m - WHERE m.municipio IN ({unique_munis});" - ) - - DBI::dbExecute(con, query_filter_cnefe) - # DBI::dbExecute(con, query_filter_cnefe) - # b <- DBI::dbReadTable(con, "filtered_cnefe") - - # Convert input data frame to DuckDB table - duckdb::dbWriteTable(con, "input_table_db", coords, temporary = TRUE) - # Haversine macro (kept for speed; consider spatial extension later) - DBI::dbExecute( - con, - " - CREATE MACRO IF NOT EXISTS haversine(lat1, lon1, lat2, lon2) AS ( - 6378137 * 2 * ASIN( - SQRT( - POWER(l(lat2 - lat1) / 2), 2) + - COS(RADIANS(lat1)) * COS(RADIANS(lat2)) * - POWER(SIN(RADIANS(lon2 - lon1) / 2), 2) - ) - ) - ); - " - ) +## 500 pontos +# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory +# v0.6.2 2.9s 3.02s 0.330 153MB 1.39 5 21 15.1s +# v0.6.2.9000 2.4s 2.49s 0.393 4.72MB 0.708 5 9 12.7s filtered_cnefe.lat - AND input_table_db.lon_min < filtered_cnefe.lon - AND input_table_db.lon_max > filtered_cnefe.lon - ), - - ranked AS ( - SELECT - *, - RANK() OVER ( - PARTITION BY tempidgeocodebr - ORDER BY distancia_metros ASC - ) AS ranking - FROM dist_data - ) - - SELECT * EXCLUDE(tempidgeocodebr, ranking) - FROM ranked - WHERE ranking = 1;" - ) - - - - output <- DBI::dbGetQuery(con, query_filter_cases_nearby) - - # TODO 6666666 - if (nrow(output)==0){ - stop("Nenhum endereco proximo foi encontrados") - } - - # organize output ------------------------------------------------- - - # convert df to simple feature - output_sf <- sfheaders::sf_point( - obj = output, - x = 'lon', - y = 'lat', - keep = TRUE - ) - - sf::st_crs(output_sf) <- 4674 - - duckdb::dbDisconnect(con) - - return(output_sf) -} 
- - - - - - - - - - - - - - - -pontos <- readRDS( - system.file("extdata/pontos.rds", package = "geocodebr") -) - -bench::mark( - # duck_filter1 = reverse_geocode_filter(coordenadas = pontos, dist_max = 2000, n_cores = 1), - # duck_filter8 = reverse_geocode_filter(coordenadas = pontos, dist_max = 2000, n_cores = 8), - # duck_join1 = reverse_geocode_join(coordenadas = pontos, dist_max = 2000, n_cores = 1), - # duck_join8 = reverse_geocode_join(coordenadas = pontos, dist_max = 2000, n_cores = 8), - hybrid1 = reverse_geocode_hybrid(coordenadas = pontos, dist_max = 2000, n_cores = 1), - hybrid8 = reverse_geocode_hybrid(coordenadas = pontos, dist_max = 2000, n_cores = 8), - iterations = 5, - check = F -) - -# 1000 pontos -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory -# -# 1 duck_filter1 3.19m 3.31m 0.00490 75MB 0.0108 5 11 17m -# 2 duck_filter8 2.93m 3.04m 0.00516 51MB 0.00723 5 7 16.1m -# -# 1 duck_join1 2.93m 3.54m 0.00475 76.2MB 0.00854 5 9 17.6m -# 2 duck_join8 3.62m 4.05m 0.00407 51.2MB 0.00651 5 8 20.5m -# -# 1 hybrid1 5.13m 5.9m 0.00277 88.3MB 0.262 5 473 30.1m -# 2 hybrid8 5.09m 5.19m 0.00321 66MB 0.307 5 478 25.9m +## 1000 pontos +# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result +# v0.6.2 4.05s 4.66s 0.217 161MB 1.04 5 24 23s +# v0.6.2.9000 3.15s 4.11s 0.236 95MB 0.378 5 8 21.2s diff --git a/tests/testthat/test-geocode.R b/tests/testthat/test-geocode.R index 8d7fb0d..f83943f 100644 --- a/tests/testthat/test-geocode.R +++ b/tests/testthat/test-geocode.R @@ -70,6 +70,72 @@ test_that("expected output", { }) + +test_that("partial address in the input", { + + df_parcial <- data.frame( + uf = c("PA", "PA"), + municipio = c("Santarem", "Santarem"), + cep_estab = c("68005000", "68000000"), + stringsAsFactors = FALSE + ) + + campos_parcial <- geocodebr::definir_campos( + estado = "uf", + municipio = "municipio", + cep = "cep_estab" + ) + # geocoding must run without error when only estado, municipio and cep are declared + testthat::expect_no_error( + tester(enderecos = df_parcial,
campos_endereco = campos_parcial) + ) + + # definir_campos() must fail when a mandatory field (estado or municipio) is omitted + testthat::expect_error( + geocodebr::definir_campos( + # estado = "uf", + municipio = "municipio") + ) + + # must fail when a declared field (cep_estab) is absent from the input data + df_parcial2 <- data.frame( + uf = c("PA", "PA"), + municipio = c("Santarem", "Santarem"), + #cep_estab = c("68005000", "68000000"), + stringsAsFactors = FALSE + ) + + testthat::expect_error( + tester(enderecos = df_parcial2, campos_endereco = campos_parcial) + ) + + +}) + + +test_that("precisao Ipea", { + # addresses declared with only estado, municipio and cep must yield desvio_metros below 115 + df_ipea <- data.frame( + uf = c("DF", "RJ"), + municipio = c("Brasilia", "Rio de Janeiro"), + cep = c("70390-025", "20071-001"), + stringsAsFactors = FALSE + ) + + campos_ipea <- geocodebr::definir_campos( + estado = "uf", + municipio = "municipio", + cep = "cep" + ) + + out_ipea <- tester(enderecos = df_ipea, campos_endereco = campos_ipea) + + testthat::expect_true(all(out_ipea$desvio_metros < 115)) + +}) + + + test_that("argumento padronizar endereco", { # erro se input nao estiver padronizado