diff --git a/.sqlx/query-2e2ecdf53fcd4b328e22164c73aa66fce426521d0f717e67d6cff03b5424ab3f.json b/.sqlx/query-2e2ecdf53fcd4b328e22164c73aa66fce426521d0f717e67d6cff03b5424ab3f.json deleted file mode 100644 index 63917d227..000000000 --- a/.sqlx/query-2e2ecdf53fcd4b328e22164c73aa66fce426521d0f717e67d6cff03b5424ab3f.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "INSERT INTO files (path, mime, content, compression)\n VALUES ($1, $2, $3, $4)\n ON CONFLICT (path) DO UPDATE\n SET mime = EXCLUDED.mime, content = EXCLUDED.content, compression = EXCLUDED.compression", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Varchar", - "Varchar", - "Bytea", - "Int4" - ] - }, - "nullable": [] - }, - "hash": "2e2ecdf53fcd4b328e22164c73aa66fce426521d0f717e67d6cff03b5424ab3f" -} diff --git a/.sqlx/query-2fd2aad681960b30ca5149dfc7050c477667d5f022349661385026c757df88cc.json b/.sqlx/query-2fd2aad681960b30ca5149dfc7050c477667d5f022349661385026c757df88cc.json deleted file mode 100644 index 9510799b3..000000000 --- a/.sqlx/query-2fd2aad681960b30ca5149dfc7050c477667d5f022349661385026c757df88cc.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "SELECT\n path, mime, date_updated, compression,\n substring(content from $2 for $3) as content\n FROM files\n WHERE path = $1;", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "path", - "type_info": "Varchar" - }, - { - "ordinal": 1, - "name": "mime", - "type_info": "Varchar" - }, - { - "ordinal": 2, - "name": "date_updated", - "type_info": "Timestamptz" - }, - { - "ordinal": 3, - "name": "compression", - "type_info": "Int4" - }, - { - "ordinal": 4, - "name": "content", - "type_info": "Bytea" - } - ], - "parameters": { - "Left": [ - "Text", - "Int4", - "Int4" - ] - }, - "nullable": [ - false, - false, - false, - true, - null - ] - }, - "hash": "2fd2aad681960b30ca5149dfc7050c477667d5f022349661385026c757df88cc" -} diff --git a/.sqlx/query-33c3fd9e8cd6c41e3279ed9c449ce0b48db8a4628a6a30ca9987026d8710341a.json b/.sqlx/query-33c3fd9e8cd6c41e3279ed9c449ce0b48db8a4628a6a30ca9987026d8710341a.json deleted file mode 100644 index 6a60fbc92..000000000 --- a/.sqlx/query-33c3fd9e8cd6c41e3279ed9c449ce0b48db8a4628a6a30ca9987026d8710341a.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "SELECT COUNT(*) > 0 as \"has_count!\" FROM files WHERE path = $1", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "has_count!", - "type_info": "Bool" - } - ], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [ - null - ] - }, - "hash": "33c3fd9e8cd6c41e3279ed9c449ce0b48db8a4628a6a30ca9987026d8710341a" -} diff --git a/.sqlx/query-3d98a27dab09b6f74a097b8cea8077063f77fb1758c9c4a075d2dfa4e4a80822.json b/.sqlx/query-3d98a27dab09b6f74a097b8cea8077063f77fb1758c9c4a075d2dfa4e4a80822.json deleted file mode 100644 index 0bd35131c..000000000 --- a/.sqlx/query-3d98a27dab09b6f74a097b8cea8077063f77fb1758c9c4a075d2dfa4e4a80822.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "SELECT path\n FROM files\n WHERE path LIKE $1\n ORDER BY path;", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "path", - "type_info": "Varchar" - } - ], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [ - false - ] - }, - "hash": "3d98a27dab09b6f74a097b8cea8077063f77fb1758c9c4a075d2dfa4e4a80822" -} diff --git a/.sqlx/query-735acda27c734197a3e68d8bb7adcb7ea545673a9cd0cde77be894043ad4f6d9.json 
b/.sqlx/query-735acda27c734197a3e68d8bb7adcb7ea545673a9cd0cde77be894043ad4f6d9.json deleted file mode 100644 index 2bd024ca5..000000000 --- a/.sqlx/query-735acda27c734197a3e68d8bb7adcb7ea545673a9cd0cde77be894043ad4f6d9.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "DELETE FROM files WHERE path LIKE $1;", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [] - }, - "hash": "735acda27c734197a3e68d8bb7adcb7ea545673a9cd0cde77be894043ad4f6d9" -} diff --git a/.sqlx/query-ce9cf294c964be76b19585a35c37f4b3643b09cfea98ed8ba13e4dadb5b5e48c.json b/.sqlx/query-ce9cf294c964be76b19585a35c37f4b3643b09cfea98ed8ba13e4dadb5b5e48c.json deleted file mode 100644 index 62e26518c..000000000 --- a/.sqlx/query-ce9cf294c964be76b19585a35c37f4b3643b09cfea98ed8ba13e4dadb5b5e48c.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "SELECT\n path,\n mime,\n date_updated,\n compression,\n content\n FROM files\n WHERE path = $1;", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "path", - "type_info": "Varchar" - }, - { - "ordinal": 1, - "name": "mime", - "type_info": "Varchar" - }, - { - "ordinal": 2, - "name": "date_updated", - "type_info": "Timestamptz" - }, - { - "ordinal": 3, - "name": "compression", - "type_info": "Int4" - }, - { - "ordinal": 4, - "name": "content", - "type_info": "Bytea" - } - ], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [ - false, - false, - false, - true, - true - ] - }, - "hash": "ce9cf294c964be76b19585a35c37f4b3643b09cfea98ed8ba13e4dadb5b5e48c" -} diff --git a/Cargo.lock b/Cargo.lock index 5d4f264ca..105aa0693 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -687,29 +687,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59e62db736db19c488966c8d787f52e6270be565727236fd5579eaa301e7bc4a" dependencies = [ "aws-smithy-async", - "aws-smithy-protocol-test", "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", "h2 0.3.27", "h2 0.4.12", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", - "http-body 1.0.1", "hyper 0.14.32", "hyper 1.8.1", "hyper-rustls 0.24.2", "hyper-rustls 0.27.7", "hyper-util", - "indexmap 2.12.1", "pin-project-lite", "rustls 0.21.12", "rustls 0.23.35", "rustls-native-certs", "rustls-pki-types", - "serde", - "serde_json", "tokio", "tokio-rustls 0.26.4", "tower", @@ -734,25 +728,6 @@ dependencies = [ "aws-smithy-runtime-api", ] -[[package]] -name = "aws-smithy-protocol-test" -version = "0.63.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01317a9e3c5c06f1af35001ef0c873c1e34e458c20b2ee1eee0fb431e6dbb010" -dependencies = [ - "assert-json-diff", - "aws-smithy-runtime-api", - "base64-simd", - "cbor-diag", - "ciborium", - "http 0.2.12", - "pretty_assertions", - "regex-lite", - "roxmltree", - "serde_json", - "thiserror 2.0.17", -] - [[package]] name = "aws-smithy-query" version = "0.60.9" @@ -785,7 +760,6 @@ dependencies = [ "pin-utils", "tokio", "tracing", - "tracing-subscriber", ] [[package]] @@ -1074,15 +1048,6 @@ dependencies = [ "cfg_aliases", ] -[[package]] -name = "bs58" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf88ba1141d185c399bee5288d850d63b8369520c1eafc32a0430b5b6c287bf4" -dependencies = [ - "tinyvec", -] - [[package]] name = "bstr" version = "1.12.1" @@ -1155,25 +1120,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 
-[[package]] -name = "cbor-diag" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc245b6ecd09b23901a4fbad1ad975701fd5061ceaef6afa93a2d70605a64429" -dependencies = [ - "bs58", - "chrono", - "data-encoding", - "half", - "nom", - "num-bigint", - "num-rational", - "num-traits", - "separator", - "url", - "uuid", -] - [[package]] name = "cc" version = "1.2.49" @@ -1788,12 +1734,6 @@ dependencies = [ "parking_lot_core", ] -[[package]] -name = "data-encoding" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" - [[package]] name = "debugid" version = "0.8.0" @@ -1964,26 +1904,17 @@ version = "0.6.0" dependencies = [ "anyhow", "askama", - "async-compression", "async-stream", - "aws-config", - "aws-sdk-s3", - "aws-smithy-runtime", - "aws-smithy-types", - "aws-smithy-types-convert", "axum", "axum-extra", "base64", "bincode 2.0.1", - "bzip2", "chrono", "clap", "comrak", "constant_time_eq", "crates-index", "crates-index-diff", - "criterion", - "dashmap", "derive_builder", "derive_more 2.1.0", "docs_rs_cargo_metadata", @@ -1996,11 +1927,11 @@ dependencies = [ "docs_rs_opentelemetry", "docs_rs_registry_api", "docs_rs_repository_stats", + "docs_rs_storage", "docs_rs_types", "docs_rs_uri", "docs_rs_utils", "docsrs-metadata", - "flate2", "fn-error-context", "font-awesome-as-a-crate", "futures-util", @@ -2021,7 +1952,6 @@ dependencies = [ "num_cpus", "opentelemetry", "opentelemetry_sdk", - "path-slash", "phf 0.13.1", "phf_codegen 0.13.1", "pretty_assertions", @@ -2050,11 +1980,8 @@ dependencies = [ "tracing", "tracing-futures", "tracing-log", - "tracing-subscriber", "url", "walkdir", - "zip", - "zstd", ] [[package]] @@ -2209,6 +2136,48 @@ dependencies = [ "tracing", ] +[[package]] +name = "docs_rs_storage" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-stream", + "aws-config", + "aws-sdk-s3", + "aws-smithy-types-convert", + "bzip2", + "chrono", + "criterion", + "dashmap", + "docs_rs_env_vars", + "docs_rs_headers", + "docs_rs_logging", + "docs_rs_mimes", + "docs_rs_opentelemetry", + "docs_rs_types", + "docs_rs_utils", + "flate2", + "futures-util", + "http 1.4.0", + "itertools 0.14.0", + "mime", + "opentelemetry", + "rand 0.9.2", + "serde", + "serde_json", + "sqlx", + "strum", + "tempfile", + "test-case", + "thiserror 2.0.17", + "tokio", + "tracing", + "walkdir", + "zip", + "zstd", +] + [[package]] name = "docs_rs_types" version = "0.1.0" @@ -5157,12 +5126,6 @@ dependencies = [ "unicase", ] -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.8.9" @@ -5263,16 +5226,6 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - [[package]] name = "normpath" version = "1.5.0" @@ -5300,16 +5253,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - [[package]] name = "num-bigint-dig" version = "0.8.6" @@ -5352,17 +5295,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -5810,12 +5742,6 @@ dependencies = [ "windows-link 0.2.1", ] -[[package]] -name = "path-slash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42" - [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -6513,15 +6439,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "roxmltree" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "921904a62e410e37e215c40381b7117f830d9d89ba60ab5236170541dd25646b" -dependencies = [ - "xmlparser", -] - [[package]] name = "rsa" version = "0.9.9" @@ -7001,12 +6918,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "separator" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f97841a747eef040fcd2e7b3b9a220a7205926e60488e673d9e4926d27772ce5" - [[package]] name = "serde" version = "1.0.228" @@ -7043,7 +6954,6 @@ version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ - "indexmap 2.12.1", "itoa 1.0.15", "memchr", "ryu", @@ -8240,7 +8150,6 @@ dependencies = [ "serde", "serde_json", "sharded-slab", - "smallvec", "thread_local", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 1d89c2c14..d7799a907 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ exclude = [ [workspace.dependencies] anyhow = { version = "1.0.42", features = ["backtrace"]} askama = "0.14.0" +async-stream = "0.3.5" axum-extra = { version = "0.12.0", features = ["typed-header", "routing", "middleware"] } bincode = "2.0.1" chrono = { version = "0.4.11", default-features = false, features = ["clock", "serde"] } @@ -36,17 +37,21 @@ opentelemetry = "0.31.0" opentelemetry-otlp = { version = "0.31.0", features = ["grpc-tonic", "metrics"] } opentelemetry-resource-detectors = "0.10.0" opentelemetry_sdk = { version = "0.31.0", features = ["rt-tokio"] } +rand = "0.9" regex = "1" reqwest = { version = "0.12", features = ["json", "gzip"] } sentry = { version = "0.46.0", features = ["panic", "tracing", "tower-http", "anyhow", "backtrace"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" sqlx = { version = "0.8", features = [ "runtime-tokio", "postgres", "sqlite", "chrono" ] } +strum = { version = "0.27.0", features = ["derive"] } +tempfile = "3.1.0" test-case = "3.0.0" thiserror = "2.0.3" tokio = { version = "1.0", features = ["rt-multi-thread", "signal", "macros", "process", "sync"] } tracing = "0.1.37" url = { version = "2.1.1", features = ["serde"] } +walkdir = "2" [dependencies] docs_rs_cargo_metadata = { path = "crates/lib/docs_rs_cargo_metadata" } @@ -59,13 +64,13 @@ docs_rs_mimes = { path = "crates/lib/docs_rs_mimes" } docs_rs_opentelemetry = { path = "crates/lib/docs_rs_opentelemetry" } docs_rs_registry_api = { path = "crates/lib/docs_rs_registry_api" } 
docs_rs_repository_stats = { path = "crates/lib/docs_rs_repository_stats" } +docs_rs_storage = { path = "crates/lib/docs_rs_storage" } docs_rs_types = { path = "crates/lib/docs_rs_types" } docs_rs_uri = { path = "crates/lib/docs_rs_uri" } docs_rs_utils = { path = "crates/lib/docs_rs_utils" } sentry = { workspace = true } log = "0.4" tracing = { workspace = true } -tracing-subscriber = { version = "0.3.20", default-features = false, features = ["ansi", "fmt", "json", "env-filter", "tracing-log"] } tracing-log = "0.2.0" regex = { workspace = true } clap = { version = "4.0.22", features = [ "derive" ] } @@ -86,17 +91,11 @@ toml = "0.9.2" opentelemetry = { workspace = true } opentelemetry_sdk = { workspace = true } rustwide = { version = "0.21.0", features = ["unstable-toolchain-ci", "unstable"] } -zstd = "0.13.0" -flate2 = "1.1.1" hostname = "0.4.0" -path-slash = "0.2.0" base64 = "0.22" -strum = { version = "0.27.0", features = ["derive"] } +strum = { workspace = true } lol_html = "2.0.0" font-awesome-as-a-crate = { path = "crates/lib/font-awesome-as-a-crate" } -dashmap = "6.0.0" -zip = {version = "6.0.0", default-features = false, features = ["bzip2"]} -bzip2 = "0.6.0" getrandom = "0.3.1" itertools = { workspace = true } hex = "0.4.3" @@ -105,15 +104,11 @@ sysinfo = { version = "0.37.2", default-features = false, features = ["system"] derive_builder = "0.20.2" # Async -async-compression = { version = "0.4.32", features = ["tokio", "bzip2", "zstd", "gzip"] } tokio = { workspace = true } tokio-util = { version = "0.7.15", default-features = false, features = ["io"] } tracing-futures= { version = "0.2.5", features = ["std-future", "futures-03"] } futures-util = { workspace = true } -async-stream = "0.3.5" -aws-config = { version = "1.0.0", default-features = false, features = ["rt-tokio", "default-https-client"] } -aws-sdk-s3 = "1.3.0" -aws-smithy-types-convert = { version = "0.60.0", features = ["convert-chrono"] } +async-stream = { workspace = true } http = { workspace = true } # Data serialization and deserialization @@ -128,12 +123,12 @@ tower = "0.5.1" tower-http = { version = "0.6.0", features = ["fs", "trace", "timeout", "catch-panic"] } mime = { workspace = true } -tempfile = "3.1.0" +tempfile = { workspace = true } fn-error-context = "0.2.0" # Templating askama = { workspace = true } -walkdir = "2" +walkdir = { workspace = true } phf = "0.13.1" # Date and Time utilities @@ -148,17 +143,15 @@ docs_rs_database = { path = "crates/lib/docs_rs_database", features = ["testing" docs_rs_fastly = { path = "crates/lib/docs_rs_fastly", features = ["testing"] } docs_rs_headers = { path = "crates/lib/docs_rs_headers", features = ["testing"] } docs_rs_opentelemetry = { path = "crates/lib/docs_rs_opentelemetry", features = ["testing"] } +docs_rs_storage = { path = "crates/lib/docs_rs_storage", features = ["testing"] } docs_rs_types = { path = "crates/lib/docs_rs_types", features = ["testing"] } -criterion = "0.8.0" kuchikiki = "0.8" http-body-util = "0.1.0" -rand = "0.9" +rand = { workspace = true } mockito = { workspace = true } test-case = { workspace = true } tower = { version = "0.5.1", features = ["util"] } opentelemetry_sdk = { version = "0.31.0", features = ["rt-tokio", "testing"] } -aws-smithy-types = "1.0.1" -aws-smithy-runtime = {version = "1.0.1", features = ["client", "test-util"]} indoc = "2.0.0" pretty_assertions = "1.4.0" @@ -174,10 +167,6 @@ syntect = { version = "5.0.0", default-features = false, features = ["parsing", [package.metadata.cargo-machete] ignored = ["phf"] -[[bench]] 
-name = "compression" -harness = false - [[bin]] name = "cratesfyi" test = false diff --git a/crates/lib/docs_rs_database/src/config.rs b/crates/lib/docs_rs_database/src/config.rs index c507e6708..1d11f1166 100644 --- a/crates/lib/docs_rs_database/src/config.rs +++ b/crates/lib/docs_rs_database/src/config.rs @@ -15,4 +15,15 @@ impl Config { min_pool_idle: env("DOCSRS_MIN_POOL_IDLE", 10u32)?, }) } + + #[cfg(any(feature = "testing", test))] + pub fn test_config() -> anyhow::Result { + let mut config = Self::from_environment()?; + + // Use less connections for each test compared to production. + config.max_pool_size = 8; + config.min_pool_idle = 2; + + Ok(config) + } } diff --git a/crates/lib/docs_rs_fastly/src/cdn/real.rs b/crates/lib/docs_rs_fastly/src/cdn/real.rs index 477e3e7f8..c39acdfaa 100644 --- a/crates/lib/docs_rs_fastly/src/cdn/real.rs +++ b/crates/lib/docs_rs_fastly/src/cdn/real.rs @@ -170,7 +170,7 @@ impl CdnBehaviour for RealCdn { #[cfg(test)] mod tests { use super::*; - use docs_rs_opentelemetry::testing::setup_test_meter_provider; + use docs_rs_opentelemetry::testing::TestMetrics; use std::str::FromStr as _; #[tokio::test] @@ -191,9 +191,8 @@ mod tests { .create_async() .await; - let (_exporter, meter_provider) = setup_test_meter_provider(); - - let cdn = RealCdn::from_config(&config, &meter_provider)?; + let test_metrics = TestMetrics::new(); + let cdn = RealCdn::from_config(&config, test_metrics.provider())?; cdn.purge_surrogate_keys(vec![ SurrogateKey::from_str("crate-foo").unwrap(), @@ -224,8 +223,8 @@ mod tests { .create_async() .await; - let (_exporter, meter_provider) = setup_test_meter_provider(); - let cdn = RealCdn::from_config(&config, &meter_provider)?; + let test_metrics = TestMetrics::new(); + let cdn = RealCdn::from_config(&config, test_metrics.provider())?; assert!( cdn.purge_surrogate_keys(vec![ @@ -275,8 +274,8 @@ mod tests { .create_async() .await; - let (_exporter, meter_provider) = setup_test_meter_provider(); - let cdn = RealCdn::from_config(&config, &meter_provider)?; + let test_metrics = TestMetrics::new(); + let cdn = RealCdn::from_config(&config, test_metrics.provider())?; let keys: Vec<_> = (0..350) .map(|n| SurrogateKey::from_str(&format!("crate-foo-{n}")).unwrap()) diff --git a/crates/lib/docs_rs_logging/Cargo.toml b/crates/lib/docs_rs_logging/Cargo.toml index 834c6da55..6dbb38ac0 100644 --- a/crates/lib/docs_rs_logging/Cargo.toml +++ b/crates/lib/docs_rs_logging/Cargo.toml @@ -9,3 +9,6 @@ docs_rs_utils = { path = "../docs_rs_utils" } sentry = { workspace = true } tracing = { workspace = true } tracing-subscriber = { version = "0.3.20", default-features = false, features = ["ansi", "fmt", "json", "env-filter", "tracing-log"] } + +[features] +testing = [] diff --git a/crates/lib/docs_rs_logging/src/lib.rs b/crates/lib/docs_rs_logging/src/lib.rs index f2607f517..8f43de126 100644 --- a/crates/lib/docs_rs_logging/src/lib.rs +++ b/crates/lib/docs_rs_logging/src/lib.rs @@ -1,3 +1,6 @@ +#[cfg(feature = "testing")] +pub mod testing; + use sentry::{ TransactionContext, integrations::panic as sentry_panic, integrations::tracing as sentry_tracing, diff --git a/crates/lib/docs_rs_logging/src/testing.rs b/crates/lib/docs_rs_logging/src/testing.rs new file mode 100644 index 000000000..3dd44a20d --- /dev/null +++ b/crates/lib/docs_rs_logging/src/testing.rs @@ -0,0 +1,15 @@ +use std::str::FromStr as _; +use tracing_subscriber::{EnvFilter, filter::Directive}; + +pub fn init() { + let subscriber = tracing_subscriber::FmtSubscriber::builder() + .with_env_filter( + 
EnvFilter::builder() + .with_default_directive(Directive::from_str("docs_rs=info").unwrap()) + .with_env_var("DOCSRS_LOG") + .from_env_lossy(), + ) + .with_test_writer() + .finish(); + let _ = tracing::subscriber::set_global_default(subscriber); +} diff --git a/crates/lib/docs_rs_mimes/src/detect.rs b/crates/lib/docs_rs_mimes/src/detect.rs index 27124e040..7d93e5f06 100644 --- a/crates/lib/docs_rs_mimes/src/detect.rs +++ b/crates/lib/docs_rs_mimes/src/detect.rs @@ -49,4 +49,24 @@ mod tests { fn test_detect_mime(ext: &str, expected: &Mime) { assert_eq!(&detect_mime(format!("something.{ext}")), expected); } + + #[test] + fn test_mime_types() { + check_mime(".gitignore", "text/plain"); + check_mime("hello.toml", "text/toml"); + check_mime("hello.css", "text/css"); + check_mime("hello.js", "text/javascript"); + check_mime("hello.html", "text/html"); + check_mime("hello.hello.md", "text/markdown"); + check_mime("hello.markdown", "text/markdown"); + check_mime("hello.json", "application/json"); + check_mime("hello.txt", "text/plain"); + check_mime("file.rs", "text/rust"); + check_mime("important.svg", "image/svg+xml"); + } + + fn check_mime(path: &str, expected_mime: &str) { + let detected_mime = detect_mime(Path::new(&path)); + assert_eq!(detected_mime, expected_mime); + } } diff --git a/crates/lib/docs_rs_opentelemetry/src/testing.rs b/crates/lib/docs_rs_opentelemetry/src/testing/collected_metrics.rs similarity index 76% rename from crates/lib/docs_rs_opentelemetry/src/testing.rs rename to crates/lib/docs_rs_opentelemetry/src/testing/collected_metrics.rs index 0fc259420..ffd36e866 100644 --- a/crates/lib/docs_rs_opentelemetry/src/testing.rs +++ b/crates/lib/docs_rs_opentelemetry/src/testing/collected_metrics.rs @@ -1,30 +1,9 @@ -use std::sync::Arc; - -use crate::AnyMeterProvider; use anyhow::{Result, anyhow}; use derive_more::Deref; -use opentelemetry_sdk::metrics::{ - InMemoryMetricExporter, PeriodicReader, - data::{ - AggregatedMetrics, HistogramDataPoint, Metric, MetricData, ResourceMetrics, SumDataPoint, - }, +use opentelemetry_sdk::metrics::data::{ + AggregatedMetrics, HistogramDataPoint, Metric, MetricData, ResourceMetrics, SumDataPoint, }; -/// set up a standalone InMemoryMetricExporter and MeterProvider for testing purposes. -/// For when you want to collect metrics, and then inspect what was collected. -pub fn setup_test_meter_provider() -> (InMemoryMetricExporter, AnyMeterProvider) { - let metric_exporter = InMemoryMetricExporter::default(); - - ( - metric_exporter.clone(), - Arc::new( - opentelemetry_sdk::metrics::SdkMeterProvider::builder() - .with_reader(PeriodicReader::builder(metric_exporter.clone()).build()) - .build(), - ), - ) -} - /// small wrapper around the collected result of the InMemoryMetricExporter. /// For convenience in tests. 
#[derive(Debug)] diff --git a/crates/lib/docs_rs_opentelemetry/src/testing/mod.rs b/crates/lib/docs_rs_opentelemetry/src/testing/mod.rs new file mode 100644 index 000000000..8fe8746cf --- /dev/null +++ b/crates/lib/docs_rs_opentelemetry/src/testing/mod.rs @@ -0,0 +1,5 @@ +mod collected_metrics; +mod test_env; + +pub use collected_metrics::CollectedMetrics; +pub use test_env::TestMetrics; diff --git a/crates/lib/docs_rs_opentelemetry/src/testing/test_env.rs b/crates/lib/docs_rs_opentelemetry/src/testing/test_env.rs new file mode 100644 index 000000000..ec7a7ded1 --- /dev/null +++ b/crates/lib/docs_rs_opentelemetry/src/testing/test_env.rs @@ -0,0 +1,39 @@ +use crate::{AnyMeterProvider, testing::collected_metrics::CollectedMetrics}; +use opentelemetry_sdk::metrics::{InMemoryMetricExporter, PeriodicReader}; +use std::sync::Arc; + +/// A test metrics environment that collects metrics in memory. +pub struct TestMetrics { + exporter: InMemoryMetricExporter, + provider: AnyMeterProvider, +} + +impl TestMetrics { + pub fn new() -> Self { + let metric_exporter = InMemoryMetricExporter::default(); + + Self { + exporter: metric_exporter.clone(), + provider: Arc::new( + opentelemetry_sdk::metrics::SdkMeterProvider::builder() + .with_reader(PeriodicReader::builder(metric_exporter.clone()).build()) + .build(), + ), + } + } + + pub fn collected_metrics(&self) -> CollectedMetrics { + self.provider.force_flush().unwrap(); + CollectedMetrics(self.exporter.get_finished_metrics().unwrap()) + } + + pub fn provider(&self) -> &AnyMeterProvider { + &self.provider + } +} + +impl Default for TestMetrics { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/lib/docs_rs_storage/Cargo.toml b/crates/lib/docs_rs_storage/Cargo.toml new file mode 100644 index 000000000..ca37ecb68 --- /dev/null +++ b/crates/lib/docs_rs_storage/Cargo.toml @@ -0,0 +1,59 @@ +[package] +name = "docs_rs_storage" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = { workspace = true } +async-compression = { version = "0.4.32", features = ["tokio", "bzip2", "zstd", "gzip"] } +async-stream = { workspace = true } +aws-config = { version = "1.0.0", default-features = false, features = ["rt-tokio", "default-https-client"] } +aws-sdk-s3 = "1.3.0" +aws-smithy-types-convert = { version = "0.60.0", features = ["convert-chrono"] } +bzip2 = "0.6.0" +chrono = { workspace = true } +dashmap = "6.0.0" +docs_rs_env_vars = { path = "../docs_rs_env_vars" } +docs_rs_headers = { path = "../docs_rs_headers" } +docs_rs_logging = { path = "../docs_rs_logging", optional = true } +docs_rs_mimes = { path = "../docs_rs_mimes" } +docs_rs_opentelemetry = { path = "../docs_rs_opentelemetry" } +docs_rs_types = { path = "../docs_rs_types" } +docs_rs_utils = { path = "../docs_rs_utils" } +flate2 = "1.1.1" +futures-util = { workspace = true } +http = { workspace = true } +itertools = { workspace = true } +mime = { workspace = true } +opentelemetry = { workspace = true } +rand = { workspace = true, optional = true } +serde = { workspace = true } +serde_json = { workspace = true } +sqlx = { workspace = true } # for sqlite +strum = { workspace = true } +tempfile = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +walkdir = { workspace = true } +zip = {version = "6.0.0", default-features = false, features = ["bzip2"]} +zstd = "0.13.0" + +[dev-dependencies] +criterion = "0.8.0" +docs_rs_logging = { path = "../docs_rs_logging", features = ["testing"] } +docs_rs_opentelemetry = { path = 
"../docs_rs_opentelemetry", features = ["testing"] } +rand = { workspace = true } +test-case = { workspace = true } + +[features] +testing = [ + "dep:rand", + "dep:docs_rs_logging", + "docs_rs_logging/testing", + "docs_rs_opentelemetry/testing", +] + +[[bench]] +name = "compression" +harness = false diff --git a/benches/compression.rs b/crates/lib/docs_rs_storage/benches/compression.rs similarity index 96% rename from benches/compression.rs rename to crates/lib/docs_rs_storage/benches/compression.rs index 2514d6806..d8b2c9a15 100644 --- a/benches/compression.rs +++ b/crates/lib/docs_rs_storage/benches/compression.rs @@ -1,5 +1,5 @@ use criterion::{Criterion, Throughput, criterion_group, criterion_main}; -use docs_rs::storage::{CompressionAlgorithm, compress, decompress}; +use docs_rs_storage::{CompressionAlgorithm, compress, decompress}; use std::hint::black_box; pub fn regex_capture_matches(c: &mut Criterion) { diff --git a/benches/struct.CaptureMatches.html b/crates/lib/docs_rs_storage/benches/struct.CaptureMatches.html similarity index 100% rename from benches/struct.CaptureMatches.html rename to crates/lib/docs_rs_storage/benches/struct.CaptureMatches.html diff --git a/src/storage/archive_index.rs b/crates/lib/docs_rs_storage/src/archive_index.rs similarity index 97% rename from src/storage/archive_index.rs rename to crates/lib/docs_rs_storage/src/archive_index.rs index 498a2ec7a..cf07ca536 100644 --- a/src/storage/archive_index.rs +++ b/crates/lib/docs_rs_storage/src/archive_index.rs @@ -1,13 +1,12 @@ -use crate::{ - error::Result, - storage::{FileRange, compression::CompressionAlgorithm}, -}; -use anyhow::{Context as _, bail}; +use crate::{compression::CompressionAlgorithm, types::FileRange}; +use anyhow::{Context as _, Result, bail}; use itertools::Itertools as _; use sqlx::{Acquire as _, QueryBuilder, Row as _, Sqlite}; use std::{fs, io, path::Path}; use tracing::instrument; +pub(crate) const ARCHIVE_INDEX_FILE_EXTENSION: &str = "index"; + #[derive(PartialEq, Eq, Debug)] pub(crate) struct FileInfo { range: FileRange, diff --git a/crates/lib/docs_rs_storage/src/backends/memory.rs b/crates/lib/docs_rs_storage/src/backends/memory.rs new file mode 100644 index 000000000..385cd7f3f --- /dev/null +++ b/crates/lib/docs_rs_storage/src/backends/memory.rs @@ -0,0 +1,83 @@ +use crate::{ + Blob, + backends::StorageBackendMethods, + blob::{BlobUpload, StreamingBlob}, + errors::PathNotFoundError, + metrics::StorageMetrics, + types::FileRange, +}; +use anyhow::{Result, anyhow}; +use dashmap::DashMap; +use docs_rs_headers::compute_etag; +use futures_util::stream::{self, BoxStream}; +use itertools::Itertools as _; + +pub(crate) struct MemoryBackend { + otel_metrics: StorageMetrics, + objects: DashMap, +} + +impl MemoryBackend { + pub(crate) fn new(otel_metrics: StorageMetrics) -> Self { + Self { + otel_metrics, + objects: DashMap::new(), + } + } +} + +impl StorageBackendMethods for MemoryBackend { + async fn exists(&self, path: &str) -> Result { + Ok(self.objects.contains_key(path)) + } + + async fn get_stream(&self, path: &str, range: Option) -> Result { + let mut blob = self.objects.get(path).ok_or(PathNotFoundError)?.clone(); + debug_assert!(blob.etag.is_some()); + + if let Some(r) = range { + blob.content = blob + .content + .get(*r.start() as usize..=*r.end() as usize) + .ok_or_else(|| anyhow!("invalid range"))? 
+ .to_vec(); + blob.etag = Some(compute_etag(&blob.content)); + } + Ok(blob.into()) + } + + async fn store_batch(&self, batch: Vec) -> Result<()> { + self.otel_metrics + .uploaded_files + .add(batch.len() as u64, &[]); + + for upload in batch { + let blob: Blob = upload.into(); + self.objects.insert(blob.path.clone(), blob); + } + + Ok(()) + } + + async fn list_prefix<'a>(&'a self, prefix: &'a str) -> BoxStream<'a, Result> { + Box::pin(stream::iter( + self.objects + .iter() + .filter_map(move |entry| { + let key = entry.key(); + if key.starts_with(prefix) { + Some(key.clone()) + } else { + None + } + }) + .sorted_unstable() + .map(Ok), + )) + } + + async fn delete_prefix(&self, prefix: &str) -> Result<()> { + self.objects.retain(|key, _| !key.starts_with(prefix)); + Ok(()) + } +} diff --git a/crates/lib/docs_rs_storage/src/backends/mod.rs b/crates/lib/docs_rs_storage/src/backends/mod.rs new file mode 100644 index 000000000..a257f8de3 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/backends/mod.rs @@ -0,0 +1,53 @@ +#[cfg(any(test, feature = "testing"))] +pub(crate) mod memory; +pub(crate) mod s3; + +use crate::{BlobUpload, StreamingBlob, types::FileRange}; +use anyhow::Result; +use futures_util::stream::BoxStream; + +pub(crate) trait StorageBackendMethods { + async fn exists(&self, path: &str) -> Result; + async fn get_stream(&self, path: &str, range: Option) -> Result; + async fn store_batch(&self, batch: Vec) -> Result<()>; + async fn list_prefix<'a>(&'a self, prefix: &'a str) -> BoxStream<'a, Result>; + async fn delete_prefix(&self, prefix: &str) -> Result<()>; +} + +pub(crate) enum StorageBackend { + #[cfg(any(test, feature = "testing"))] + Memory(memory::MemoryBackend), + S3(s3::S3Backend), +} + +macro_rules! call_inner { + ($self:expr, $method:ident ( $($args:expr),* $(,)? 
)) => {{ + match $self { + #[cfg(any(test, feature = "testing"))] + StorageBackend::Memory(backend) => backend.$method($($args),*).await, + StorageBackend::S3(backend) => backend.$method($($args),*).await, + } + }}; +} + +impl StorageBackendMethods for StorageBackend { + async fn exists(&self, path: &str) -> Result { + call_inner!(self, exists(path)) + } + + async fn get_stream(&self, path: &str, range: Option) -> Result { + call_inner!(self, get_stream(path, range)) + } + + async fn store_batch(&self, batch: Vec) -> Result<()> { + call_inner!(self, store_batch(batch)) + } + + async fn list_prefix<'a>(&'a self, prefix: &'a str) -> BoxStream<'a, Result> { + call_inner!(self, list_prefix(prefix)) + } + + async fn delete_prefix(&self, prefix: &str) -> Result<()> { + call_inner!(self, delete_prefix(prefix)) + } +} diff --git a/src/storage/s3.rs b/crates/lib/docs_rs_storage/src/backends/s3.rs similarity index 82% rename from src/storage/s3.rs rename to crates/lib/docs_rs_storage/src/backends/s3.rs index 84f4dc8d2..93f7a6c67 100644 --- a/src/storage/s3.rs +++ b/crates/lib/docs_rs_storage/src/backends/s3.rs @@ -1,5 +1,11 @@ -use super::{BlobUpload, FileRange, StorageMetrics, StreamingBlob}; -use crate::Config; +use crate::{ + Config, + backends::StorageBackendMethods, + blob::{BlobUpload, StreamingBlob}, + errors::PathNotFoundError, + metrics::StorageMetrics, + types::FileRange, +}; use anyhow::{Context as _, Error}; use async_stream::try_stream; use aws_config::BehaviorVersion; @@ -10,15 +16,13 @@ use aws_sdk_s3::{ types::{Delete, ObjectIdentifier}, }; use aws_smithy_types_convert::date_time::DateTimeExt; -use axum_extra::headers; use chrono::Utc; -use docs_rs_headers::compute_etag; +use docs_rs_headers::{ETag, compute_etag}; use futures_util::{ future::TryFutureExt, - pin_mut, - stream::{FuturesUnordered, Stream, StreamExt}, + stream::{BoxStream, FuturesUnordered, StreamExt}, }; -use tracing::{error, instrument, warn}; +use tracing::{error, warn}; // error codes to check for when trying to determine if an error is // a "NOT FOUND" error. @@ -52,13 +56,13 @@ where if let Some(err_code) = err.code() && NOT_FOUND_ERROR_CODES.contains(&err_code) { - return Err(super::PathNotFoundError.into()); + return Err(PathNotFoundError.into()); } if let SdkError::ServiceError(err) = &err && err.raw().status().as_u16() == http::StatusCode::NOT_FOUND.as_u16() { - return Err(super::PathNotFoundError.into()); + return Err(PathNotFoundError.into()); } Err(err.into()) @@ -67,16 +71,16 @@ where } } -pub(super) struct S3Backend { +pub(crate) struct S3Backend { client: Client, bucket: String, otel_metrics: StorageMetrics, - #[cfg(test)] + #[cfg(any(test, feature = "testing"))] temporary: bool, } impl S3Backend { - pub(super) async fn new(config: &Config, otel_metrics: StorageMetrics) -> Result { + pub(crate) async fn new(config: &Config, otel_metrics: StorageMetrics) -> Result { let shared_config = aws_config::load_defaults(BehaviorVersion::latest()).await; let mut config_builder = aws_sdk_s3::config::Builder::from(&shared_config) .retry_config(RetryConfig::standard().with_max_attempts(config.aws_sdk_max_retries)) @@ -88,14 +92,10 @@ impl S3Backend { let client = Client::from_conf(config_builder.build()); - #[cfg(test)] + #[cfg(any(test, feature = "testing"))] { // Create the temporary S3 bucket during tests. 
if config.s3_bucket_is_temporary { - if cfg!(not(test)) { - panic!("safeguard to prevent creating temporary buckets outside of tests"); - } - client .create_bucket() .bucket(&config.s3_bucket) @@ -108,12 +108,31 @@ impl S3Backend { client, otel_metrics, bucket: config.s3_bucket.clone(), - #[cfg(test)] + #[cfg(any(test, feature = "testing"))] temporary: config.s3_bucket_is_temporary, }) } - pub(super) async fn exists(&self, path: &str) -> Result { + #[cfg(any(test, feature = "testing"))] + pub(crate) async fn cleanup_after_test(&self) -> Result<(), Error> { + assert!( + self.temporary, + "cleanup_after_test called on non-temporary S3 backend" + ); + + self.delete_prefix("").await?; + self.client + .delete_bucket() + .bucket(&self.bucket) + .send() + .await?; + + Ok(()) + } +} + +impl StorageBackendMethods for S3Backend { + async fn exists(&self, path: &str) -> Result { match self .client .head_object() @@ -124,13 +143,12 @@ impl S3Backend { .convert_errors() { Ok(_) => Ok(true), - Err(err) if err.is::() => Ok(false), + Err(err) if err.is::() => Ok(false), Err(other) => Err(other), } } - #[instrument(skip(self))] - pub(super) async fn get_stream( + async fn get_stream( &self, path: &str, range: Option, @@ -181,7 +199,7 @@ impl S3Backend { range.end() ))) } else { - match s3_etag.parse::() { + match s3_etag.parse::() { Ok(etag) => Some(etag), Err(err) => { error!(?err, s3_etag, "Failed to parse ETag from S3"); @@ -212,7 +230,7 @@ impl S3Backend { }) } - pub(super) async fn store_batch(&self, mut batch: Vec) -> Result<(), Error> { + async fn store_batch(&self, mut batch: Vec) -> Result<(), Error> { // Attempt to upload the batch 3 times for _ in 0..3 { let mut futures = FuturesUnordered::new(); @@ -230,7 +248,7 @@ impl S3Backend { self.otel_metrics.uploaded_files.add(1, &[]); }) .map_err(|err| { - warn!("Failed to upload blob to S3: {:?}", err); + warn!(?err, "Failed to upload blob to S3"); // Reintroduce failed blobs for a retry blob }), @@ -253,11 +271,8 @@ impl S3Backend { panic!("failed to upload 3 times, exiting"); } - pub(super) async fn list_prefix<'a>( - &'a self, - prefix: &'a str, - ) -> impl Stream> + 'a { - try_stream! { + async fn list_prefix<'a>(&'a self, prefix: &'a str) -> BoxStream<'a, Result> { + Box::pin(try_stream! { let mut continuation_token = None; loop { let list = self @@ -282,12 +297,11 @@ impl S3Backend { break; } } - } + }) } - pub(super) async fn delete_prefix(&self, prefix: &str) -> Result<(), Error> { + async fn delete_prefix(&self, prefix: &str) -> Result<(), Error> { let stream = self.list_prefix(prefix).await; - pin_mut!(stream); let mut chunks = stream.chunks(900); // 1000 is the limit for the delete_objects API while let Some(batch) = chunks.next().await { @@ -321,34 +335,4 @@ impl S3Backend { } Ok(()) } - - #[cfg(test)] - pub(super) async fn cleanup_after_test(&self) -> Result<(), Error> { - if !self.temporary { - return Ok(()); - } - - if cfg!(not(test)) { - panic!("safeguard to prevent deleting the production bucket"); - } - - self.delete_prefix("").await?; - self.client - .delete_bucket() - .bucket(&self.bucket) - .send() - .await?; - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - // The tests for this module are in src/storage/mod.rs, as part of the backend tests. Please - // add any test checking the public interface there. - - // NOTE: trying to upload a file ending with `/` will behave differently in test and prod. - // NOTE: On s3, it will succeed and create a file called `/`. 
-    // NOTE: On min.io, it will fail with 'Object name contains unsupported characters.'
 }
diff --git a/crates/lib/docs_rs_storage/src/blob.rs b/crates/lib/docs_rs_storage/src/blob.rs
new file mode 100644
index 000000000..bd405467b
--- /dev/null
+++ b/crates/lib/docs_rs_storage/src/blob.rs
@@ -0,0 +1,259 @@
+use crate::{
+    compression::{CompressionAlgorithm, wrap_reader_for_decompression},
+    utils::sized_buffer::SizedBuffer,
+};
+use anyhow::Result;
+use chrono::{DateTime, Utc};
+use docs_rs_headers::{ETag, compute_etag};
+use mime::Mime;
+use std::io;
+use tokio::io::{AsyncBufRead, AsyncBufReadExt};
+
+/// represents a blob to be uploaded to storage.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct BlobUpload {
+    pub path: String,
+    pub mime: Mime,
+    pub content: Vec<u8>,
+    pub compression: Option<CompressionAlgorithm>,
+}
+
+impl From<Blob> for BlobUpload {
+    fn from(value: Blob) -> Self {
+        Self {
+            path: value.path,
+            mime: value.mime,
+            content: value.content,
+            compression: value.compression,
+        }
+    }
+}
+
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Blob {
+    pub path: String,
+    pub mime: Mime,
+    pub date_updated: DateTime<Utc>,
+    pub etag: Option<ETag>,
+    pub content: Vec<u8>,
+    pub compression: Option<CompressionAlgorithm>,
+}
+
+impl From<BlobUpload> for Blob {
+    fn from(value: BlobUpload) -> Self {
+        Self {
+            path: value.path,
+            mime: value.mime,
+            date_updated: Utc::now(),
+            etag: compute_etag(&value.content).into(),
+            content: value.content,
+            compression: value.compression,
+        }
+    }
+}
+
+pub struct StreamingBlob {
+    pub path: String,
+    pub mime: Mime,
+    pub date_updated: DateTime<Utc>,
+    pub etag: Option<ETag>,
+    pub compression: Option<CompressionAlgorithm>,
+    pub content_length: usize,
+    pub content: Box<dyn AsyncBufRead + Unpin + Send>,
+}
+
+impl std::fmt::Debug for StreamingBlob {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("StreamingBlob")
+            .field("path", &self.path)
+            .field("mime", &self.mime)
+            .field("date_updated", &self.date_updated)
+            .field("etag", &self.etag)
+            .field("compression", &self.compression)
+            .finish()
+    }
+}
+
+impl StreamingBlob {
+    /// wrap the content stream in a streaming decompressor according to the
+    /// algorithm found in the `compression` attribute.
+    pub async fn decompress(mut self) -> Result<Self> {
+        let Some(alg) = self.compression else {
+            return Ok(self);
+        };
+
+        self.content = wrap_reader_for_decompression(self.content, alg);
+
+        // We fill the first bytes here to force the decompressor to start decompressing.
+        // This is because we want a failure here in this method when the data is corrupted,
+        // so we can act on it directly, and users don't hit errors when they just
+        // stream the data.
+        // This won't _consume_ the bytes. The user of this StreamingBlob will still be able
+        // to stream the whole content.
+        //
+        // This doesn't work 100% of the time. We might get other i/o errors here,
+        // or the decompressor might stumble on corrupted data later during streaming.
+        //
+        // But: the most common error is that the format "magic bytes" at the beginning
+        // of the stream are missing, and that's caught here.
+        let decompressed_buf = self.content.fill_buf().await?;
+        debug_assert!(
+            !decompressed_buf.is_empty(),
+            "we assume that if we have > 0 decompressed bytes, the start of the decompression works."
+        );
+
+        self.compression = None;
+        // not touching the etag, it should represent the original content
+        Ok(self)
+    }
+
+    /// consume the inner stream and materialize the full blob into memory.
+ pub async fn materialize(mut self, max_size: usize) -> Result { + let mut content = SizedBuffer::new(max_size); + content.reserve(self.content_length); + + tokio::io::copy(&mut self.content, &mut content).await?; + + Ok(Blob { + path: self.path, + mime: self.mime, + date_updated: self.date_updated, + etag: self.etag, // downloading doesn't change the etag + content: content.into_inner(), + compression: self.compression, + }) + } +} + +impl From for StreamingBlob { + fn from(value: Blob) -> Self { + Self { + path: value.path, + mime: value.mime, + date_updated: value.date_updated, + etag: value.etag, + compression: value.compression, + content_length: value.content.len(), + content: Box::new(io::Cursor::new(value.content)), + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::compress_async; + use docs_rs_headers::compute_etag; + + const ZSTD_EOF_BYTES: [u8; 3] = [0x01, 0x00, 0x00]; + + fn streaming_blob( + content: impl Into>, + alg: Option, + ) -> StreamingBlob { + let content = content.into(); + StreamingBlob { + path: "some_path.db".into(), + mime: mime::APPLICATION_OCTET_STREAM, + date_updated: Utc::now(), + compression: alg, + etag: Some(compute_etag(&content)), + content_length: content.len(), + content: Box::new(io::Cursor::new(content)), + } + } + + #[tokio::test] + async fn test_streaming_blob_uncompressed() -> Result<()> { + const CONTENT: &[u8] = b"Hello, world!"; + + // without decompression + { + let stream = streaming_blob(CONTENT, None); + let blob = stream.materialize(usize::MAX).await?; + assert_eq!(blob.content, CONTENT); + assert!(blob.compression.is_none()); + } + + // with decompression, does nothing + { + let stream = streaming_blob(CONTENT, None); + let blob = stream.decompress().await?.materialize(usize::MAX).await?; + assert_eq!(blob.content, CONTENT); + assert!(blob.compression.is_none()); + } + + Ok(()) + } + + #[tokio::test] + async fn test_streaming_broken_zstd_blob() -> Result<()> { + const NOT_ZSTD: &[u8] = b"Hello, world!"; + let alg = CompressionAlgorithm::Zstd; + + // without decompression + // Doesn't fail because we don't call `.decompress` + { + let stream = streaming_blob(NOT_ZSTD, Some(alg)); + let blob = stream.materialize(usize::MAX).await?; + assert_eq!(blob.content, NOT_ZSTD); + assert_eq!(blob.compression, Some(alg)); + } + + // with decompression + // should fail in the `.decompress` call, + // not later when materializing / streaming. + { + let err = streaming_blob(NOT_ZSTD, Some(alg)) + .decompress() + .await + .unwrap_err(); + + assert_eq!(err.kind(), io::ErrorKind::Other); + + assert_eq!( + err.to_string(), + "Unknown frame descriptor", + "unexpected error: {}", + err + ); + } + + Ok(()) + } + + #[tokio::test] + async fn test_streaming_blob_zstd() -> Result<()> { + const CONTENT: &[u8] = b"Hello, world!"; + let mut compressed_content = Vec::new(); + let alg = CompressionAlgorithm::Zstd; + compress_async( + &mut io::Cursor::new(CONTENT.to_vec()), + &mut compressed_content, + alg, + ) + .await?; + + // without decompression + { + let stream = streaming_blob(compressed_content.clone(), Some(alg)); + let blob = stream.materialize(usize::MAX).await?; + assert_eq!(blob.content, compressed_content); + assert_eq!(blob.content.last_chunk::<3>().unwrap(), &ZSTD_EOF_BYTES); + assert_eq!(blob.compression, Some(alg)); + } + + // with decompression + { + let blob = streaming_blob(compressed_content.clone(), Some(alg)) + .decompress() + .await? 
+ .materialize(usize::MAX) + .await?; + assert_eq!(blob.content, CONTENT); + assert!(blob.compression.is_none()); + } + + Ok(()) + } +} diff --git a/src/storage/compression.rs b/crates/lib/docs_rs_storage/src/compression.rs similarity index 76% rename from src/storage/compression.rs rename to crates/lib/docs_rs_storage/src/compression.rs index 8070d7bcd..d00f62a83 100644 --- a/src/storage/compression.rs +++ b/crates/lib/docs_rs_storage/src/compression.rs @@ -53,7 +53,7 @@ impl std::convert::TryFrom for CompressionAlgorithm { } } -pub(crate) fn file_extension_for(algorithm: CompressionAlgorithm) -> &'static str { +pub fn file_extension_for(algorithm: CompressionAlgorithm) -> &'static str { match algorithm { CompressionAlgorithm::Zstd => "zst", CompressionAlgorithm::Bzip2 => "bz2", @@ -61,7 +61,7 @@ pub(crate) fn file_extension_for(algorithm: CompressionAlgorithm) -> &'static st } } -pub(crate) fn compression_from_file_extension(ext: &str) -> Option { +pub fn compression_from_file_extension(ext: &str) -> Option { match ext { "zst" => Some(CompressionAlgorithm::Zstd), "bz2" => Some(CompressionAlgorithm::Bzip2), @@ -170,6 +170,9 @@ pub fn decompress( #[cfg(test)] mod tests { use super::*; + use crate::{StreamingBlob, errors::SizeLimitReached}; + use anyhow::Result; + use chrono::Utc; use strum::IntoEnumIterator; use test_case::test_case; @@ -219,7 +222,7 @@ mod tests { assert!( err.downcast_ref::() .and_then(|io| io.get_ref()) - .and_then(|err| err.downcast_ref::()) + .and_then(|err| err.downcast_ref::()) .is_some() ); } @@ -239,4 +242,62 @@ mod tests { assert_eq!(file_extension_for(alg), expected); assert_eq!(compression_from_file_extension(expected), Some(alg)); } + + #[tokio::test] + #[test_case(CompressionAlgorithm::Zstd)] + #[test_case(CompressionAlgorithm::Bzip2)] + #[test_case(CompressionAlgorithm::Gzip)] + async fn test_async_compression(alg: CompressionAlgorithm) -> Result<()> { + const CONTENT: &[u8] = b"Hello, world! Hello, world! Hello, world! Hello, world!"; + + let compressed_index_content = { + let mut buf: Vec = Vec::new(); + compress_async(&mut io::Cursor::new(CONTENT.to_vec()), &mut buf, alg).await?; + buf + }; + + { + // try low-level async decompression + let mut decompressed_buf: Vec = Vec::new(); + let mut reader = wrap_reader_for_decompression( + io::Cursor::new(compressed_index_content.clone()), + alg, + ); + + tokio::io::copy(&mut reader, &mut io::Cursor::new(&mut decompressed_buf)).await?; + + assert_eq!(decompressed_buf, CONTENT); + } + + { + // try sync decompression + let decompressed_buf: Vec = decompress( + io::Cursor::new(compressed_index_content.clone()), + alg, + usize::MAX, + )?; + + assert_eq!(decompressed_buf, CONTENT); + } + + // try decompress via storage API + let blob = StreamingBlob { + path: "some_path.db".into(), + mime: mime::APPLICATION_OCTET_STREAM, + date_updated: Utc::now(), + etag: None, + compression: Some(alg), + content_length: compressed_index_content.len(), + content: Box::new(io::Cursor::new(compressed_index_content)), + } + .decompress() + .await? 
+        .materialize(usize::MAX)
+        .await?;
+
+        assert_eq!(blob.compression, None);
+        assert_eq!(blob.content, CONTENT);
+
+        Ok(())
+    }
 }
diff --git a/crates/lib/docs_rs_storage/src/config.rs b/crates/lib/docs_rs_storage/src/config.rs
new file mode 100644
index 000000000..6a6d7e364
--- /dev/null
+++ b/crates/lib/docs_rs_storage/src/config.rs
@@ -0,0 +1,121 @@
+use crate::types::StorageKind;
+use docs_rs_env_vars::{env, maybe_env, require_env};
+use std::{
+    io,
+    path::{self, Path, PathBuf},
+};
+
+fn ensure_absolute_path(path: PathBuf) -> io::Result<PathBuf> {
+    if path.is_absolute() {
+        Ok(path)
+    } else {
+        Ok(path::absolute(&path)?)
+    }
+}
+
+#[derive(Debug)]
+pub struct Config {
+    pub temp_dir: PathBuf,
+
+    // Storage params
+    pub storage_backend: StorageKind,
+
+    // AWS SDK configuration
+    pub aws_sdk_max_retries: u32,
+
+    // S3 params
+    pub s3_bucket: String,
+    pub s3_region: String,
+    pub s3_endpoint: Option<String>,
+
+    // DO NOT CONFIGURE THIS THROUGH AN ENVIRONMENT VARIABLE!
+    // Accidentally turning this on outside of the test suite might cause data loss in the
+    // production environment.
+    #[cfg(any(test, feature = "testing"))]
+    pub s3_bucket_is_temporary: bool,
+
+    // Max size of the files served by the docs.rs frontend
+    pub max_file_size: usize,
+    pub max_file_size_html: usize,
+
+    // where do we want to store the locally cached index files
+    // for the remote archives?
+    pub local_archive_cache_path: PathBuf,
+
+    // expected number of entries in the local archive cache.
+    // Makes server restarts faster by preallocating some data structures.
+    // General numbers (as of 2025-12):
+    // * we have ~1.5 million releases with archive storage (and 400k without)
+    // * each release has on average 2 archive files (rustdoc, source)
+    // so, overall, ~3 million archive index files in S3.
+    //
+    // While crawlers mean we will download _all_ of them over time, the old
+    // metric "releases accessed in the last 10 minutes" was around 50k, if I
+    // recall correctly.
+    // We're using a local DashMap to store some locks for these indexes,
+    // and we already know in advance we need these 50k entries.
+    // So we can preallocate the DashMap with this number to avoid resizes.
+ pub local_archive_cache_expected_count: usize, +} + +impl Config { + pub fn from_environment() -> anyhow::Result { + let prefix: PathBuf = require_env("DOCSRS_PREFIX")?; + + Ok(Self { + temp_dir: prefix.join("tmp"), + storage_backend: env("DOCSRS_STORAGE_BACKEND", StorageKind::default())?, + aws_sdk_max_retries: env("DOCSRS_AWS_SDK_MAX_RETRIES", 6u32)?, + s3_bucket: env("DOCSRS_S3_BUCKET", "rust-docs-rs".to_string())?, + s3_region: env("S3_REGION", "us-west-1".to_string())?, + s3_endpoint: maybe_env("S3_ENDPOINT")?, + local_archive_cache_path: ensure_absolute_path(env( + "DOCSRS_ARCHIVE_INDEX_CACHE_PATH", + prefix.join("archive_cache"), + )?)?, + local_archive_cache_expected_count: env( + "DOCSRS_ARCHIVE_INDEX_EXPECTED_COUNT", + 100_000usize, + )?, + max_file_size: env("DOCSRS_MAX_FILE_SIZE", 50 * 1024 * 1024)?, + max_file_size_html: env("DOCSRS_MAX_FILE_SIZE_HTML", 50 * 1024 * 1024)?, + #[cfg(any(test, feature = "testing"))] + s3_bucket_is_temporary: false, + }) + } + + pub fn max_file_size_for(&self, path: impl AsRef) -> usize { + static HTML: &str = "html"; + + if let Some(ext) = path.as_ref().extension() + && ext == HTML + { + self.max_file_size_html + } else { + self.max_file_size + } + } + + #[cfg(any(feature = "testing", test))] + pub fn test_config(kind: StorageKind) -> anyhow::Result { + let mut config = Self::from_environment()?; + config.storage_backend = kind; + + config.local_archive_cache_path = + std::env::temp_dir().join(format!("docsrs-test-index-{}", rand::random::())); + + // Use a temporary S3 bucket, only used when storage_kind is set to S3 in env or later. + config.s3_bucket = format!("docsrs-test-bucket-{}", rand::random::()); + config.s3_bucket_is_temporary = true; + + Ok(config) + } + + #[cfg(any(feature = "testing", test))] + pub fn set(self, f: F) -> Self + where + F: FnOnce(Self) -> Self, + { + f(self) + } +} diff --git a/crates/lib/docs_rs_storage/src/errors.rs b/crates/lib/docs_rs_storage/src/errors.rs new file mode 100644 index 000000000..d8413f2a7 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/errors.rs @@ -0,0 +1,7 @@ +#[derive(Debug, Copy, Clone, thiserror::Error)] +#[error("the size limit for the buffer was reached")] +pub struct SizeLimitReached; + +#[derive(Debug, thiserror::Error)] +#[error("path not found")] +pub struct PathNotFoundError; diff --git a/src/db/file.rs b/crates/lib/docs_rs_storage/src/file.rs similarity index 89% rename from src/db/file.rs rename to crates/lib/docs_rs_storage/src/file.rs index fa6fa2f86..d3f790382 100644 --- a/src/db/file.rs +++ b/crates/lib/docs_rs_storage/src/file.rs @@ -7,8 +7,8 @@ //! It's recommended that you use the S3 bucket in production to avoid running out of disk space. //! However, postgres is still available for testing and backwards compatibility. -use crate::error::Result; -use crate::storage::{AsyncStorage, CompressionAlgorithm}; +use crate::{compression::CompressionAlgorithm, storage::non_blocking::AsyncStorage}; +use anyhow::Result; use docs_rs_mimes::detect_mime; use mime::Mime; use serde_json::Value; @@ -19,12 +19,12 @@ use tracing::instrument; /// Used to return metadata about the file. 
#[derive(Debug)] pub struct FileEntry { - pub(crate) path: PathBuf, - pub(crate) size: u64, + pub path: PathBuf, + pub size: u64, } impl FileEntry { - pub(crate) fn mime(&self) -> Mime { + pub fn mime(&self) -> Mime { detect_mime(&self.path) } } @@ -58,7 +58,7 @@ pub async fn add_path_into_remote_archive + std::fmt::Debug>( Ok((file_list, algorithm)) } -pub(crate) fn file_list_to_json(files: impl IntoIterator) -> Value { +pub fn file_list_to_json(files: impl IntoIterator) -> Value { Value::Array( files .into_iter() diff --git a/crates/lib/docs_rs_storage/src/lib.rs b/crates/lib/docs_rs_storage/src/lib.rs new file mode 100644 index 000000000..e71378bf4 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/lib.rs @@ -0,0 +1,29 @@ +mod archive_index; +mod backends; +mod blob; +pub mod compression; +mod config; +pub(crate) mod errors; +mod file; +mod metrics; +pub(crate) mod storage; +#[cfg(any(test, feature = "testing"))] +pub mod testing; +pub(crate) mod types; +pub(crate) mod utils; + +pub use blob::{Blob, BlobUpload, StreamingBlob}; +pub use compression::{ + CompressionAlgorithm, compress, compress_async, compression_from_file_extension, decompress, +}; +pub use config::Config; +pub use errors::{PathNotFoundError, SizeLimitReached}; +pub use file::FileEntry; +pub use file::{add_path_into_database, add_path_into_remote_archive, file_list_to_json}; +pub use storage::blocking::Storage; +pub use storage::non_blocking::AsyncStorage; +pub use types::{RustdocJsonFormatVersion, StorageKind}; +pub use utils::{ + file_list::get_file_list, + storage_path::{rustdoc_archive_path, rustdoc_json_path, source_archive_path}, +}; diff --git a/crates/lib/docs_rs_storage/src/metrics.rs b/crates/lib/docs_rs_storage/src/metrics.rs new file mode 100644 index 000000000..644b2be92 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/metrics.rs @@ -0,0 +1,20 @@ +use docs_rs_opentelemetry::AnyMeterProvider; +use opentelemetry::metrics::Counter; + +#[derive(Debug)] +pub(crate) struct StorageMetrics { + pub(crate) uploaded_files: Counter, +} + +impl StorageMetrics { + pub(crate) fn new(meter_provider: &AnyMeterProvider) -> Self { + let meter = meter_provider.meter("storage"); + const PREFIX: &str = "docsrs.storage"; + Self { + uploaded_files: meter + .u64_counter(format!("{PREFIX}.uploaded_files")) + .with_unit("1") + .build(), + } + } +} diff --git a/crates/lib/docs_rs_storage/src/storage/blocking.rs b/crates/lib/docs_rs_storage/src/storage/blocking.rs new file mode 100644 index 000000000..5d6b9f4ae --- /dev/null +++ b/crates/lib/docs_rs_storage/src/storage/blocking.rs @@ -0,0 +1,182 @@ +use crate::{ + blob::Blob, compression::CompressionAlgorithm, file::FileEntry, + storage::non_blocking::AsyncStorage, types::FileRange, +}; +use anyhow::Result; +use docs_rs_types::{BuildId, Version}; +use std::{fmt, path::Path, sync::Arc}; +use tokio::runtime; + +/// Sync wrapper around `AsyncStorage` for parts of the codebase that are not async. 
+pub struct Storage { + inner: Arc, + runtime: runtime::Handle, +} + +#[allow(dead_code)] +impl Storage { + pub fn new(inner: Arc, runtime: runtime::Handle) -> Self { + Self { inner, runtime } + } + + pub fn exists(&self, path: &str) -> Result { + self.runtime.block_on(self.inner.exists(path)) + } + + pub fn fetch_source_file( + &self, + name: &str, + version: &Version, + latest_build_id: Option, + path: &str, + archive_storage: bool, + ) -> Result { + self.runtime.block_on(self.inner.fetch_source_file( + name, + version, + latest_build_id, + path, + archive_storage, + )) + } + + pub fn rustdoc_file_exists( + &self, + name: &str, + version: &Version, + latest_build_id: Option, + path: &str, + archive_storage: bool, + ) -> Result { + self.runtime.block_on(self.inner.rustdoc_file_exists( + name, + version, + latest_build_id, + path, + archive_storage, + )) + } + + pub fn exists_in_archive( + &self, + archive_path: &str, + latest_build_id: Option, + path: &str, + ) -> Result { + self.runtime.block_on( + self.inner + .exists_in_archive(archive_path, latest_build_id, path), + ) + } + + pub fn get(&self, path: &str, max_size: usize) -> Result { + self.runtime.block_on(self.inner.get(path, max_size)) + } + + pub(crate) fn get_range( + &self, + path: &str, + max_size: usize, + range: FileRange, + compression: Option, + ) -> Result { + self.runtime + .block_on(self.inner.get_range(path, max_size, range, compression)) + } + + pub fn get_from_archive( + &self, + archive_path: &str, + latest_build_id: Option, + path: &str, + max_size: usize, + ) -> Result { + self.runtime.block_on(self.inner.get_from_archive( + archive_path, + latest_build_id, + path, + max_size, + )) + } + + pub fn store_all_in_archive( + &self, + archive_path: &str, + root_dir: &Path, + ) -> Result<(Vec, CompressionAlgorithm)> { + self.runtime + .block_on(self.inner.store_all_in_archive(archive_path, root_dir)) + } + + pub fn store_all( + &self, + prefix: &Path, + root_dir: &Path, + ) -> Result<(Vec, CompressionAlgorithm)> { + self.runtime + .block_on(self.inner.store_all(prefix, root_dir)) + } + + #[cfg(test)] + pub fn store_blobs(&self, blobs: Vec) -> Result<()> { + self.runtime.block_on(self.inner.store_blobs(blobs)) + } + + // Store file into the backend at the given path, uncompressed. + // The path will also be used to determine the mime type. + pub fn store_one_uncompressed( + &self, + path: impl Into + std::fmt::Debug, + content: impl Into>, + ) -> Result<()> { + self.runtime + .block_on(self.inner.store_one_uncompressed(path, content)) + } + + // Store file into the backend at the given path (also used to detect mime type), returns the + // chosen compression algorithm + pub fn store_one( + &self, + path: impl Into + std::fmt::Debug, + content: impl Into>, + ) -> Result { + self.runtime.block_on(self.inner.store_one(path, content)) + } + + // Store file into the backend at the given path (also used to detect mime type), returns the + // chosen compression algorithm + pub fn store_path( + &self, + target_path: impl Into + std::fmt::Debug, + source_path: impl AsRef + std::fmt::Debug, + ) -> Result { + self.runtime + .block_on(self.inner.store_path(target_path, source_path)) + } + + /// sync wrapper for the list_prefix function + /// purely for testing purposes since it collects all files into a Vec. 
+ #[cfg(feature = "testing")] + pub fn list_prefix(&self, prefix: &str) -> impl Iterator> { + use futures_util::stream::StreamExt; + self.runtime + .block_on(async { + self.inner + .list_prefix(prefix) + .await + .collect::>() + .await + }) + .into_iter() + } + + pub fn delete_prefix(&self, prefix: &str) -> Result<()> { + self.runtime.block_on(self.inner.delete_prefix(prefix)) + } +} + +impl std::fmt::Debug for Storage { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "sync wrapper for {:?}", self.inner) + } +} diff --git a/crates/lib/docs_rs_storage/src/storage/mod.rs b/crates/lib/docs_rs_storage/src/storage/mod.rs new file mode 100644 index 000000000..ef1d7b54f --- /dev/null +++ b/crates/lib/docs_rs_storage/src/storage/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod blocking; +pub(crate) mod non_blocking; diff --git a/src/storage/mod.rs b/crates/lib/docs_rs_storage/src/storage/non_blocking.rs similarity index 55% rename from src/storage/mod.rs rename to crates/lib/docs_rs_storage/src/storage/non_blocking.rs index f16943e98..d5cf64b45 100644 --- a/src/storage/mod.rs +++ b/crates/lib/docs_rs_storage/src/storage/non_blocking.rs @@ -1,253 +1,36 @@ -mod archive_index; -pub(crate) mod compression; -mod database; -mod s3; - -pub use self::compression::{CompressionAlgorithm, CompressionAlgorithms, compress, decompress}; - -use self::{ - compression::{compress_async, wrap_reader_for_decompression}, - database::DatabaseBackend, - s3::S3Backend, +#[cfg(any(test, feature = "testing"))] +use crate::backends::memory::MemoryBackend; +use crate::{ + Config, + archive_index::{self, ARCHIVE_INDEX_FILE_EXTENSION}, + backends::{StorageBackend, StorageBackendMethods, s3::S3Backend}, + blob::{Blob, BlobUpload, StreamingBlob}, + compression::{CompressionAlgorithm, compress, compress_async}, + errors::PathNotFoundError, + file::FileEntry, + metrics::StorageMetrics, + types::{FileRange, StorageKind}, + utils::{ + file_list::get_file_list, + storage_path::{rustdoc_archive_path, source_archive_path}, + }, }; -use crate::{Config, db::file::FileEntry, error::Result}; -use axum_extra::headers; -use chrono::{DateTime, Utc}; +use anyhow::Result; use dashmap::DashMap; -use docs_rs_database::Pool; use docs_rs_mimes::{self as mimes, detect_mime}; use docs_rs_opentelemetry::AnyMeterProvider; use docs_rs_types::{BuildId, Version}; use docs_rs_utils::spawn_blocking; -use fn_error_context::context; use futures_util::stream::BoxStream; -use mime::Mime; -use opentelemetry::metrics::Counter; -use path_slash::PathExt; use std::{ fmt, fs::{self, File}, io::{self, BufReader}, - iter, - num::ParseIntError, - ops::RangeInclusive, path::{Path, PathBuf}, - str::FromStr, sync::Arc, }; -use tokio::{ - io::{AsyncBufRead, AsyncBufReadExt}, - runtime, - sync::Mutex, -}; +use tokio::sync::Mutex; use tracing::{debug, info_span, instrument, trace, warn}; -use walkdir::WalkDir; - -const ARCHIVE_INDEX_FILE_EXTENSION: &str = "index"; - -type FileRange = RangeInclusive; - -#[derive(Debug, thiserror::Error)] -#[error("path not found")] -pub(crate) struct PathNotFoundError; - -/// represents a blob to be uploaded to storage. 
-#[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct BlobUpload { - pub(crate) path: String, - pub(crate) mime: Mime, - pub(crate) content: Vec, - pub(crate) compression: Option, -} - -impl From for BlobUpload { - fn from(value: Blob) -> Self { - Self { - path: value.path, - mime: value.mime, - content: value.content, - compression: value.compression, - } - } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub(crate) struct Blob { - pub(crate) path: String, - pub(crate) mime: Mime, - pub(crate) date_updated: DateTime, - pub(crate) etag: Option, - pub(crate) content: Vec, - pub(crate) compression: Option, -} - -pub(crate) struct StreamingBlob { - pub(crate) path: String, - pub(crate) mime: Mime, - pub(crate) date_updated: DateTime, - pub(crate) etag: Option, - pub(crate) compression: Option, - pub(crate) content_length: usize, - pub(crate) content: Box, -} - -impl std::fmt::Debug for StreamingBlob { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("StreamingBlob") - .field("path", &self.path) - .field("mime", &self.mime) - .field("date_updated", &self.date_updated) - .field("etag", &self.etag) - .field("compression", &self.compression) - .finish() - } -} - -impl StreamingBlob { - /// wrap the content stream in a streaming decompressor according to the - /// algorithm found in `compression` attribute. - pub(crate) async fn decompress(mut self) -> Result { - let Some(alg) = self.compression else { - return Ok(self); - }; - - self.content = wrap_reader_for_decompression(self.content, alg); - - // We fill the first bytes here to force the compressor to start decompressing. - // This is because we want a failure here in this method when the data is corrupted, - // so we can directly act on that, and users don't have any errors when they just - // stream the data. - // This won't _comsume_ the bytes. The user of this StreamingBlob will still be able - // to stream the whole content. - // - // This doesn't work 100% of the time. We might get other i/o error here, - // or the decompressor might stumble on corrupted data later during streaming. - // - // But: the most common error is that the format "magic bytes" at the beginning - // of the stream are missing, and that's caught here. - let decompressed_buf = self.content.fill_buf().await?; - debug_assert!( - !decompressed_buf.is_empty(), - "we assume if we have > 0 decompressed bytes, start of the decompression works." - ); - - self.compression = None; - // not touching the etag, it should represent the original content - Ok(self) - } - - /// consume the inner stream and materialize the full blob into memory. 
- pub(crate) async fn materialize(mut self, max_size: usize) -> Result { - let mut content = crate::utils::sized_buffer::SizedBuffer::new(max_size); - content.reserve(self.content_length); - - tokio::io::copy(&mut self.content, &mut content).await?; - - Ok(Blob { - path: self.path, - mime: self.mime, - date_updated: self.date_updated, - etag: self.etag, // downloading doesn't change the etag - content: content.into_inner(), - compression: self.compression, - }) - } -} - -impl From for StreamingBlob { - fn from(value: Blob) -> Self { - Self { - path: value.path, - mime: value.mime, - date_updated: value.date_updated, - etag: value.etag, - compression: value.compression, - content_length: value.content.len(), - content: Box::new(io::Cursor::new(value.content)), - } - } -} - -pub fn get_file_list>(path: P) -> Box>> { - let path = path.as_ref().to_path_buf(); - if path.is_file() { - let path = if let Some(parent) = path.parent() { - path.strip_prefix(parent).unwrap().to_path_buf() - } else { - path - }; - - Box::new(iter::once(Ok(path))) - } else if path.is_dir() { - Box::new( - WalkDir::new(path.clone()) - .into_iter() - .filter_map(move |result| { - let direntry = match result { - Ok(de) => de, - Err(err) => return Some(Err(err.into())), - }; - - if !direntry.file_type().is_dir() { - Some(Ok(direntry - .path() - .strip_prefix(&path) - .unwrap() - .to_path_buf())) - } else { - None - } - }), - ) - } else { - Box::new(iter::empty()) - } -} - -#[derive(Debug, thiserror::Error)] -#[error("invalid storage backend")] -pub struct InvalidStorageBackendError; - -#[derive(Debug)] -pub enum StorageKind { - Database, - S3, -} - -impl std::str::FromStr for StorageKind { - type Err = InvalidStorageBackendError; - - fn from_str(input: &str) -> Result { - match input { - "database" => Ok(StorageKind::Database), - "s3" => Ok(StorageKind::S3), - _ => Err(InvalidStorageBackendError), - } - } -} - -#[derive(Debug)] -struct StorageMetrics { - uploaded_files: Counter, -} - -impl StorageMetrics { - fn new(meter_provider: &AnyMeterProvider) -> Self { - let meter = meter_provider.meter("storage"); - const PREFIX: &str = "docsrs.storage"; - Self { - uploaded_files: meter - .u64_counter(format!("{PREFIX}.uploaded_files")) - .with_unit("1") - .build(), - } - } -} - -enum StorageBackend { - Database(DatabaseBackend), - S3(Box), -} pub struct AsyncStorage { backend: StorageBackend, @@ -257,21 +40,14 @@ pub struct AsyncStorage { } impl AsyncStorage { - pub async fn new( - pool: Pool, - config: Arc, - otel_meter_provider: &AnyMeterProvider, - ) -> Result { + pub async fn new(config: Arc, otel_meter_provider: &AnyMeterProvider) -> Result { let otel_metrics = StorageMetrics::new(otel_meter_provider); Ok(Self { backend: match config.storage_backend { - StorageKind::Database => { - StorageBackend::Database(DatabaseBackend::new(pool, otel_metrics)) - } - StorageKind::S3 => { - StorageBackend::S3(Box::new(S3Backend::new(&config, otel_metrics).await?)) - } + #[cfg(any(test, feature = "testing"))] + StorageKind::Memory => StorageBackend::Memory(MemoryBackend::new(otel_metrics)), + StorageKind::S3 => StorageBackend::S3(S3Backend::new(&config, otel_metrics).await?), }, locks: DashMap::with_capacity(config.local_archive_cache_expected_count), config, @@ -279,11 +55,8 @@ impl AsyncStorage { } #[instrument] - pub(crate) async fn exists(&self, path: &str) -> Result { - match &self.backend { - StorageBackend::Database(db) => db.exists(path).await, - StorageBackend::S3(s3) => s3.exists(path).await, - } + pub async fn exists(&self, path: 
&str) -> Result { + self.backend.exists(path).await } /// Fetch a rustdoc file from our blob storage. @@ -296,7 +69,7 @@ impl AsyncStorage { /// * `archive_storage` - if `true`, we will assume we have a remove ZIP archive and an index /// where we can fetch the requested path from inside the ZIP file. #[instrument] - pub(crate) async fn stream_rustdoc_file( + pub async fn stream_rustdoc_file( &self, name: &str, version: &Version, @@ -315,8 +88,7 @@ impl AsyncStorage { }) } - #[context("fetching {path} from {name} {version} (archive: {archive_storage})")] - pub(crate) async fn fetch_source_file( + pub async fn fetch_source_file( &self, name: &str, version: &Version, @@ -331,7 +103,7 @@ impl AsyncStorage { } #[instrument] - pub(crate) async fn stream_source_file( + pub async fn stream_source_file( &self, name: &str, version: &Version, @@ -350,7 +122,7 @@ impl AsyncStorage { } #[instrument] - pub(crate) async fn rustdoc_file_exists( + pub async fn rustdoc_file_exists( &self, name: &str, version: &Version, @@ -369,7 +141,7 @@ impl AsyncStorage { } #[instrument] - pub(crate) async fn exists_in_archive( + pub async fn exists_in_archive( &self, archive_path: &str, latest_build_id: Option, @@ -402,7 +174,7 @@ impl AsyncStorage { /// get, decompress and materialize an object from store #[instrument] - pub(crate) async fn get(&self, path: &str, max_size: usize) -> Result { + pub async fn get(&self, path: &str, max_size: usize) -> Result { self.get_stream(path).await?.materialize(max_size).await } @@ -411,22 +183,19 @@ impl AsyncStorage { /// We don't decompress ourselves, S3 only decompresses with a correct /// `Content-Encoding` header set, which we don't. #[instrument] - pub(crate) async fn get_raw_stream(&self, path: &str) -> Result { - match &self.backend { - StorageBackend::Database(db) => db.get_stream(path, None).await, - StorageBackend::S3(s3) => s3.get_stream(path, None).await, - } + pub async fn get_raw_stream(&self, path: &str) -> Result { + self.backend.get_stream(path, None).await } /// get a decompressing stream to an object in storage. #[instrument] - pub(crate) async fn get_stream(&self, path: &str) -> Result { + pub async fn get_stream(&self, path: &str) -> Result { Ok(self.get_raw_stream(path).await?.decompress().await?) } /// get, decompress and materialize part of an object from store #[instrument] - pub(super) async fn get_range( + pub(crate) async fn get_range( &self, path: &str, max_size: usize, @@ -441,16 +210,13 @@ impl AsyncStorage { /// get a decompressing stream to a range inside an object in storage #[instrument] - pub(super) async fn get_range_stream( + pub(crate) async fn get_range_stream( &self, path: &str, range: FileRange, compression: Option, ) -> Result { - let mut raw_stream = match &self.backend { - StorageBackend::Database(db) => db.get_stream(path, Some(range)).await, - StorageBackend::S3(s3) => s3.get_stream(path, Some(range)).await, - }?; + let mut raw_stream = self.backend.get_stream(path, Some(range)).await?; // `compression` represents the compression of the file-stream inside the archive. // We don't compress the whole archive, so the encoding of the archive's blob is irrelevant // here. 
@@ -580,7 +346,7 @@ impl AsyncStorage { } #[instrument] - pub(crate) async fn get_from_archive( + pub async fn get_from_archive( &self, archive_path: &str, latest_build_id: Option, @@ -594,7 +360,7 @@ impl AsyncStorage { } #[instrument(skip(self))] - pub(crate) async fn stream_from_archive( + pub async fn stream_from_archive( &self, archive_path: &str, latest_build_id: Option, @@ -655,7 +421,7 @@ impl AsyncStorage { } #[instrument(skip(self))] - pub(crate) async fn store_all_in_archive( + pub async fn store_all_in_archive( &self, archive_path: &str, root_dir: &Path, @@ -729,28 +495,29 @@ impl AsyncStorage { buf }; - self.store_inner(vec![ - BlobUpload { - path: archive_path.to_string(), - mime: mimes::APPLICATION_ZIP.clone(), - content: zip_content, - compression: None, - }, - BlobUpload { - path: remote_index_path, - mime: mime::APPLICATION_OCTET_STREAM, - content: compressed_index_content, - compression: Some(alg), - }, - ]) - .await?; + self.backend + .store_batch(vec![ + BlobUpload { + path: archive_path.to_string(), + mime: mimes::APPLICATION_ZIP.clone(), + content: zip_content, + compression: None, + }, + BlobUpload { + path: remote_index_path, + mime: mime::APPLICATION_OCTET_STREAM, + content: compressed_index_content, + compression: Some(alg), + }, + ]) + .await?; Ok((file_paths, CompressionAlgorithm::Bzip2)) } /// Store all files in `root_dir` into the backend under `prefix`. #[instrument(skip(self))] - pub(crate) async fn store_all( + pub async fn store_all( &self, prefix: &Path, root_dir: &Path, @@ -776,7 +543,7 @@ impl AsyncStorage { let file_size = file.metadata()?.len(); let content = compress(file, alg)?; - let bucket_path = prefix.join(&file_path).to_slash().unwrap().to_string(); + let bucket_path = prefix.join(&file_path).to_string_lossy().to_string(); let file_info = FileEntry { path: file_path, @@ -797,19 +564,19 @@ impl AsyncStorage { }) .await?; - self.store_inner(blobs).await?; + self.backend.store_batch(blobs).await?; Ok((file_paths_and_mimes, alg)) } #[cfg(test)] - pub(crate) async fn store_blobs(&self, blobs: Vec) -> Result<()> { - self.store_inner(blobs).await + pub async fn store_blobs(&self, blobs: Vec) -> Result<()> { + self.backend.store_batch(blobs).await } // Store file into the backend at the given path, uncompressed. // The path will also be used to determine the mime type. 
#[instrument(skip(self, content))] - pub(crate) async fn store_one_uncompressed( + pub async fn store_one_uncompressed( &self, path: impl Into + std::fmt::Debug, content: impl Into>, @@ -818,13 +585,14 @@ impl AsyncStorage { let content = content.into(); let mime = detect_mime(&path).to_owned(); - self.store_inner(vec![BlobUpload { - path, - mime, - content, - compression: None, - }]) - .await?; + self.backend + .store_batch(vec![BlobUpload { + path, + mime, + content, + compression: None, + }]) + .await?; Ok(()) } @@ -832,7 +600,7 @@ impl AsyncStorage { // Store file into the backend at the given path (also used to detect mime type), returns the // chosen compression algorithm #[instrument(skip(self, content))] - pub(crate) async fn store_one( + pub async fn store_one( &self, path: impl Into + std::fmt::Debug, content: impl Into>, @@ -843,19 +611,20 @@ impl AsyncStorage { let content = compress(&*content, alg)?; let mime = detect_mime(&path).to_owned(); - self.store_inner(vec![BlobUpload { - path, - mime, - content, - compression: Some(alg), - }]) - .await?; + self.backend + .store_batch(vec![BlobUpload { + path, + mime, + content, + compression: Some(alg), + }]) + .await?; Ok(alg) } #[instrument(skip(self))] - pub(crate) async fn store_path( + pub async fn store_path( &self, target_path: impl Into + std::fmt::Debug, source_path: impl AsRef + std::fmt::Debug, @@ -868,45 +637,31 @@ impl AsyncStorage { let mime = detect_mime(&target_path).to_owned(); - self.store_inner(vec![BlobUpload { - path: target_path, - mime, - content, - compression: Some(alg), - }]) - .await?; + self.backend + .store_batch(vec![BlobUpload { + path: target_path, + mime, + content, + compression: Some(alg), + }]) + .await?; Ok(alg) } - async fn store_inner(&self, batch: Vec) -> Result<()> { - match &self.backend { - StorageBackend::Database(db) => db.store_batch(batch).await, - StorageBackend::S3(s3) => s3.store_batch(batch).await, - } - } - - pub(super) async fn list_prefix<'a>( - &'a self, - prefix: &'a str, - ) -> BoxStream<'a, Result> { - match &self.backend { - StorageBackend::Database(db) => Box::pin(db.list_prefix(prefix).await), - StorageBackend::S3(s3) => Box::pin(s3.list_prefix(prefix).await), - } + pub async fn list_prefix<'a>(&'a self, prefix: &'a str) -> BoxStream<'a, Result> { + self.backend.list_prefix(prefix).await } - pub(crate) async fn delete_prefix(&self, prefix: &str) -> Result<()> { - match &self.backend { - StorageBackend::Database(db) => db.delete_prefix(prefix).await, - StorageBackend::S3(s3) => s3.delete_prefix(prefix).await, - } + #[instrument(skip(self))] + pub async fn delete_prefix(&self, prefix: &str) -> Result<()> { + self.backend.delete_prefix(prefix).await } // We're using `&self` instead of consuming `self` or creating a Drop impl because during tests // we leak the web server, and Drop isn't executed in that case (since the leaked web server // still holds a reference to the storage). 
- #[cfg(test)] + #[cfg(any(test, feature = "testing"))] pub(crate) async fn cleanup_after_test(&self) -> Result<()> { if let StorageBackend::S3(s3) = &self.backend { s3.cleanup_after_test().await?; @@ -918,481 +673,27 @@ impl AsyncStorage { impl std::fmt::Debug for AsyncStorage { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match &self.backend { - StorageBackend::Database(_) => write!(f, "database-backed storage"), + #[cfg(any(test, feature = "testing"))] + StorageBackend::Memory(_) => write!(f, "memory-backed storage"), StorageBackend::S3(_) => write!(f, "S3-backed storage"), } } } -/// Sync wrapper around `AsyncStorage` for parts of the codebase that are not async. -pub struct Storage { - inner: Arc, - runtime: runtime::Handle, -} - -#[allow(dead_code)] -impl Storage { - pub fn new(inner: Arc, runtime: runtime::Handle) -> Self { - Self { inner, runtime } - } - - pub(crate) fn exists(&self, path: &str) -> Result { - self.runtime.block_on(self.inner.exists(path)) - } - - pub(crate) fn fetch_source_file( - &self, - name: &str, - version: &Version, - latest_build_id: Option, - path: &str, - archive_storage: bool, - ) -> Result { - self.runtime.block_on(self.inner.fetch_source_file( - name, - version, - latest_build_id, - path, - archive_storage, - )) - } - - pub(crate) fn rustdoc_file_exists( - &self, - name: &str, - version: &Version, - latest_build_id: Option, - path: &str, - archive_storage: bool, - ) -> Result { - self.runtime.block_on(self.inner.rustdoc_file_exists( - name, - version, - latest_build_id, - path, - archive_storage, - )) - } - - pub(crate) fn exists_in_archive( - &self, - archive_path: &str, - latest_build_id: Option, - path: &str, - ) -> Result { - self.runtime.block_on( - self.inner - .exists_in_archive(archive_path, latest_build_id, path), - ) - } - - pub(crate) fn get(&self, path: &str, max_size: usize) -> Result { - self.runtime.block_on(self.inner.get(path, max_size)) - } - - pub(super) fn get_range( - &self, - path: &str, - max_size: usize, - range: FileRange, - compression: Option, - ) -> Result { - self.runtime - .block_on(self.inner.get_range(path, max_size, range, compression)) - } - - pub(crate) fn get_from_archive( - &self, - archive_path: &str, - latest_build_id: Option, - path: &str, - max_size: usize, - ) -> Result { - self.runtime.block_on(self.inner.get_from_archive( - archive_path, - latest_build_id, - path, - max_size, - )) - } - - pub(crate) fn store_all_in_archive( - &self, - archive_path: &str, - root_dir: &Path, - ) -> Result<(Vec, CompressionAlgorithm)> { - self.runtime - .block_on(self.inner.store_all_in_archive(archive_path, root_dir)) - } - - pub(crate) fn store_all( - &self, - prefix: &Path, - root_dir: &Path, - ) -> Result<(Vec, CompressionAlgorithm)> { - self.runtime - .block_on(self.inner.store_all(prefix, root_dir)) - } - - #[cfg(test)] - pub(crate) fn store_blobs(&self, blobs: Vec) -> Result<()> { - self.runtime.block_on(self.inner.store_blobs(blobs)) - } - - // Store file into the backend at the given path, uncompressed. - // The path will also be used to determine the mime type. 
- #[instrument(skip(self, content))] - pub(crate) fn store_one_uncompressed( - &self, - path: impl Into + std::fmt::Debug, - content: impl Into>, - ) -> Result<()> { - self.runtime - .block_on(self.inner.store_one_uncompressed(path, content)) - } - - // Store file into the backend at the given path (also used to detect mime type), returns the - // chosen compression algorithm - #[instrument(skip(self, content))] - pub(crate) fn store_one( - &self, - path: impl Into + std::fmt::Debug, - content: impl Into>, - ) -> Result { - self.runtime.block_on(self.inner.store_one(path, content)) - } - - // Store file into the backend at the given path (also used to detect mime type), returns the - // chosen compression algorithm - #[instrument(skip(self))] - pub(crate) fn store_path( - &self, - target_path: impl Into + std::fmt::Debug, - source_path: impl AsRef + std::fmt::Debug, - ) -> Result { - self.runtime - .block_on(self.inner.store_path(target_path, source_path)) - } - - /// sync wrapper for the list_prefix function - /// purely for testing purposes since it collects all files into a Vec. - #[cfg(test)] - pub(crate) fn list_prefix(&self, prefix: &str) -> impl Iterator> { - use futures_util::stream::StreamExt; - self.runtime - .block_on(async { - self.inner - .list_prefix(prefix) - .await - .collect::>() - .await - }) - .into_iter() - } - - #[instrument(skip(self))] - pub(crate) fn delete_prefix(&self, prefix: &str) -> Result<()> { - self.runtime.block_on(self.inner.delete_prefix(prefix)) - } - - // We're using `&self` instead of consuming `self` or creating a Drop impl because during tests - // we leak the web server, and Drop isn't executed in that case (since the leaked web server - // still holds a reference to the storage). - #[cfg(test)] - pub(crate) async fn cleanup_after_test(&self) -> Result<()> { - self.inner.cleanup_after_test().await - } -} - -impl std::fmt::Debug for Storage { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "sync wrapper for {:?}", self.inner) - } -} - -pub(crate) fn rustdoc_archive_path(name: &str, version: &Version) -> String { - format!("rustdoc/{name}/{version}.zip") -} - -#[derive(strum::Display, Debug, PartialEq, Eq, Clone, Copy)] -#[strum(serialize_all = "snake_case")] -pub(crate) enum RustdocJsonFormatVersion { - #[strum(serialize = "{0}")] - Version(u16), - Latest, -} - -impl FromStr for RustdocJsonFormatVersion { - type Err = ParseIntError; - fn from_str(s: &str) -> Result { - if s == "latest" { - Ok(RustdocJsonFormatVersion::Latest) - } else { - s.parse::().map(RustdocJsonFormatVersion::Version) - } - } -} - -pub(crate) fn rustdoc_json_path( - name: &str, - version: &Version, - target: &str, - format_version: RustdocJsonFormatVersion, - compression_algorithm: Option, -) -> String { - let mut path = format!( - "rustdoc-json/{name}/{version}/{target}/{name}_{version}_{target}_{format_version}.json" - ); - - if let Some(alg) = compression_algorithm { - path.push('.'); - path.push_str(compression::file_extension_for(alg)); - } - - path -} - -pub(crate) fn source_archive_path(name: &str, version: &Version) -> String { - format!("sources/{name}/{version}.zip") -} - #[cfg(test)] mod test { use super::*; - use crate::test::TestEnvironment; - use docs_rs_headers::compute_etag; - use std::env; - use test_case::test_case; - - const ZSTD_EOF_BYTES: [u8; 3] = [0x01, 0x00, 0x00]; - - fn streaming_blob( - content: impl Into>, - alg: Option, - ) -> StreamingBlob { - let content = content.into(); - StreamingBlob { - path: "some_path.db".into(), - mime: 
mime::APPLICATION_OCTET_STREAM, - date_updated: Utc::now(), - compression: alg, - etag: Some(compute_etag(&content)), - content_length: content.len(), - content: Box::new(io::Cursor::new(content)), - } - } - - #[tokio::test] - async fn test_streaming_blob_uncompressed() -> Result<()> { - const CONTENT: &[u8] = b"Hello, world!"; - - // without decompression - { - let stream = streaming_blob(CONTENT, None); - let blob = stream.materialize(usize::MAX).await?; - assert_eq!(blob.content, CONTENT); - assert!(blob.compression.is_none()); - } - - // with decompression, does nothing - { - let stream = streaming_blob(CONTENT, None); - let blob = stream.decompress().await?.materialize(usize::MAX).await?; - assert_eq!(blob.content, CONTENT); - assert!(blob.compression.is_none()); - } - - Ok(()) - } - - #[tokio::test] - async fn test_streaming_broken_zstd_blob() -> Result<()> { - const NOT_ZSTD: &[u8] = b"Hello, world!"; - let alg = CompressionAlgorithm::Zstd; - - // without decompression - // Doesn't fail because we don't call `.decompress` - { - let stream = streaming_blob(NOT_ZSTD, Some(alg)); - let blob = stream.materialize(usize::MAX).await?; - assert_eq!(blob.content, NOT_ZSTD); - assert_eq!(blob.compression, Some(alg)); - } - - // with decompression - // should fail in the `.decompress` call, - // not later when materializing / streaming. - { - let err = streaming_blob(NOT_ZSTD, Some(alg)) - .decompress() - .await - .unwrap_err(); - - assert_eq!(err.kind(), io::ErrorKind::Other); - - assert_eq!( - err.to_string(), - "Unknown frame descriptor", - "unexpected error: {}", - err - ); - } - - Ok(()) - } - - #[tokio::test] - async fn test_streaming_blob_zstd() -> Result<()> { - const CONTENT: &[u8] = b"Hello, world!"; - let mut compressed_content = Vec::new(); - let alg = CompressionAlgorithm::Zstd; - compress_async( - &mut io::Cursor::new(CONTENT.to_vec()), - &mut compressed_content, - alg, - ) - .await?; - - // without decompression - { - let stream = streaming_blob(compressed_content.clone(), Some(alg)); - let blob = stream.materialize(usize::MAX).await?; - assert_eq!(blob.content, compressed_content); - assert_eq!(blob.content.last_chunk::<3>().unwrap(), &ZSTD_EOF_BYTES); - assert_eq!(blob.compression, Some(alg)); - } - - // with decompression - { - let blob = streaming_blob(compressed_content.clone(), Some(alg)) - .decompress() - .await? - .materialize(usize::MAX) - .await?; - assert_eq!(blob.content, CONTENT); - assert!(blob.compression.is_none()); - } - - Ok(()) - } - - #[tokio::test] - #[test_case(CompressionAlgorithm::Zstd)] - #[test_case(CompressionAlgorithm::Bzip2)] - #[test_case(CompressionAlgorithm::Gzip)] - async fn test_async_compression(alg: CompressionAlgorithm) -> Result<()> { - const CONTENT: &[u8] = b"Hello, world! Hello, world! Hello, world! 
Hello, world!"; - - let compressed_index_content = { - let mut buf: Vec = Vec::new(); - compress_async(&mut io::Cursor::new(CONTENT.to_vec()), &mut buf, alg).await?; - buf - }; - - { - // try low-level async decompression - let mut decompressed_buf: Vec = Vec::new(); - let mut reader = wrap_reader_for_decompression( - io::Cursor::new(compressed_index_content.clone()), - alg, - ); - - tokio::io::copy(&mut reader, &mut io::Cursor::new(&mut decompressed_buf)).await?; - - assert_eq!(decompressed_buf, CONTENT); - } - - { - // try sync decompression - let decompressed_buf: Vec = decompress( - io::Cursor::new(compressed_index_content.clone()), - alg, - usize::MAX, - )?; - - assert_eq!(decompressed_buf, CONTENT); - } - - // try decompress via storage API - let blob = StreamingBlob { - path: "some_path.db".into(), - mime: mime::APPLICATION_OCTET_STREAM, - date_updated: Utc::now(), - etag: None, - compression: Some(alg), - content_length: compressed_index_content.len(), - content: Box::new(io::Cursor::new(compressed_index_content)), - } - .decompress() - .await? - .materialize(usize::MAX) - .await?; - - assert_eq!(blob.compression, None); - assert_eq!(blob.content, CONTENT); - - Ok(()) - } - - #[test_case("latest", RustdocJsonFormatVersion::Latest)] - #[test_case("42", RustdocJsonFormatVersion::Version(42))] - fn test_json_format_version(input: &str, expected: RustdocJsonFormatVersion) { - // test Display - assert_eq!(expected.to_string(), input); - // test FromStr - assert_eq!(expected, input.parse().unwrap()); - } - - #[test] - fn test_get_file_list() -> Result<()> { - crate::test::init_logger(); - let dir = env::current_dir().unwrap(); - - let files: Vec<_> = get_file_list(&dir).collect::>>()?; - assert!(!files.is_empty()); - - let files: Vec<_> = get_file_list(dir.join("Cargo.toml")).collect::>>()?; - assert_eq!(files[0], std::path::Path::new("Cargo.toml")); - - Ok(()) - } - - #[test] - fn test_mime_types() { - check_mime(".gitignore", "text/plain"); - check_mime("hello.toml", "text/toml"); - check_mime("hello.css", "text/css"); - check_mime("hello.js", "text/javascript"); - check_mime("hello.html", "text/html"); - check_mime("hello.hello.md", "text/markdown"); - check_mime("hello.markdown", "text/markdown"); - check_mime("hello.json", "application/json"); - check_mime("hello.txt", "text/plain"); - check_mime("file.rs", "text/rust"); - check_mime("important.svg", "image/svg+xml"); - } - - fn check_mime(path: &str, expected_mime: &str) { - let detected_mime = detect_mime(Path::new(&path)); - assert_eq!(detected_mime, expected_mime); - } + use crate::testing::TestStorage; + use tokio::fs; #[tokio::test(flavor = "multi_thread")] async fn test_outdated_local_archive_index_gets_redownloaded() -> Result<()> { - use tokio::fs; - - let env = TestEnvironment::with_config( - TestEnvironment::base_config() - .storage_backend(StorageKind::S3) - .build()?, - ) - .await?; - - let storage = env.async_storage(); + let metrics = docs_rs_opentelemetry::testing::TestMetrics::new(); + let storage = TestStorage::from_kind(StorageKind::S3, metrics.provider()).await?; // virtual latest build id, used for local caching of the index files const LATEST_BUILD_ID: Option = Some(BuildId(42)); - let cache_root = env.config().local_archive_cache_path.clone(); + let cache_root = storage.config.local_archive_cache_path.clone(); let cache_filename = |archive_name: &str| { cache_root.join(format!( @@ -1425,14 +726,14 @@ mod test { // create two archives with indexes that contain the same filename create_archive( - storage, + 
&storage, "test1.zip", &["file1.txt", "file2.txt", "important.txt"], ) .await?; create_archive( - storage, + &storage, "test2.zip", &["important.txt", "another_file_1.txt", "another_file_2.txt"], ) .await?; @@ -1512,7 +813,7 @@ } } -/// Backend tests are a set of tests executed on all the supported storage backends. They ensure /// docs.rs behaves the same no matter the storage backend currently used. /// /// To add a new test create the function without adding the `#[test]` attribute, and add the /// This is the preferred way to test whether backends work. #[cfg(test)] mod backend_tests { - use docs_rs_headers::compute_etag; - use super::*; - use crate::test::TestEnvironment; + use crate::{PathNotFoundError, errors::SizeLimitReached}; + use docs_rs_headers::compute_etag; + use docs_rs_opentelemetry::testing::TestMetrics; + use futures_util::TryStreamExt as _; fn get_file_info(files: &[FileEntry], path: impl AsRef) -> Option<&FileEntry> { let path = path.as_ref(); files.iter().find(|info| info.path == path) } - fn test_exists(storage: &Storage) -> Result<()> { - assert!(!storage.exists("path/to/file.txt").unwrap()); + async fn test_exists(storage: &AsyncStorage) -> Result<()> { + assert!(!storage.exists("path/to/file.txt").await.unwrap()); let blob = BlobUpload { path: "path/to/file.txt".into(), mime: mime::TEXT_PLAIN, content: "Hello world!".into(), compression: None, }; - storage.store_blobs(vec![blob])?; - assert!(storage.exists("path/to/file.txt")?); + storage.store_blobs(vec![blob]).await?; + assert!(storage.exists("path/to/file.txt").await?); Ok(()) } - fn test_get_object(storage: &Storage) -> Result<()> { + async fn test_get_object(storage: &AsyncStorage) -> Result<()> { let path: &str = "foo/bar.txt"; let blob = BlobUpload { path: path.into(), @@ -1554,9 +856,9 @@ content: b"test content\n".to_vec(), }; - storage.store_blobs(vec![blob.clone()])?; + storage.store_blobs(vec![blob.clone()]).await?; - let found = storage.get(path, usize::MAX)?; + let found = storage.get(path, usize::MAX).await?; assert_eq!(blob.mime, found.mime); assert_eq!(blob.content, found.content); // while our db backend just does MD5, @@ -1567,6 +869,7 @@ assert!( storage .get(path, usize::MAX) + .await .unwrap_err() .downcast_ref::() .is_some() @@ -1576,7 +879,7 @@ Ok(()) } - fn test_get_range(storage: &Storage) -> Result<()> { + async fn test_get_range(storage: &AsyncStorage) -> Result<()> { let blob = BlobUpload { path: "foo/bar.txt".into(), mime: mime::TEXT_PLAIN, @@ -1586,12 +889,14 @@ let full_etag = compute_etag(&blob.content); - storage.store_blobs(vec![blob.clone()])?; + storage.store_blobs(vec![blob.clone()]).await?; let mut etags = Vec::new(); for range in [0..=4, 5..=12] { - let partial_blob = storage.get_range("foo/bar.txt", usize::MAX, range.clone(), None)?; + let partial_blob = storage + .get_range("foo/bar.txt", usize::MAX, range.clone(), None) + .await?; let range = (*range.start() as usize)..=(*range.end() as usize); assert_eq!(blob.content[range], partial_blob.content); @@ -1609,6 +914,7 @@ assert!( storage .get_range(path, usize::MAX, 0..=4, None) + .await .unwrap_err() .downcast_ref::() .is_some() @@ -1618,37 +924,45 @@ Ok(()) } - fn test_list_prefix(storage: &Storage) -> Result<()> { + async fn test_list_prefix(storage: &AsyncStorage) ->
Result<()> { static FILENAMES: &[&str] = &["baz.txt", "some/bar.txt"]; - storage.store_blobs( - FILENAMES - .iter() - .map(|&filename| BlobUpload { - path: filename.into(), - mime: mime::TEXT_PLAIN, - compression: None, - content: b"test content\n".to_vec(), - }) - .collect(), - )?; + storage + .store_blobs( + FILENAMES + .iter() + .map(|&filename| BlobUpload { + path: filename.into(), + mime: mime::TEXT_PLAIN, + compression: None, + content: b"test content\n".to_vec(), + }) + .collect(), + ) + .await?; assert_eq!( - storage.list_prefix("").collect::>>()?, + storage + .list_prefix("") + .await + .try_collect::>() + .await?, FILENAMES ); assert_eq!( storage .list_prefix("some/") - .collect::>>()?, + .await + .try_collect::>() + .await?, &["some/bar.txt"] ); Ok(()) } - fn test_too_long_filename(storage: &Storage) -> Result<()> { + async fn test_too_long_filename(storage: &AsyncStorage) -> Result<()> { // minio returns ErrKeyTooLongError when the key is over 1024 bytes long. // When testing, minio just gave me `XMinioInvalidObjectName`, so I'll check that too. let long_filename = "ATCG".repeat(512); @@ -1656,6 +970,7 @@ mod backend_tests { assert!( storage .get(&long_filename, 42) + .await .unwrap_err() .is::() ); @@ -1663,7 +978,7 @@ mod backend_tests { Ok(()) } - fn test_get_too_big(storage: &Storage) -> Result<()> { + async fn test_get_too_big(storage: &AsyncStorage) -> Result<()> { const MAX_SIZE: usize = 1024; let small_blob = BlobUpload { @@ -1679,25 +994,28 @@ mod backend_tests { compression: None, }; - storage.store_blobs(vec![small_blob.clone(), big_blob])?; + storage + .store_blobs(vec![small_blob.clone(), big_blob]) + .await?; - let blob = storage.get("small-blob.bin", MAX_SIZE)?; + let blob = storage.get("small-blob.bin", MAX_SIZE).await?; assert_eq!(blob.content.len(), small_blob.content.len()); assert!( storage .get("big-blob.bin", MAX_SIZE) + .await .unwrap_err() .downcast_ref::() .and_then(|io| io.get_ref()) - .and_then(|err| err.downcast_ref::()) + .and_then(|err| err.downcast_ref::()) .is_some() ); Ok(()) } - fn test_store_blobs(env: &TestEnvironment, storage: &Storage) -> Result<()> { + async fn test_store_blobs(storage: &AsyncStorage, metrics: &TestMetrics) -> Result<()> { const NAMES: &[&str] = &[ "a", "b", @@ -1716,15 +1034,15 @@ mod backend_tests { }) .collect::>(); - storage.store_blobs(blobs.clone()).unwrap(); + storage.store_blobs(blobs.clone()).await.unwrap(); for blob in &blobs { - let actual = storage.get(&blob.path, usize::MAX)?; + let actual = storage.get(&blob.path, usize::MAX).await?; assert_eq!(blob.path, actual.path); assert_eq!(blob.mime, actual.mime); } - let collected_metrics = env.collected_metrics(); + let collected_metrics = metrics.collected_metrics(); assert_eq!( collected_metrics @@ -1737,14 +1055,21 @@ mod backend_tests { Ok(()) } - fn test_exists_without_remote_archive(storage: &Storage) -> Result<()> { + async fn test_exists_without_remote_archive(storage: &AsyncStorage) -> Result<()> { // when remote and local index don't exist, any `exists_in_archive` should // return `false` - assert!(!storage.exists_in_archive("some_archive_name", None, "some_file_name")?); + assert!( + !storage + .exists_in_archive("some_archive_name", None, "some_file_name") + .await? 
+ ); Ok(()) } - fn test_store_all_in_archive(env: &TestEnvironment, storage: &Storage) -> Result<()> { + async fn test_store_all_in_archive( + storage: &AsyncStorage, + metrics: &TestMetrics, + ) -> Result<()> { let dir = tempfile::Builder::new() .prefix("docs.rs-upload-archive-test") .tempdir()?; @@ -1758,15 +1083,19 @@ mod backend_tests { } let local_index_location = storage - .inner .config .local_archive_cache_path .join(format!("folder/test.zip.0.{ARCHIVE_INDEX_FILE_EXTENSION}")); - let (stored_files, compression_alg) = - storage.store_all_in_archive("folder/test.zip", dir.path())?; + let (stored_files, compression_alg) = storage + .store_all_in_archive("folder/test.zip", dir.path()) + .await?; - assert!(storage.exists(&format!("folder/test.zip.{ARCHIVE_INDEX_FILE_EXTENSION}"))?); + assert!( + storage + .exists(&format!("folder/test.zip.{ARCHIVE_INDEX_FILE_EXTENSION}")) + .await? + ); assert_eq!(compression_alg, CompressionAlgorithm::Bzip2); assert_eq!(stored_files.len(), files.len()); @@ -1789,23 +1118,35 @@ mod backend_tests { // the first exists-query will download and store the index assert!(!local_index_location.exists()); - assert!(storage.exists_in_archive("folder/test.zip", None, "Cargo.toml",)?); + assert!( + storage + .exists_in_archive("folder/test.zip", None, "Cargo.toml") + .await? + ); // the second one will use the local index assert!(local_index_location.exists()); - assert!(storage.exists_in_archive("folder/test.zip", None, "src/main.rs",)?); + assert!( + storage + .exists_in_archive("folder/test.zip", None, "src/main.rs") + .await? + ); - let file = storage.get_from_archive("folder/test.zip", None, "Cargo.toml", usize::MAX)?; + let file = storage + .get_from_archive("folder/test.zip", None, "Cargo.toml", usize::MAX) + .await?; assert_eq!(file.content, b"data"); assert_eq!(file.mime, "text/toml"); assert_eq!(file.path, "folder/test.zip/Cargo.toml"); - let file = storage.get_from_archive("folder/test.zip", None, "src/main.rs", usize::MAX)?; + let file = storage + .get_from_archive("folder/test.zip", None, "src/main.rs", usize::MAX) + .await?; assert_eq!(file.content, b"data"); assert_eq!(file.mime, "text/rust"); assert_eq!(file.path, "folder/test.zip/src/main.rs"); - let collected_metrics = env.collected_metrics(); + let collected_metrics = metrics.collected_metrics(); assert_eq!( collected_metrics @@ -1818,7 +1159,7 @@ mod backend_tests { Ok(()) } - fn test_store_all(env: &TestEnvironment, storage: &Storage) -> Result<()> { + async fn test_store_all(storage: &AsyncStorage, metrics: &TestMetrics) -> Result<()> { let dir = tempfile::Builder::new() .prefix("docs.rs-upload-test") .tempdir()?; @@ -1831,7 +1172,7 @@ mod backend_tests { fs::write(path, "data")?; } - let (stored_files, algs) = storage.store_all(Path::new("prefix"), dir.path())?; + let (stored_files, algs) = storage.store_all(Path::new("prefix"), dir.path()).await?; assert_eq!(stored_files.len(), files.len()); for name in &files { assert!(get_file_info(&stored_files, name).is_some()); @@ -1845,19 +1186,19 @@ mod backend_tests { "text/rust" ); - let file = storage.get("prefix/Cargo.toml", usize::MAX)?; + let file = storage.get("prefix/Cargo.toml", usize::MAX).await?; assert_eq!(file.content, b"data"); assert_eq!(file.mime, "text/toml"); assert_eq!(file.path, "prefix/Cargo.toml"); - let file = storage.get("prefix/src/main.rs", usize::MAX)?; + let file = storage.get("prefix/src/main.rs", usize::MAX).await?; assert_eq!(file.content, b"data"); assert_eq!(file.mime, "text/rust"); assert_eq!(file.path, 
"prefix/src/main.rs"); assert_eq!(algs, CompressionAlgorithm::default()); - let collected_metrics = env.collected_metrics(); + let collected_metrics = metrics.collected_metrics(); assert_eq!( collected_metrics .get_metric("storage", "docsrs.storage.uploaded_files")? @@ -1869,7 +1210,7 @@ mod backend_tests { Ok(()) } - fn test_batched_uploads(storage: &Storage) -> Result<()> { + async fn test_batched_uploads(storage: &AsyncStorage) -> Result<()> { let uploads: Vec<_> = (0..=100) .map(|i| { let content = format!("const IDX: usize = {i};").as_bytes().to_vec(); @@ -1882,21 +1223,21 @@ mod backend_tests { }) .collect(); - storage.store_blobs(uploads.clone())?; + storage.store_blobs(uploads.clone()).await?; for blob in &uploads { - let stored = storage.get(&blob.path, usize::MAX)?; + let stored = storage.get(&blob.path, usize::MAX).await?; assert_eq!(&stored.content, &blob.content); } Ok(()) } - fn test_delete_prefix_without_matches(storage: &Storage) -> Result<()> { - storage.delete_prefix("prefix_without_objects") + async fn test_delete_prefix_without_matches(storage: &AsyncStorage) -> Result<()> { + storage.delete_prefix("prefix_without_objects").await } - fn test_delete_prefix(storage: &Storage) -> Result<()> { + async fn test_delete_prefix(storage: &AsyncStorage) -> Result<()> { test_deletion( storage, "foo/bar/", @@ -1910,9 +1251,10 @@ mod backend_tests { &["foo.txt", "foo/bar.txt", "bar.txt"], &["foo/bar/baz.txt", "foo/bar/foobar.txt"], ) + .await } - fn test_delete_percent(storage: &Storage) -> Result<()> { + async fn test_delete_percent(storage: &AsyncStorage) -> Result<()> { // PostgreSQL treats "%" as a special char when deleting a prefix. Make sure any "%" in the // provided prefix is properly escaped. test_deletion( @@ -1922,36 +1264,40 @@ mod backend_tests { &["foo/bar.txt"], &["foo/%/bar.txt"], ) + .await } - fn test_deletion( - storage: &Storage, + async fn test_deletion( + storage: &AsyncStorage, prefix: &str, start: &[&str], present: &[&str], missing: &[&str], ) -> Result<()> { - storage.store_blobs( - start - .iter() - .map(|path| BlobUpload { - path: (*path).to_string(), - content: b"foo\n".to_vec(), - compression: None, - mime: mime::TEXT_PLAIN, - }) - .collect(), - )?; + storage + .store_blobs( + start + .iter() + .map(|path| BlobUpload { + path: (*path).to_string(), + content: b"foo\n".to_vec(), + compression: None, + mime: mime::TEXT_PLAIN, + }) + .collect(), + ) + .await?; - storage.delete_prefix(prefix)?; + storage.delete_prefix(prefix).await?; for existing in present { - assert!(storage.get(existing, usize::MAX).is_ok()); + assert!(storage.get(existing, usize::MAX).await.is_ok()); } for missing in missing { assert!( storage .get(missing, usize::MAX) + .await .unwrap_err() .downcast_ref::() .is_some() @@ -1971,17 +1317,17 @@ mod backend_tests { ) => { $( mod $backend { - use crate::test::TestEnvironment; - use crate::storage::{StorageKind}; - - fn get_env() -> anyhow::Result { - crate::test::TestEnvironment::with_config_and_runtime( - TestEnvironment::base_config() - .storage_backend($config) - .build()? 
- ) + use crate::types::StorageKind; + use crate::testing::TestStorage; + use docs_rs_opentelemetry::testing::TestMetrics; + + async fn get_storage() -> anyhow::Result<(TestStorage, TestMetrics)> { + let metrics = TestMetrics::new(); + let storage = TestStorage::from_kind($config, metrics.provider()).await?; + Ok((storage, metrics)) } + backend_tests!(@tests $tests); backend_tests!(@tests_with_metrics $tests_with_metrics); } @@ -1989,19 +1335,19 @@ mod backend_tests { }; (@tests { $($test:ident,)* }) => { $( - #[test] - fn $test() -> anyhow::Result<()> { - let env = get_env()?; - super::$test(&*env.storage()) + #[tokio::test(flavor = "multi_thread")] + async fn $test() -> anyhow::Result<()> { + let (storage, _metrics) = get_storage().await?; + super::$test(&storage).await } )* }; (@tests_with_metrics { $($test:ident,)* }) => { $( - #[test] - fn $test() -> anyhow::Result<()> { - let env = get_env()?; - super::$test(&env, &*env.storage()) + #[tokio::test(flavor = "multi_thread")] + async fn $test() -> anyhow::Result<()> { + let (storage, metrics) = get_storage().await?; + super::$test(&storage, &metrics).await } )* }; @@ -2010,7 +1356,7 @@ mod backend_tests { backend_tests! { backends { s3 => StorageKind::S3, - database => StorageKind::Database, + memory => StorageKind::Memory, } tests { diff --git a/crates/lib/docs_rs_storage/src/testing/mod.rs b/crates/lib/docs_rs_storage/src/testing/mod.rs new file mode 100644 index 000000000..744a75571 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/testing/mod.rs @@ -0,0 +1,5 @@ +mod test_env; +mod utils; + +pub use test_env::TestStorage; +pub use utils::check_archive_consistency; diff --git a/crates/lib/docs_rs_storage/src/testing/test_env.rs b/crates/lib/docs_rs_storage/src/testing/test_env.rs new file mode 100644 index 000000000..ebb6e6925 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/testing/test_env.rs @@ -0,0 +1,68 @@ +use crate::{AsyncStorage, Config, StorageKind}; +use anyhow::Result; +use docs_rs_opentelemetry::AnyMeterProvider; +use std::{ops::Deref, sync::Arc}; +use tokio::{runtime, task::block_in_place}; + +pub struct TestStorage { + runtime: runtime::Handle, + config: Arc, + storage: Arc, +} + +impl Deref for TestStorage { + type Target = AsyncStorage; + + fn deref(&self) -> &Self::Target { + &self.storage + } +} + +impl TestStorage { + pub async fn from_kind(kind: StorageKind, meter_provider: &AnyMeterProvider) -> Result { + docs_rs_logging::testing::init(); + Self::from_config(Arc::new(Config::test_config(kind)?), meter_provider).await + } + + pub async fn from_config( + config: Arc, + meter_provider: &AnyMeterProvider, + ) -> Result { + let storage = Arc::new(AsyncStorage::new(config.clone(), meter_provider).await?); + let runtime = runtime::Handle::current(); + + Ok(Self { + config, + runtime, + storage, + }) + } + + pub fn config(&self) -> &Config { + &self.config + } + + pub fn storage(&self) -> Arc { + self.storage.clone() + } +} + +impl Drop for TestStorage { + fn drop(&mut self) { + let storage = self.storage.clone(); + let runtime = self.runtime.clone(); + + block_in_place(move || { + runtime.block_on(async move { + storage + .cleanup_after_test() + .await + .expect("failed to cleanup after tests"); + }); + }); + + if self.config.local_archive_cache_path.exists() { + std::fs::remove_dir_all(&self.config.local_archive_cache_path).unwrap(); + } + } +} diff --git a/crates/lib/docs_rs_storage/src/testing/utils.rs b/crates/lib/docs_rs_storage/src/testing/utils.rs new file mode 100644 index 000000000..ebab0393f --- /dev/null +++ 
b/crates/lib/docs_rs_storage/src/testing/utils.rs @@ -0,0 +1,15 @@ +use std::io; +use zip; + +/// try decompressing the zip & read the content +pub fn check_archive_consistency(compressed_body: &[u8]) -> anyhow::Result<()> { + let mut zip = zip::ZipArchive::new(io::Cursor::new(compressed_body))?; + for i in 0..zip.len() { + let mut file = zip.by_index(i)?; + + let mut buf = Vec::new(); + io::copy(&mut file, &mut buf)?; + } + + Ok(()) +} diff --git a/crates/lib/docs_rs_storage/src/types.rs b/crates/lib/docs_rs_storage/src/types.rs new file mode 100644 index 000000000..2235b68bc --- /dev/null +++ b/crates/lib/docs_rs_storage/src/types.rs @@ -0,0 +1,55 @@ +use std::{num::ParseIntError, ops::RangeInclusive, str::FromStr}; +use strum::EnumString; + +pub type FileRange = RangeInclusive; + +#[derive(Debug, Copy, Clone, EnumString)] +#[strum(serialize_all = "snake_case")] +pub enum StorageKind { + #[cfg(any(test, feature = "testing"))] + Memory, + S3, +} + +impl Default for StorageKind { + fn default() -> Self { + #[cfg(any(test, feature = "testing"))] + return StorageKind::Memory; + #[cfg(not(any(test, feature = "testing")))] + return StorageKind::S3; + } +} + +#[derive(strum::Display, Debug, PartialEq, Eq, Clone, Copy)] +#[strum(serialize_all = "snake_case")] +pub enum RustdocJsonFormatVersion { + #[strum(serialize = "{0}")] + Version(u16), + Latest, +} + +impl FromStr for RustdocJsonFormatVersion { + type Err = ParseIntError; + fn from_str(s: &str) -> Result { + if s == "latest" { + Ok(RustdocJsonFormatVersion::Latest) + } else { + s.parse::().map(RustdocJsonFormatVersion::Version) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use test_case::test_case; + + #[test_case("latest", RustdocJsonFormatVersion::Latest)] + #[test_case("42", RustdocJsonFormatVersion::Version(42))] + fn test_json_format_version(input: &str, expected: RustdocJsonFormatVersion) { + // test Display + assert_eq!(expected.to_string(), input); + // test FromStr + assert_eq!(expected, input.parse().unwrap()); + } +} diff --git a/crates/lib/docs_rs_storage/src/utils/file_list.rs b/crates/lib/docs_rs_storage/src/utils/file_list.rs new file mode 100644 index 000000000..3484b0020 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/utils/file_list.rs @@ -0,0 +1,61 @@ +use anyhow::Result; +use std::{ + iter, + path::{Path, PathBuf}, +}; +use walkdir::WalkDir; + +pub fn get_file_list>(path: P) -> Box>> { + let path = path.as_ref().to_path_buf(); + if path.is_file() { + let path = if let Some(parent) = path.parent() { + path.strip_prefix(parent).unwrap().to_path_buf() + } else { + path + }; + + Box::new(iter::once(Ok(path))) + } else if path.is_dir() { + Box::new( + WalkDir::new(path.clone()) + .into_iter() + .filter_map(move |result| { + let direntry = match result { + Ok(de) => de, + Err(err) => return Some(Err(err.into())), + }; + + if !direntry.file_type().is_dir() { + Some(Ok(direntry + .path() + .strip_prefix(&path) + .unwrap() + .to_path_buf())) + } else { + None + } + }), + ) + } else { + Box::new(iter::empty()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::env; + + #[test] + fn test_get_file_list() -> Result<()> { + let dir = env::current_dir().unwrap(); + + let files: Vec<_> = get_file_list(&dir).collect::>>()?; + assert!(!files.is_empty()); + + let files: Vec<_> = get_file_list(dir.join("Cargo.toml")).collect::>>()?; + assert_eq!(files[0], std::path::Path::new("Cargo.toml")); + + Ok(()) + } +} diff --git a/crates/lib/docs_rs_storage/src/utils/mod.rs b/crates/lib/docs_rs_storage/src/utils/mod.rs new 
file mode 100644 index 000000000..65bab4072 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/utils/mod.rs @@ -0,0 +1,3 @@ +pub(crate) mod file_list; +pub(crate) mod sized_buffer; +pub(crate) mod storage_path; diff --git a/src/utils/sized_buffer.rs b/crates/lib/docs_rs_storage/src/utils/sized_buffer.rs similarity index 86% rename from src/utils/sized_buffer.rs rename to crates/lib/docs_rs_storage/src/utils/sized_buffer.rs index 2c71b12ea..f89dd5e00 100644 --- a/src/utils/sized_buffer.rs +++ b/crates/lib/docs_rs_storage/src/utils/sized_buffer.rs @@ -1,3 +1,4 @@ +use crate::errors::SizeLimitReached; use std::{ io::{self, Write}, pin::Pin, @@ -11,14 +12,14 @@ pub(crate) struct SizedBuffer { } impl SizedBuffer { - pub(crate) fn new(limit: usize) -> Self { + pub fn new(limit: usize) -> Self { SizedBuffer { inner: Vec::new(), limit, } } - pub(crate) fn reserve(&mut self, amount: usize) { + pub fn reserve(&mut self, amount: usize) { if self.inner.len() + amount > self.limit { self.inner.reserve_exact(self.limit - self.inner.len()); } else { @@ -26,7 +27,7 @@ impl SizedBuffer { } } - pub(crate) fn into_inner(self) -> Vec { + pub fn into_inner(self) -> Vec { self.inner } } @@ -34,7 +35,7 @@ impl SizedBuffer { impl Write for SizedBuffer { fn write(&mut self, buf: &[u8]) -> io::Result { if self.inner.len() + buf.len() > self.limit { - Err(io::Error::other(crate::error::SizeLimitReached)) + Err(io::Error::other(SizeLimitReached)) } else { self.inner.write(buf) } @@ -88,12 +89,7 @@ mod tests { // Ensure adding a third chunk fails let error = buffer.write(&[0; 500]).unwrap_err(); - assert!( - error - .get_ref() - .unwrap() - .is::() - ); + assert!(error.get_ref().unwrap().is::()); // Ensure all the third chunk was discarded assert_eq!(1000, buffer.inner.len()); diff --git a/crates/lib/docs_rs_storage/src/utils/storage_path.rs b/crates/lib/docs_rs_storage/src/utils/storage_path.rs new file mode 100644 index 000000000..1cb62e848 --- /dev/null +++ b/crates/lib/docs_rs_storage/src/utils/storage_path.rs @@ -0,0 +1,32 @@ +use crate::{ + compression::{self, CompressionAlgorithm}, + types::RustdocJsonFormatVersion, +}; +use docs_rs_types::Version; + +pub fn rustdoc_archive_path(name: &str, version: &Version) -> String { + format!("rustdoc/{name}/{version}.zip") +} + +pub fn rustdoc_json_path( + name: &str, + version: &Version, + target: &str, + format_version: RustdocJsonFormatVersion, + compression_algorithm: Option, +) -> String { + let mut path = format!( + "rustdoc-json/{name}/{version}/{target}/{name}_{version}_{target}_{format_version}.json" + ); + + if let Some(alg) = compression_algorithm { + path.push('.'); + path.push_str(compression::file_extension_for(alg)); + } + + path +} + +pub fn source_archive_path(name: &str, version: &Version) -> String { + format!("sources/{name}/{version}.zip") +} diff --git a/crates/lib/docs_rs_storage/tests/regex/body.html b/crates/lib/docs_rs_storage/tests/regex/body.html new file mode 100644 index 000000000..64a141dbc --- /dev/null +++ b/crates/lib/docs_rs_storage/tests/regex/body.html @@ -0,0 +1,86 @@ +

[86-line HTML fixture: the rendered rustdoc page body for `regex::CaptureMatches` — the struct description ("An iterator that yields all non-overlapping capture groups matching a particular regular expression", with the `'r`/`'t` lifetime notes), its `Iterator` impl (`type Item = Captures<'t>`), the auto trait implementations (`!Sync`, `!Send`, `Unpin`, `!UnwindSafe`, `!RefUnwindSafe`), and the usual blanket implementations (`Into`, `IntoIterator`, `From`, `TryFrom`, `TryInto`, `BorrowMut`, `Borrow`, `Any`). The HTML markup itself did not survive extraction and is omitted here.]
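For illustration, a minimal sketch of how the directory-walking helper added earlier (crates/lib/docs_rs_storage/src/utils/file_list.rs) is meant to be used. It assumes only the `get_file_list` re-export that src/docbuilder/rustwide_builder.rs imports from `docs_rs_storage` later in this diff; everything else is standard library.

    use anyhow::Result;
    use docs_rs_storage::get_file_list;
    use std::path::{Path, PathBuf};

    /// Collect the relative paths of every file under `dir`, the way the builder
    /// does before uploading a rustdoc or source tree into storage.
    fn relative_files(dir: &Path) -> Result<Vec<PathBuf>> {
        // For a directory, paths come back relative to `dir` and directories are
        // skipped; for a single file, only the file name is yielded.
        get_file_list(dir).collect()
    }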
\ No newline at end of file diff --git a/crates/lib/docs_rs_storage/tests/regex/head.html b/crates/lib/docs_rs_storage/tests/regex/head.html new file mode 100644 index 000000000..29115c2a3 --- /dev/null +++ b/crates/lib/docs_rs_storage/tests/regex/head.html @@ -0,0 +1 @@ +regex::CaptureMatches - Rust \ No newline at end of file diff --git a/migrations/20251220165302_remove-files.down.sql b/migrations/20251220165302_remove-files.down.sql new file mode 100644 index 000000000..2b7048a8c --- /dev/null +++ b/migrations/20251220165302_remove-files.down.sql @@ -0,0 +1,10 @@ +-- CREATE TABLE files ( +-- path character varying(4096) NOT NULL, +-- mime character varying(100) NOT NULL, +-- date_updated timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL, +-- content bytea, +-- compression integer +-- ); + +-- ALTER TABLE ONLY files +-- ADD CONSTRAINT files_pkey PRIMARY KEY (path); diff --git a/migrations/20251220165302_remove-files.up.sql b/migrations/20251220165302_remove-files.up.sql new file mode 100644 index 000000000..1f18541a6 --- /dev/null +++ b/migrations/20251220165302_remove-files.up.sql @@ -0,0 +1,8 @@ +-- on the off-chance that someone is self-hosting docs.rs, and +-- using database-storage, and is using `sqlx migrate run` instead of our +-- own migrate-subcommand, + +-- we shouldn't drop the files. + +-- DROP TABLE files; + diff --git a/src/bin/cratesfyi.rs b/src/bin/cratesfyi.rs index adec74104..5596583be 100644 --- a/src/bin/cratesfyi.rs +++ b/src/bin/cratesfyi.rs @@ -3,7 +3,7 @@ use chrono::NaiveDate; use clap::{Parser, Subcommand, ValueEnum}; use docs_rs::{ Config, Context, Index, PackageKind, RustwideBuilder, - db::{self, Overrides, add_path_into_database}, + db::{self, Overrides}, queue_rebuilds_faulty_rustdoc, start_web_server, utils::{ ConfigName, daemon::start_background_service_metric_collector, get_config, @@ -11,6 +11,7 @@ use docs_rs::{ remove_crate_priority, set_config, set_crate_priority, }, }; +use docs_rs_storage::add_path_into_database; use docs_rs_types::{CrateId, Version}; use futures_util::StreamExt; use std::{env, fmt::Write, net::SocketAddr, path::PathBuf, sync::Arc}; diff --git a/src/build_queue.rs b/src/build_queue.rs index 3b21950e3..5615ad3d1 100644 --- a/src/build_queue.rs +++ b/src/build_queue.rs @@ -3,7 +3,6 @@ use crate::{ db::{delete_crate, delete_version, update_latest_version_id}, docbuilder::{BuilderMetrics, PackageKind}, error::Result, - storage::AsyncStorage, utils::{ConfigName, get_config, get_crate_priority, report_error, set_config}, }; use anyhow::Context as _; @@ -12,6 +11,7 @@ use crates_index_diff::{Change, CrateVersion}; use docs_rs_database::{AsyncPoolClient, Pool}; use docs_rs_fastly::{Cdn, CdnBehaviour as _}; use docs_rs_opentelemetry::AnyMeterProvider; +use docs_rs_storage::AsyncStorage; use docs_rs_types::{CrateId, KrateName, Version}; use docs_rs_utils::retry; use fn_error_context::context; diff --git a/src/config.rs b/src/config.rs index a5a2ff221..ac43a2d73 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,11 +1,6 @@ -use crate::storage::StorageKind; use anyhow::{Context as _, Result, bail}; use docs_rs_env_vars::{env, maybe_env, require_env}; -use std::{ - io, - path::{self, Path, PathBuf}, - time::Duration, -}; +use std::{path::PathBuf, sync::Arc, time::Duration}; #[derive(Debug, derive_builder::Builder)] #[builder(pattern = "owned")] @@ -17,24 +12,6 @@ pub struct Config { /// How long to wait between registry checks pub(crate) delay_between_registry_fetches: Duration, - // Storage params - pub(crate) storage_backend: 
StorageKind, - - // AWS SDK configuration - pub(crate) aws_sdk_max_retries: u32, - - // S3 params - pub(crate) s3_bucket: String, - pub(crate) s3_region: String, - pub(crate) s3_endpoint: Option, - - // DO NOT CONFIGURE THIS THROUGH AN ENVIRONMENT VARIABLE! - // Accidentally turning this on outside of the test suite might cause data loss in the - // production environment. - #[cfg(test)] - #[builder(default)] - pub(crate) s3_bucket_is_temporary: bool, - // Access token for APIs for crates.io (careful: use // constant_time_eq for comparisons!) pub(crate) cratesio_token: Option, @@ -43,9 +20,6 @@ pub struct Config { pub(crate) request_timeout: Option, pub(crate) report_request_timeouts: bool, - // Max size of the files served by the docs.rs frontend - pub(crate) max_file_size: usize, - pub(crate) max_file_size_html: usize, // The most memory that can be used to parse an HTML file pub(crate) max_parse_memory: usize, // Time between 'git gc --auto' calls in seconds @@ -63,25 +37,6 @@ pub struct Config { // For unit-tests the number has to be higher. pub(crate) random_crate_search_view_size: u32, - // where do we want to store the locally cached index files - // for the remote archives? - pub(crate) local_archive_cache_path: PathBuf, - - // expected number of entries in the local archive cache. - // Makes server restarts faster by preallocating some data structures. - // General numbers (as of 2025-12): - // * we have ~1.5 mio releases with archive storage (and 400k without) - // * each release has on average 2 archive files (rustdoc, source) - // so, over all, 3 mio archive index files in S3. - // - // While due to crawlers we will download _all_ of them over time, the old - // metric "releases accessed in the last 10 minutes" was around 50k, if I - // recall correctly. - // We're using a local DashMap to store some locks for these indexes, - // and we already know in advance we need these 50k entries. - // So we can preallocate the DashMap with this number to avoid resizes. - pub(crate) local_archive_cache_expected_count: usize, - // Where to collect metrics for the metrics initiative. // When empty, we won't collect metrics. pub(crate) compiler_metrics_collection_path: Option, @@ -121,6 +76,7 @@ pub struct Config { pub(crate) registry_api: docs_rs_registry_api::Config, pub(crate) database: docs_rs_database::Config, pub(crate) repository_stats: docs_rs_repository_stats::Config, + pub(crate) storage: Arc, } impl Config { @@ -160,14 +116,7 @@ impl Config { .registry_index_path(env("REGISTRY_INDEX_PATH", prefix.join("crates.io-index"))?) .registry_url(maybe_env("REGISTRY_URL")?) .prefix(prefix.clone()) - .storage_backend(env("DOCSRS_STORAGE_BACKEND", StorageKind::Database)?) - .aws_sdk_max_retries(env("DOCSRS_AWS_SDK_MAX_RETRIES", 6u32)?) - .s3_bucket(env("DOCSRS_S3_BUCKET", "rust-docs-rs".to_string())?) - .s3_region(env("S3_REGION", "us-west-1".to_string())?) - .s3_endpoint(maybe_env("S3_ENDPOINT")?) .cratesio_token(maybe_env("DOCSRS_CRATESIO_TOKEN")?) - .max_file_size(env("DOCSRS_MAX_FILE_SIZE", 50 * 1024 * 1024)?) - .max_file_size_html(env("DOCSRS_MAX_FILE_SIZE_HTML", 50 * 1024 * 1024)?) // LOL HTML only uses as much memory as the size of the start tag! // https://github.com/rust-lang/docs.rs/pull/930#issuecomment-667729380 .max_parse_memory(env("DOCSRS_MAX_PARSE_MEMORY", 5 * 1024 * 1024)?) @@ -181,14 +130,6 @@ impl Config { "CACHE_CONTROL_STALE_WHILE_REVALIDATE", )?) .cache_invalidatable_responses(env("DOCSRS_CACHE_INVALIDATEABLE_RESPONSES", true)?) 
- .local_archive_cache_path(ensure_absolute_path(env( - "DOCSRS_ARCHIVE_INDEX_CACHE_PATH", - prefix.join("archive_cache"), - )?)?) - .local_archive_cache_expected_count(env( - "DOCSRS_ARCHIVE_INDEX_EXPECTED_COUNT", - 100_000usize, - )?) .compiler_metrics_collection_path(maybe_env("DOCSRS_COMPILER_METRICS_PATH")?) .temp_dir(temp_dir) .rustwide_workspace(env( @@ -215,26 +156,7 @@ impl Config { .opentelemetry(docs_rs_opentelemetry::Config::from_environment()?) .registry_api(docs_rs_registry_api::Config::from_environment()?) .database(docs_rs_database::Config::from_environment()?) - .repository_stats(docs_rs_repository_stats::Config::from_environment()?)) - } - - pub fn max_file_size_for(&self, path: impl AsRef) -> usize { - static HTML: &str = "html"; - - if let Some(ext) = path.as_ref().extension() - && ext == HTML - { - self.max_file_size_html - } else { - self.max_file_size - } - } -} - -fn ensure_absolute_path(path: PathBuf) -> io::Result { - if path.is_absolute() { - Ok(path) - } else { - Ok(path::absolute(&path)?) + .repository_stats(docs_rs_repository_stats::Config::from_environment()?) + .storage(Arc::new(docs_rs_storage::Config::from_environment()?))) } } diff --git a/src/context.rs b/src/context.rs index 8d47e2afb..8643509d0 100644 --- a/src/context.rs +++ b/src/context.rs @@ -1,10 +1,11 @@ -use crate::{AsyncBuildQueue, AsyncStorage, BuildQueue, Config, Storage}; +use crate::{AsyncBuildQueue, BuildQueue, Config}; use anyhow::Result; use docs_rs_database::Pool; use docs_rs_fastly::Cdn; use docs_rs_opentelemetry::{AnyMeterProvider, get_meter_provider}; use docs_rs_registry_api::RegistryApi; use docs_rs_repository_stats::RepositoryStatsUpdater; +use docs_rs_storage::{AsyncStorage, Storage}; use std::sync::Arc; use tokio::runtime; @@ -33,7 +34,10 @@ impl Context { .then(|| Cdn::from_config(&config.fastly, &meter_provider)) .transpose()?; - Self::from_parts(config, meter_provider, pool, cdn).await + let async_storage = + Arc::new(AsyncStorage::new(config.storage.clone(), &meter_provider).await?); + + Self::from_parts(config, meter_provider, pool, async_storage, cdn).await } /// Create a new context environment from the given configuration, for running tests. 
@@ -42,8 +46,16 @@ impl Context { config: Config, meter_provider: AnyMeterProvider, pool: Pool, + async_storage: Arc, ) -> Result { - Self::from_parts(config, meter_provider, pool, Some(Cdn::mock())).await + Self::from_parts( + config, + meter_provider, + pool, + async_storage, + Some(Cdn::mock()), + ) + .await } /// private function for context environment generation, allows passing in a @@ -53,13 +65,11 @@ impl Context { config: Config, meter_provider: AnyMeterProvider, pool: Pool, + async_storage: Arc, cdn: Option, ) -> Result { let config = Arc::new(config); - let async_storage = - Arc::new(AsyncStorage::new(pool.clone(), config.clone(), &meter_provider).await?); - let cdn = cdn.map(Arc::new); let async_build_queue = Arc::new(AsyncBuildQueue::new( pool.clone(), diff --git a/src/db/add_package.rs b/src/db/add_package.rs index 4183caa63..fe57fa4c0 100644 --- a/src/db/add_package.rs +++ b/src/db/add_package.rs @@ -1,12 +1,12 @@ use crate::{ docbuilder::DocCoverage, error::Result, - storage::CompressionAlgorithm, web::crate_details::{latest_release, releases_for_crate}, }; use anyhow::{Context, anyhow}; use docs_rs_cargo_metadata::{MetadataPackage, ReleaseDependencyList}; use docs_rs_registry_api::{CrateData, CrateOwner, ReleaseData}; +use docs_rs_storage::CompressionAlgorithm; use docs_rs_types::{BuildId, BuildStatus, CrateId, Feature, ReleaseId, Version}; use docs_rs_utils::rustc_version::parse_rustc_date; use futures_util::stream::TryStreamExt; diff --git a/src/db/delete.rs b/src/db/delete.rs index e8a581774..2674a9009 100644 --- a/src/db/delete.rs +++ b/src/db/delete.rs @@ -1,9 +1,6 @@ -use crate::{ - Config, - error::Result, - storage::{AsyncStorage, rustdoc_archive_path, source_archive_path}, -}; +use crate::{Config, error::Result}; use anyhow::Context as _; +use docs_rs_storage::{AsyncStorage, rustdoc_archive_path, source_archive_path}; use docs_rs_types::{CrateId, Version}; use fn_error_context::context; use sqlx::Connection; @@ -41,7 +38,7 @@ pub async fn delete_crate( storage.delete_prefix(&remote_folder).await?; // remove existing local archive index files. 
- let local_index_folder = config.local_archive_cache_path.join(&remote_folder); + let local_index_folder = config.storage.local_archive_cache_path.join(&remote_folder); if local_index_folder.exists() { tokio::fs::remove_dir_all(&local_index_folder) .await @@ -82,7 +79,7 @@ pub async fn delete_version( .await?; } - let local_archive_cache = &config.local_archive_cache_path; + let local_archive_cache = &config.storage.local_archive_cache_path; let mut paths = vec![source_archive_path(name, version)]; if is_library { paths.push(rustdoc_archive_path(name, version)); @@ -206,9 +203,9 @@ async fn delete_crate_from_database( #[cfg(test)] mod tests { use super::*; - use crate::storage::{CompressionAlgorithm, rustdoc_json_path}; use crate::test::{KRATE, V1, V2, async_wrapper, fake_release_that_failed_before_build}; use docs_rs_registry_api::{CrateOwner, OwnerKind}; + use docs_rs_storage::{CompressionAlgorithm, RustdocJsonFormatVersion, rustdoc_json_path}; use docs_rs_types::ReleaseId; use test_case::test_case; @@ -383,7 +380,7 @@ mod tests { "a", version, "x86_64-unknown-linux-gnu", - crate::storage::RustdocJsonFormatVersion::Latest, + RustdocJsonFormatVersion::Latest, Some(CompressionAlgorithm::Zstd), )) .await @@ -459,6 +456,7 @@ mod tests { assert!(!env.async_storage().exists(&archive_index).await?); assert!( !env.config() + .storage .local_archive_cache_path .join(&archive_index) .exists() diff --git a/src/db/mod.rs b/src/db/mod.rs index 35006dfb3..c2ce468b8 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -10,14 +10,12 @@ pub(crate) use self::add_package::{ pub use self::{ add_package::{update_build_status, update_crate_data_in_database}, delete::{delete_crate, delete_version}, - file::{add_path_into_database, add_path_into_remote_archive}, overrides::Overrides, }; mod add_package; pub mod blacklist; pub mod delete; -pub(crate) mod file; mod overrides; static MIGRATOR: Migrator = sqlx::migrate!(); diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 348f4a602..86a6f734f 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -1,19 +1,13 @@ use crate::{ - AsyncStorage, Config, Context, RUSTDOC_STATIC_STORAGE_PREFIX, Storage, + Config, Context, RUSTDOC_STATIC_STORAGE_PREFIX, db::{ - add_doc_coverage, add_path_into_remote_archive, - blacklist::is_blacklisted, - file::{add_path_into_database, file_list_to_json}, - finish_build, finish_release, initialize_build, initialize_crate, initialize_release, - update_build_with_error, update_crate_data_in_database, + add_doc_coverage, blacklist::is_blacklisted, finish_build, finish_release, + initialize_build, initialize_crate, initialize_release, update_build_with_error, + update_crate_data_in_database, }, docbuilder::Limits, error::Result, metrics::{BUILD_TIME_HISTOGRAM_BUCKETS, DOCUMENTATION_SIZE_BUCKETS}, - storage::{ - CompressionAlgorithm, RustdocJsonFormatVersion, compress, get_file_list, - rustdoc_archive_path, rustdoc_json_path, source_archive_path, - }, utils::{ConfigName, copy_dir_all, get_config, report_error, set_config}, }; use anyhow::{Context as _, Error, anyhow, bail}; @@ -22,6 +16,11 @@ use docs_rs_database::Pool; use docs_rs_opentelemetry::AnyMeterProvider; use docs_rs_registry_api::RegistryApi; use docs_rs_repository_stats::RepositoryStatsUpdater; +use docs_rs_storage::{ + AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, Storage, add_path_into_database, + add_path_into_remote_archive, compress, file_list_to_json, get_file_list, rustdoc_archive_path, 
+ rustdoc_json_path, source_archive_path, +}; use docs_rs_types::{BuildId, BuildStatus, CrateId, ReleaseId, Version}; use docs_rs_utils::{retry, rustc_version::parse_rustc_version}; use docsrs_metadata::{BuildTargets, DEFAULT_TARGETS, HOST_TARGET, Metadata}; @@ -1446,9 +1445,9 @@ impl Default for BuildPackageSummary { #[cfg(test)] mod tests { use super::*; - use crate::storage::{CompressionAlgorithm, compression}; use crate::test::{AxumRouterTestExt, TestEnvironment}; use docs_rs_registry_api::ReleaseData; + use docs_rs_storage::{CompressionAlgorithm, compression}; use docs_rs_types::{BuildStatus, Feature, ReleaseId, Version}; use pretty_assertions::assert_eq; use std::{io, iter}; diff --git a/src/error.rs b/src/error.rs index 2c47bb94f..a99978b28 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,7 +1 @@ -//! Errors used in docs.rs - pub(crate) use anyhow::Result; - -#[derive(Debug, Copy, Clone, thiserror::Error)] -#[error("the size limit for the buffer was reached")] -pub(crate) struct SizeLimitReached; diff --git a/src/lib.rs b/src/lib.rs index fdc75946a..2fdc1fb62 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,6 @@ pub use self::context::Context; pub use self::docbuilder::PackageKind; pub use self::docbuilder::{BuildPackageSummary, RustwideBuilder}; pub use self::index::Index; -pub use self::storage::{AsyncStorage, Storage}; pub use self::web::start_web_server; pub use docs_rs_utils::{ @@ -30,7 +29,6 @@ mod docbuilder; mod error; pub mod index; pub mod metrics; -pub mod storage; #[cfg(test)] mod test; pub mod utils; diff --git a/src/storage/database.rs b/src/storage/database.rs deleted file mode 100644 index 33a3bdb0f..000000000 --- a/src/storage/database.rs +++ /dev/null @@ -1,151 +0,0 @@ -use super::{BlobUpload, FileRange, StorageMetrics, StreamingBlob}; -use crate::error::Result; -use chrono::{DateTime, Utc}; -use docs_rs_database::Pool; -use docs_rs_headers::compute_etag; -use futures_util::stream::{Stream, TryStreamExt}; -use sqlx::Acquire; -use std::io; - -pub(crate) struct DatabaseBackend { - pool: Pool, - otel_metrics: StorageMetrics, -} - -impl DatabaseBackend { - pub(crate) fn new(pool: Pool, otel_metrics: StorageMetrics) -> Self { - Self { pool, otel_metrics } - } - - pub(super) async fn exists(&self, path: &str) -> Result { - Ok(sqlx::query_scalar!( - r#"SELECT COUNT(*) > 0 as "has_count!" FROM files WHERE path = $1"#, - path - ) - .fetch_one(&self.pool) - .await?) - } - - pub(super) async fn get_stream( - &self, - path: &str, - range: Option, - ) -> Result { - struct Result { - path: String, - mime: String, - date_updated: DateTime, - compression: Option, - content: Option>, - } - - let result = if let Some(r) = range { - let range_start = i32::try_from(*r.start())?; - - sqlx::query_as!( - Result, - r#"SELECT - path, mime, date_updated, compression, - substring(content from $2 for $3) as content - FROM files - WHERE path = $1;"#, - path, - range_start + 1, // postgres substring is 1-indexed - (r.end() - r.start() + 1) as i32 - ) - .fetch_optional(&self.pool) - .await? - .ok_or(super::PathNotFoundError)? - } else { - // The size limit is checked at the database level, to avoid receiving data altogether if - // the limit is exceeded. - sqlx::query_as!( - Result, - r#"SELECT - path, - mime, - date_updated, - compression, - content - FROM files - WHERE path = $1;"#, - path, - ) - .fetch_optional(&self.pool) - .await? - .ok_or(super::PathNotFoundError)? 
- }; - - let compression = result.compression.map(|i| { - i.try_into() - .expect("invalid compression algorithm stored in database") - }); - let content = result.content.unwrap_or_default(); - let content_len = content.len(); - - let etag = compute_etag(&content); - Ok(StreamingBlob { - path: result.path, - mime: result - .mime - .parse() - .unwrap_or(mime::APPLICATION_OCTET_STREAM), - date_updated: result.date_updated, - etag: Some(etag), - content: Box::new(io::Cursor::new(content)), - content_length: content_len, - compression, - }) - } - - pub(super) async fn store_batch(&self, batch: Vec) -> Result<()> { - let mut conn = self.pool.get_async().await?; - let mut trans = conn.begin().await?; - for blob in batch { - let compression = blob.compression.map(|alg| alg as i32); - sqlx::query!( - "INSERT INTO files (path, mime, content, compression) - VALUES ($1, $2, $3, $4) - ON CONFLICT (path) DO UPDATE - SET mime = EXCLUDED.mime, content = EXCLUDED.content, compression = EXCLUDED.compression", - &blob.path, - &blob.mime.to_string(), - &blob.content, - compression, - ) - .execute(&mut *trans).await?; - self.otel_metrics.uploaded_files.add(1, &[]); - } - trans.commit().await?; - Ok(()) - } - - pub(super) async fn list_prefix<'a>( - &'a self, - prefix: &'a str, - ) -> impl Stream> + 'a { - sqlx::query!( - "SELECT path - FROM files - WHERE path LIKE $1 - ORDER BY path;", - format!("{}%", prefix.replace('%', "\\%")) - ) - .fetch(&self.pool) - .map_err(Into::into) - .map_ok(|row| row.path) - } - - pub(crate) async fn delete_prefix(&self, prefix: &str) -> Result<()> { - sqlx::query!( - "DELETE FROM files WHERE path LIKE $1;", - format!("{}%", prefix.replace('%', "\\%")) - ) - .execute(&self.pool) - .await?; - Ok(()) - } -} - -// The tests for this module are in src/storage/mod.rs, as part of the backend tests. Please add -// any test checking the public interface there. 
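With the database backend deleted and the storage settings moved into `docs_rs_storage::Config`, wiring outside of `Context` reduces to: build the storage config, build the async storage from it plus a meter provider, then use the re-exported path helpers. The following is only a sketch under the signatures shown in this diff (`Config::from_environment`, `AsyncStorage::new(config, &meter_provider)`, `rustdoc_json_path`, `AsyncStorage::exists`); the `anyhow` error plumbing is an assumption.

    use anyhow::Result;
    use docs_rs_opentelemetry::AnyMeterProvider;
    use docs_rs_storage::{
        AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, rustdoc_json_path,
    };
    use docs_rs_types::Version;
    use std::sync::Arc;

    /// Check whether a compressed rustdoc-JSON artifact exists in storage,
    /// built the same way the new `Context::from_config` builds its storage.
    async fn rustdoc_json_exists(
        meter_provider: &AnyMeterProvider,
        name: &str,
        version: &Version,
        target: &str,
    ) -> Result<bool> {
        // Presumably reads the storage-related settings (backend kind, S3
        // parameters, archive cache paths) that this diff removes from the
        // top-level Config.
        let storage_config = Arc::new(docs_rs_storage::Config::from_environment()?);
        let storage = AsyncStorage::new(storage_config, meter_provider).await?;

        // "latest" and numeric format versions both parse into RustdocJsonFormatVersion.
        let format: RustdocJsonFormatVersion = "latest".parse().expect("'latest' always parses");
        let path = rustdoc_json_path(
            name,
            version,
            target,
            format,
            Some(CompressionAlgorithm::Zstd),
        );

        Ok(storage.exists(&path).await?)
    }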
diff --git a/src/test/fakes.rs b/src/test/fakes.rs index 3263b3a16..5a0b864eb 100644 --- a/src/test/fakes.rs +++ b/src/test/fakes.rs @@ -1,21 +1,19 @@ use super::TestDatabase; use crate::{ - db::{ - file::{FileEntry, file_list_to_json}, - initialize_build, initialize_crate, initialize_release, update_build_status, - }, + db::{initialize_build, initialize_crate, initialize_release, update_build_status}, docbuilder::{DocCoverage, RUSTDOC_JSON_COMPRESSION_ALGORITHMS}, error::Result, - storage::{ - AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, compress, - rustdoc_archive_path, rustdoc_json_path, source_archive_path, - }, }; use anyhow::{Context, bail}; use base64::{Engine, engine::general_purpose::STANDARD as b64}; use chrono::{DateTime, Utc}; use docs_rs_cargo_metadata::{Dependency, MetadataPackage, Target}; use docs_rs_registry_api::{CrateData, CrateOwner, ReleaseData}; +use docs_rs_storage::{ + AsyncStorage, CompressionAlgorithm, FileEntry, RustdocJsonFormatVersion, + add_path_into_database, add_path_into_remote_archive, compress, file_list_to_json, + rustdoc_archive_path, rustdoc_json_path, source_archive_path, +}; use docs_rs_types::{BuildId, BuildStatus, ReleaseId, Version, VersionReq}; use std::{collections::HashMap, fmt, iter, sync::Arc}; use tracing::debug; @@ -415,15 +413,14 @@ impl<'a> FakeRelease<'a> { }; debug!("store in archive: {:?}", archive); let (files_list, new_alg) = - crate::db::add_path_into_remote_archive(storage, &archive, source_directory) - .await?; + add_path_into_remote_archive(storage, &archive, source_directory).await?; Ok((files_list, new_alg)) } else { let prefix = match kind { FileKind::Rustdoc => "rustdoc", FileKind::Sources => "sources", }; - crate::db::add_path_into_database( + add_path_into_database( storage, format!("{}/{}/{}/", prefix, package.name, package.version), source_directory, diff --git a/src/test/mod.rs b/src/test/mod.rs index b75097c2d..81cc72a0d 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -7,7 +7,6 @@ use crate::{ config::ConfigBuilder, db, error::Result, - storage::{AsyncStorage, Storage, StorageKind}, web::{build_axum_app, cache, page::TemplateData}, }; use anyhow::{Context as _, anyhow}; @@ -19,8 +18,9 @@ use docs_rs_fastly::Cdn; use docs_rs_headers::{IfNoneMatch, SURROGATE_CONTROL, SurrogateKeys}; use docs_rs_opentelemetry::{ AnyMeterProvider, - testing::{CollectedMetrics, setup_test_meter_provider}, + testing::{CollectedMetrics, TestMetrics}, }; +use docs_rs_storage::{AsyncStorage, Storage, StorageKind, testing::TestStorage}; use docs_rs_types::Version; use fn_error_context::context; use futures_util::stream::TryStreamExt; @@ -29,10 +29,9 @@ use http::{ header::{CACHE_CONTROL, CONTENT_TYPE}, }; use http_body_util::BodyExt; -use opentelemetry_sdk::metrics::InMemoryMetricExporter; use serde::de::DeserializeOwned; use sqlx::Connection as _; -use std::{collections::HashMap, fs, future::Future, panic, rc::Rc, str::FromStr, sync::Arc}; +use std::{collections::HashMap, fs, future::Future, panic, rc::Rc, sync::Arc}; use tokio::{runtime, task::block_in_place}; use tower::ServiceExt; use tracing::error; @@ -411,29 +410,19 @@ impl AxumRouterTestExt for axum::Router { } pub(crate) struct TestEnvironment { - // NOTE: the database has to come before the context, + // NOTE: the database & storage have to come before the context, // otherwise it can happen that we can't cleanup the test database // because the tokio runtime from the context is gone. 
db: TestDatabase, + _storage: TestStorage, pub context: Context, owned_runtime: Option>, - collected_metrics: InMemoryMetricExporter, + test_metrics: TestMetrics, } pub(crate) fn init_logger() { - use tracing_subscriber::{EnvFilter, filter::Directive}; - rustwide::logging::init_with(tracing_log::LogTracer::new()); - let subscriber = tracing_subscriber::FmtSubscriber::builder() - .with_env_filter( - EnvFilter::builder() - .with_default_directive(Directive::from_str("docs_rs=info").unwrap()) - .with_env_var("DOCSRS_LOG") - .from_env_lossy(), - ) - .with_test_writer() - .finish(); - let _ = tracing::subscriber::set_global_default(subscriber); + docs_rs_logging::testing::init(); } impl TestEnvironment { @@ -463,38 +452,40 @@ impl TestEnvironment { // create index directory fs::create_dir_all(config.registry_index_path.clone())?; - let (metric_exporter, meter_provider) = setup_test_meter_provider(); + let test_metrics = TestMetrics::new(); - let test_db = TestDatabase::new(&config, &meter_provider) + let test_db = TestDatabase::new(&config, test_metrics.provider()) .await .context("can't initialize test database")?; + let test_storage = + TestStorage::from_config(config.storage.clone(), test_metrics.provider()) + .await + .context("can't initialize test storage")?; + Ok(Self { - context: Context::from_test_config(config, meter_provider, test_db.pool().clone()) - .await?, + context: Context::from_test_config( + config, + test_metrics.provider().clone(), + test_db.pool().clone(), + test_storage.storage(), + ) + .await?, db: test_db, + _storage: test_storage, owned_runtime: None, - collected_metrics: metric_exporter, + test_metrics, }) } pub(crate) fn base_config() -> ConfigBuilder { - let mut database_config = - docs_rs_database::Config::from_environment().expect("can't load database config"); - // Use less connections for each test compared to production. - database_config.max_pool_size = 8; - database_config.min_pool_idle = 2; - Config::from_env() .expect("can't load base config from environment") - .database(database_config) - // Use the database for storage, as it's faster than S3. - .storage_backend(StorageKind::Database) - // Use a temporary S3 bucket. - .s3_bucket(format!("docsrs-test-bucket-{}", rand::random::())) - .s3_bucket_is_temporary(true) - .local_archive_cache_path( - std::env::temp_dir().join(format!("docsrs-test-index-{}", rand::random::())), + .database(docs_rs_database::Config::test_config().expect("can't load database config")) + .storage( + docs_rs_storage::Config::test_config(StorageKind::Memory) + .expect("can't load storage config") + .into(), ) // set stale content serving so Cache::ForeverInCdn and Cache::ForeverInCdnAndStaleInBrowser // are actually different. 
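Tests now take their file-size limits from the storage config rather than the top-level builder, via the `.set(...)` overrides used further down in this diff. A hedged sketch of that pattern follows; it assumes `test_config` and `set` behave as used here (likely behind the `testing` feature, like `StorageKind::Memory`), that the `max_file_size*` fields are public, and that the relocated `max_file_size_for` keeps the extension-based split of the removed `Config::max_file_size_for`.

    use anyhow::Result;
    use docs_rs_storage::{Config as StorageConfig, StorageKind};

    /// Build an in-memory test storage config with tightened size limits.
    fn tiny_test_storage_config() -> Result<StorageConfig> {
        let cfg = StorageConfig::test_config(StorageKind::Memory)?.set(|mut cfg| {
            cfg.max_file_size = 1024; // limit for non-HTML files
            cfg.max_file_size_html = 4 * 1024; // HTML gets its own, larger limit
            cfg
        });

        // Assuming the relocated helper keeps the HTML/non-HTML split of the
        // removed Config::max_file_size_for.
        assert_eq!(cfg.max_file_size_for("index.html"), 4 * 1024);
        assert_eq!(cfg.max_file_size_for("main.js"), 1024);
        Ok(cfg)
    }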
@@ -538,8 +529,7 @@ impl TestEnvironment { } pub(crate) fn collected_metrics(&self) -> CollectedMetrics { - self.context.meter_provider.force_flush().unwrap(); - CollectedMetrics(self.collected_metrics.get_finished_metrics().unwrap()) + self.test_metrics.collected_metrics() } pub(crate) async fn web_app(&self) -> Router { @@ -554,26 +544,6 @@ impl TestEnvironment { } } -impl Drop for TestEnvironment { - fn drop(&mut self) { - let storage = self.context.storage.clone(); - let runtime = self.runtime(); - - block_in_place(move || { - runtime.block_on(async move { - storage - .cleanup_after_test() - .await - .expect("failed to cleanup after tests"); - }); - }); - - if self.context.config.local_archive_cache_path.exists() { - fs::remove_dir_all(&self.context.config.local_archive_cache_path).unwrap(); - } - } -} - #[derive(Debug)] pub(crate) struct TestDatabase { pool: Pool, diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 065cf20d9..e1c3fac50 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -16,7 +16,6 @@ pub mod daemon; mod html; mod queue; pub(crate) mod queue_builder; -pub(crate) mod sized_buffer; use anyhow::Result; use serde::{Serialize, de::DeserializeOwned}; diff --git a/src/web/build_details.rs b/src/web/build_details.rs index f4a39fc2a..64685a5aa 100644 --- a/src/web/build_details.rs +++ b/src/web/build_details.rs @@ -1,5 +1,5 @@ use crate::{ - AsyncStorage, Config, impl_axum_webpage, + Config, impl_axum_webpage, web::{ MetaData, cache::CachePolicy, @@ -14,6 +14,7 @@ use anyhow::Context as _; use askama::Template; use axum::{extract::Extension, response::IntoResponse}; use chrono::{DateTime, Utc}; +use docs_rs_storage::AsyncStorage; use docs_rs_types::{BuildId, BuildStatus}; use futures_util::TryStreamExt; use serde::Deserialize; diff --git a/src/web/crate_details.rs b/src/web/crate_details.rs index b8edee94f..0efd2e391 100644 --- a/src/web/crate_details.rs +++ b/src/web/crate_details.rs @@ -1,6 +1,5 @@ use crate::{ - AsyncStorage, impl_axum_webpage, - storage::PathNotFoundError, + impl_axum_webpage, web::{ MatchedRelease, MetaData, cache::CachePolicy, @@ -23,6 +22,7 @@ use chrono::{DateTime, Utc}; use docs_rs_cargo_metadata::{Dependency, ReleaseDependencyList}; use docs_rs_headers::CanonicalUrl; use docs_rs_registry_api::OwnerKind; +use docs_rs_storage::{AsyncStorage, PathNotFoundError}; use docs_rs_types::{BuildId, BuildStatus, CrateId, KrateName, ReleaseId, ReqVersion, Version}; use futures_util::stream::TryStreamExt; use log::warn; diff --git a/src/web/error.rs b/src/web/error.rs index 01fc79281..94f25a9d7 100644 --- a/src/web/error.rs +++ b/src/web/error.rs @@ -1,7 +1,4 @@ -use crate::{ - storage::PathNotFoundError, - web::{AxumErrorPage, cache::CachePolicy, releases::Search}, -}; +use crate::web::{AxumErrorPage, cache::CachePolicy, releases::Search}; use anyhow::{Result, anyhow}; use axum::{ Json, @@ -9,6 +6,7 @@ use axum::{ response::{IntoResponse, Response as AxumResponse}, }; use docs_rs_database::PoolError; +use docs_rs_storage::PathNotFoundError; use docs_rs_uri::EscapedURI; use std::borrow::Cow; diff --git a/src/web/extractors/path.rs b/src/web/extractors/path.rs index 44a9a47aa..0263cc21f 100644 --- a/src/web/extractors/path.rs +++ b/src/web/extractors/path.rs @@ -1,8 +1,5 @@ //! 
custom axum extractors for path parameters -use crate::{ - storage::{CompressionAlgorithm, compression::compression_from_file_extension}, - web::error::AxumNope, -}; +use crate::web::error::AxumNope; use anyhow::anyhow; use axum::{ RequestPartsExt, @@ -10,14 +7,14 @@ use axum::{ http::request::Parts, }; use derive_more::Deref; +use docs_rs_storage::{CompressionAlgorithm, compression_from_file_extension}; /// custom axum `Path` extractor that uses our own AxumNope::BadRequest /// as error response instead of a plain text "bad request" #[allow(clippy::disallowed_types)] mod path_impl { - use serde::de::DeserializeOwned; - use super::*; + use serde::de::DeserializeOwned; #[derive(FromRequestParts)] #[from_request(via(axum::extract::Path), rejection(AxumNope))] diff --git a/src/web/extractors/rustdoc.rs b/src/web/extractors/rustdoc.rs index 7b18b3019..c34f58007 100644 --- a/src/web/extractors/rustdoc.rs +++ b/src/web/extractors/rustdoc.rs @@ -1,15 +1,13 @@ //! special rustdoc extractors -use crate::{ - storage::CompressionAlgorithm, - web::{MatchedRelease, MetaData, error::AxumNope, extractors::Path}, -}; +use crate::web::{MatchedRelease, MetaData, error::AxumNope, extractors::Path}; use anyhow::Result; use axum::{ RequestPartsExt, extract::{FromRequestParts, MatchedPath}, http::{Uri, request::Parts}, }; +use docs_rs_storage::CompressionAlgorithm; use docs_rs_types::{BuildId, KrateName, ReqVersion}; use docs_rs_uri::{EscapedURI, url_decode}; use itertools::Itertools as _; diff --git a/src/web/file.rs b/src/web/file.rs index 13fcea8b2..fc8dba13f 100644 --- a/src/web/file.rs +++ b/src/web/file.rs @@ -1,11 +1,7 @@ //! Database based file handler use super::cache::CachePolicy; -use crate::{ - Config, - error::Result, - storage::{AsyncStorage, Blob, StreamingBlob}, -}; +use crate::{Config, error::Result}; use axum::{ body::Body, extract::Extension, @@ -17,6 +13,7 @@ use axum_extra::{ headers::{ContentType, LastModified}, }; use docs_rs_headers::IfNoneMatch; +use docs_rs_storage::{AsyncStorage, Blob, StreamingBlob}; use std::time::SystemTime; use tokio_util::io::ReaderStream; use tracing::warn; @@ -36,13 +33,11 @@ impl File { path: &str, config: &Config, ) -> Result { - let max_size = if path.ends_with(".html") { - config.max_file_size_html - } else { - config.max_file_size - }; - - Ok(File(storage.get(path, max_size).await?)) + Ok(File( + storage + .get(path, config.storage.max_file_size_for(path)) + .await?, + )) } } @@ -135,12 +130,11 @@ impl StreamingFile { #[cfg(test)] mod tests { use super::*; - use crate::{ - storage::CompressionAlgorithm, test::TestEnvironment, web::cache::STATIC_ASSET_CACHE_POLICY, - }; + use crate::{test::TestEnvironment, web::cache::STATIC_ASSET_CACHE_POLICY}; use axum_extra::headers::{ETag, HeaderMapExt as _}; use chrono::Utc; use docs_rs_headers::compute_etag; + use docs_rs_storage::{CompressionAlgorithm, StorageKind}; use http::header::{CACHE_CONTROL, ETAG, LAST_MODIFIED}; use std::{io, rc::Rc}; @@ -259,8 +253,15 @@ mod tests { let env = Rc::new( TestEnvironment::with_config( TestEnvironment::base_config() - .max_file_size(MAX_SIZE) - .max_file_size_html(MAX_HTML_SIZE) + .storage( + docs_rs_storage::Config::test_config(StorageKind::Memory)? 
+ .set(|mut cfg| { + cfg.max_file_size = MAX_SIZE; + cfg.max_file_size_html = MAX_HTML_SIZE; + cfg + }) + .into(), + ) .build()?, ) .await?, @@ -299,7 +300,7 @@ mod tests { .unwrap_err() .downcast_ref::() .and_then(|io| io.get_ref()) - .and_then(|err| err.downcast_ref::()) + .and_then(|err| err.downcast_ref::()) .is_some() }; diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index 0d8ba07a4..47fc2a725 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -1,12 +1,7 @@ //! rustdoc handlerr use crate::{ - AsyncStorage, BUILD_VERSION, Config, RUSTDOC_STATIC_STORAGE_PREFIX, - storage::{ - CompressionAlgorithm, RustdocJsonFormatVersion, StreamingBlob, rustdoc_archive_path, - rustdoc_json_path, - }, - utils, + BUILD_VERSION, Config, RUSTDOC_STATIC_STORAGE_PREFIX, utils, web::{ MetaData, axum_cached_redirect, cache::{CachePolicy, STATIC_ASSET_CACHE_POLICY}, @@ -41,6 +36,10 @@ use axum_extra::{ use docs_rs_cargo_metadata::Dependency; use docs_rs_headers::{ETagComputer, IfNoneMatch, X_ROBOTS_TAG}; use docs_rs_registry_api::OwnerKind; +use docs_rs_storage::{ + AsyncStorage, CompressionAlgorithm, PathNotFoundError, RustdocJsonFormatVersion, StreamingBlob, + rustdoc_archive_path, rustdoc_json_path, +}; use docs_rs_types::{KrateName, ReqVersion}; use docs_rs_uri::EscapedURI; use http::{HeaderMap, HeaderValue, Uri, header::CONTENT_DISPOSITION, uri::Authority}; @@ -331,7 +330,7 @@ pub(crate) async fn rustdoc_redirector_handler( .into_response(if_none_match.as_deref(), STATIC_ASSET_CACHE_POLICY)), Err(err) => { if !matches!(err.downcast_ref(), Some(AxumNope::ResourceNotFound)) - && !matches!(err.downcast_ref(), Some(crate::storage::PathNotFoundError)) + && !matches!(err.downcast_ref(), Some(PathNotFoundError)) { error!(inner_path, ?err, "got error serving file"); } @@ -669,7 +668,7 @@ pub(crate) async fn rustdoc_html_server_handler( Ok(file) => file, Err(err) => { if !matches!(err.downcast_ref(), Some(AxumNope::ResourceNotFound)) - && !matches!(err.downcast_ref(), Some(crate::storage::PathNotFoundError)) + && !matches!(err.downcast_ref(), Some(PathNotFoundError)) { error!("got error serving {}: {}", storage_path, err); } @@ -972,7 +971,7 @@ pub(crate) async fn json_download_handler( StreamingFile(file).into_response(if_none_match.as_deref(), cache_policy), None, ), - Err(err) if matches!(err.downcast_ref(), Some(crate::storage::PathNotFoundError)) => { + Err(err) if matches!(err.downcast_ref(), Some(PathNotFoundError)) => { // we have old files on the bucket where we stored zstd compressed files, // with content-encoding=zstd & just a `.json` file extension. // As a fallback, we redirect to that, if zstd was requested (which is also the default). 
@@ -1074,7 +1073,6 @@ mod test { use crate::{ Config, docbuilder::{RUSTDOC_JSON_COMPRESSION_ALGORITHMS, read_format_version_from_rustdoc_json}, - storage::decompress, test::*, web::cache::CachePolicy, }; @@ -1082,28 +1080,16 @@ mod test { use chrono::{NaiveDate, Utc}; use docs_rs_cargo_metadata::Dependency; use docs_rs_registry_api::{CrateOwner, OwnerKind}; + use docs_rs_storage::{decompress, testing::check_archive_consistency}; use docs_rs_types::Version; use docs_rs_uri::encode_url_path; use kuchikiki::traits::TendrilSink; use pretty_assertions::assert_eq; use reqwest::StatusCode; - use std::{collections::BTreeMap, io, str::FromStr as _}; + use std::{collections::BTreeMap, str::FromStr as _}; use test_case::test_case; use tracing::info; - /// try decompressing the zip & read the content - fn check_archive_consistency(compressed_body: &[u8]) -> anyhow::Result<()> { - let mut zip = zip::ZipArchive::new(io::Cursor::new(compressed_body))?; - for i in 0..zip.len() { - let mut file = zip.by_index(i)?; - - let mut buf = Vec::new(); - io::copy(&mut file, &mut buf)?; - } - - Ok(()) - } - async fn try_latest_version_redirect( krate: &str, path: &str, diff --git a/src/web/source.rs b/src/web/source.rs index a2f99c890..3639c7890 100644 --- a/src/web/source.rs +++ b/src/web/source.rs @@ -1,6 +1,5 @@ use crate::{ - AsyncStorage, Config, impl_axum_webpage, - storage::PathNotFoundError, + Config, impl_axum_webpage, web::{ MetaData, cache::{CachePolicy, STATIC_ASSET_CACHE_POLICY}, @@ -20,6 +19,7 @@ use askama::Template; use axum::{Extension, response::IntoResponse}; use axum_extra::{TypedHeader, headers::HeaderMapExt}; use docs_rs_headers::{CanonicalUrl, IfNoneMatch}; +use docs_rs_storage::{AsyncStorage, PathNotFoundError}; use docs_rs_types::{BuildId, ReqVersion, Version}; use mime::Mime; use std::{cmp::Ordering, sync::Arc}; @@ -294,7 +294,7 @@ pub(crate) async fn source_browser_handler( )); return Ok(response); } else { - let max_file_size = config.max_file_size_for(&stream.path); + let max_file_size = config.storage.max_file_size_for(&stream.path); // otherwise we'll now download the content to render it into our template. match stream.materialize(max_file_size).await { @@ -313,7 +313,7 @@ pub(crate) async fn source_browser_handler( // if file is too large, set is_file_too_large to true if err.downcast_ref::().is_some_and(|err| { err.get_ref() - .map(|err| err.is::()) + .map(|err| err.is::()) .unwrap_or(false) }) => { @@ -369,6 +369,7 @@ mod tests { use anyhow::Result; use axum_extra::headers::{ContentType, ETag, HeaderMapExt as _}; use docs_rs_headers::IfNoneMatch; + use docs_rs_storage::StorageKind; use docs_rs_types::KrateName; use docs_rs_uri::encode_url_path; use kuchikiki::traits::TendrilSink; @@ -856,8 +857,15 @@ mod tests { async fn large_file_test() -> Result<()> { let env = TestEnvironment::with_config( TestEnvironment::base_config() - .max_file_size(1) - .max_file_size_html(1) + .storage( + docs_rs_storage::Config::test_config(StorageKind::Memory)? + .set(|mut cfg| { + cfg.max_file_size = 1; + cfg.max_file_size_html = 1; + cfg + }) + .into(), + ) .build()?, ) .await?;