Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ bug_fixes:
- area: oauth2
change: |
Fixed OAuth2 refresh requests so host rewriting no longer overrides the original ``Host`` header value.
- area: watch-dog
change: |
Fixed a bug where the worker-threads' watch-dogs were configured using the main-thread's configuration.
This change can be reverted by setting the runtime guard ``envoy.restart_features.worker_threads_watchdog_fix``
to ``false``.
- area: ext_proc
change: |
Fixed a bug to support two ext_proc filters configured in the chain. This change can be reverted by setting
Expand Down
3 changes: 2 additions & 1 deletion mobile/library/common/engine_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ class ServerLite : public Server::InstanceBase {
std::unique_ptr<Envoy::Server::OverloadManager> createNullOverloadManager() override {
return std::make_unique<Envoy::Server::NullOverloadManager>(threadLocal(), true);
}
std::unique_ptr<Server::GuardDog> maybeCreateGuardDog(absl::string_view) override {
std::unique_ptr<Server::GuardDog>
maybeCreateGuardDog(absl::string_view, const Server::Configuration::Watchdog&) override {
return nullptr;
}
std::unique_ptr<Server::HdsDelegateApi>
Expand Down
1 change: 1 addition & 0 deletions source/common/runtime/runtime_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ RUNTIME_GUARD(envoy_restart_features_move_locality_schedulers_to_lb);
RUNTIME_GUARD(envoy_restart_features_raise_file_limits);
RUNTIME_GUARD(envoy_restart_features_use_eds_cache_for_ads);
RUNTIME_GUARD(envoy_restart_features_validate_http3_pseudo_headers);
RUNTIME_GUARD(envoy_restart_features_worker_threads_watchdog_fix);

// Begin false flags. Most of them should come with a TODO to flip true.

Expand Down
7 changes: 4 additions & 3 deletions source/server/instance_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ std::unique_ptr<OverloadManager> InstanceImpl::createNullOverloadManager() {
return std::make_unique<NullOverloadManager>(threadLocal(), false);
}

std::unique_ptr<Server::GuardDog> InstanceImpl::maybeCreateGuardDog(absl::string_view name) {
return std::make_unique<Server::GuardDogImpl>(*stats().rootScope(),
config().mainThreadWatchdogConfig(), api(), name);
std::unique_ptr<Server::GuardDog>
InstanceImpl::maybeCreateGuardDog(absl::string_view name,
const Server::Configuration::Watchdog& config) {
return std::make_unique<Server::GuardDogImpl>(*stats().rootScope(), config, api(), name);
}

std::unique_ptr<HdsDelegateApi>
Expand Down
4 changes: 3 additions & 1 deletion source/server/instance_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ class InstanceImpl : public InstanceBase {
void maybeCreateHeapShrinker() override;
absl::StatusOr<std::unique_ptr<OverloadManager>> createOverloadManager() override;
std::unique_ptr<OverloadManager> createNullOverloadManager() override;
std::unique_ptr<Server::GuardDog> maybeCreateGuardDog(absl::string_view name) override;
std::unique_ptr<Server::GuardDog>
maybeCreateGuardDog(absl::string_view name,
const Server::Configuration::Watchdog& config) override;
std::unique_ptr<HdsDelegateApi>
maybeCreateHdsDelegate(Configuration::ServerFactoryContext& server_context, Stats::Scope& scope,
Grpc::RawAsyncClientPtr&& async_client, Envoy::Stats::Store& stats,
Expand Down
8 changes: 6 additions & 2 deletions source/server/server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -864,8 +864,12 @@ absl::Status InstanceBase::initializeOrThrow(Network::Address::InstanceConstShar

// GuardDog (deadlock detection) object and thread setup before workers are
// started and before our own run() loop runs.
main_thread_guard_dog_ = maybeCreateGuardDog("main_thread");
worker_guard_dog_ = maybeCreateGuardDog("workers");
main_thread_guard_dog_ = maybeCreateGuardDog("main_thread", config_.mainThreadWatchdogConfig());
if (Runtime::runtimeFeatureEnabled("envoy.restart_features.worker_threads_watchdog_fix")) {
worker_guard_dog_ = maybeCreateGuardDog("workers", config_.workerWatchdogConfig());
} else {
worker_guard_dog_ = maybeCreateGuardDog("workers", config_.mainThreadWatchdogConfig());
}
return absl::OkStatus();
}

Expand Down
6 changes: 2 additions & 4 deletions source/server/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ class InstanceBase : Logger::Loggable<Logger::Id::main>,
virtual void maybeCreateHeapShrinker() PURE;
virtual absl::StatusOr<std::unique_ptr<OverloadManager>> createOverloadManager() PURE;
virtual std::unique_ptr<OverloadManager> createNullOverloadManager() PURE;
virtual std::unique_ptr<Server::GuardDog> maybeCreateGuardDog(absl::string_view name) PURE;
virtual std::unique_ptr<Server::GuardDog>
maybeCreateGuardDog(absl::string_view name, const Server::Configuration::Watchdog& config) PURE;
virtual std::unique_ptr<HdsDelegateApi>
maybeCreateHdsDelegate(Configuration::ServerFactoryContext& server_context, Stats::Scope& scope,
Grpc::RawAsyncClientPtr&& async_client, Envoy::Stats::Store& stats,
Expand Down Expand Up @@ -343,9 +344,6 @@ class InstanceBase : Logger::Loggable<Logger::Id::main>,
ServerLifecycleNotifier::HandlePtr
registerCallback(Stage stage, StageCallbackWithCompletion callback) override;

protected:
const Configuration::MainImpl& config() { return config_; }

private:
Network::DnsResolverSharedPtr getOrCreateDnsResolver();

Expand Down
1 change: 1 addition & 0 deletions test/integration/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1830,6 +1830,7 @@ envoy_cc_test(
":base_overload_integration_test_lib",
":http_protocol_integration_lib",
"//test/common/config:dummy_config_proto_cc_proto",
"//test/integration/filters:block_filter_lib",
"//test/test_common:test_runtime_lib",
"@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto",
"@envoy_api//envoy/config/overload/v3:pkg_cc_proto",
Expand Down
19 changes: 19 additions & 0 deletions test/integration/filters/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,25 @@ envoy_cc_test_library(
],
)

envoy_proto_library(
name = "block_filter_proto",
srcs = ["block_filter.proto"],
)

envoy_cc_test_library(
name = "block_filter_lib",
srcs = ["block_filter.cc"],
deps = [
":block_filter_proto_cc_proto",
"//envoy/http:filter_interface",
"//envoy/registry",
"//envoy/server:filter_config_interface",
"//source/common/protobuf:utility_lib",
"//source/extensions/filters/http/common:factory_base_lib",
"//source/extensions/filters/http/common:pass_through_filter_lib",
],
)

envoy_proto_library(
name = "crash_filter_proto",
srcs = ["crash_filter.proto"],
Expand Down
68 changes: 68 additions & 0 deletions test/integration/filters/block_filter.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <chrono>
#include <thread>

#include "envoy/http/filter.h"
#include "envoy/registry/registry.h"
#include "envoy/server/filter_config.h"

#include "source/common/protobuf/utility.h"
#include "source/extensions/filters/http/common/factory_base.h"
#include "source/extensions/filters/http/common/pass_through_filter.h"

#include "test/integration/filters/block_filter.pb.h"
#include "test/integration/filters/block_filter.pb.validate.h"

#include "absl/time/time.h"

namespace Envoy {

/**
* A test filter that blocks the thread for a configured duration in decodeHeaders.
* This is useful for triggering watchdog events in integration tests.
* Note that the filter uses real thread sleep to make sure the watchdog thread
* picks up the issue).
*/
class BlockFilter : public Http::PassThroughFilter {
public:
BlockFilter(std::chrono::milliseconds block_duration) : block_duration_(block_duration) {}

Http::FilterHeadersStatus decodeHeaders(Http::RequestHeaderMap&, bool) override {
if (block_duration_.count() > 0) {
// Blocking the thread synchronously to simulate a non-responsive thread.
// We use sleep_for here instead of advanceTimeWait because the watchdog runs on its own
// thread and needs real time to elapse to trigger a miss/megamiss event when it checks
// on the responsiveness of the worker threads.
ENVOY_LOG_MISC(info, "BlockFilter: starting sleep for {}ms", block_duration_.count());
absl::SleepFor(absl::Milliseconds(block_duration_.count()));
ENVOY_LOG_MISC(info, "BlockFilter: finished sleep");
}
return Http::FilterHeadersStatus::Continue;
}

private:
const std::chrono::milliseconds block_duration_;
};

/**
* Factory for BlockFilter.
*/
class BlockFilterFactory : public Extensions::HttpFilters::Common::DualFactoryBase<
test::integration::filters::BlockFilterConfig> {
public:
BlockFilterFactory() : DualFactoryBase("block-filter") {}

private:
absl::StatusOr<Http::FilterFactoryCb> createFilterFactoryFromProtoTyped(
const test::integration::filters::BlockFilterConfig& proto_config, const std::string&,
DualInfo, Server::Configuration::ServerFactoryContext&) override {
auto block_duration = std::chrono::milliseconds(
DurationUtil::durationToMilliseconds(proto_config.block_duration()));
return [block_duration](Http::FilterChainFactoryCallbacks& callbacks) -> void {
callbacks.addStreamFilter(std::make_shared<BlockFilter>(block_duration));
};
}
};

REGISTER_FACTORY(BlockFilterFactory, Server::Configuration::NamedHttpFilterConfigFactory);

} // namespace Envoy
13 changes: 13 additions & 0 deletions test/integration/filters/block_filter.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
syntax = "proto3";

package test.integration.filters;

import "google/protobuf/duration.proto";

// Configuration for BlockFilter that blocks the worker thread for a specific
// time (using real thread sleep)..
message BlockFilterConfig {
// Duration to block the thread in decodeHeaders.
// If the value is 0, the thread will not wait.
google.protobuf.Duration block_duration = 1;
}
71 changes: 71 additions & 0 deletions test/integration/overload_integration_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "source/common/protobuf/utility.h"

#include "test/integration/base_overload_integration_test.h"
#include "test/integration/filters/block_filter.pb.h"
#include "test/integration/http_protocol_integration.h"
#include "test/integration/ssl_utility.h"
#include "test/test_common/test_runtime.h"
Expand Down Expand Up @@ -1491,4 +1492,74 @@ TEST_P(LoadShedPointIntegrationTest, Http3ServerDispatchSendsGoAwayCompletingPen
0);
}

// Verifies that worker thread watchdog configuration is correctly applied and triggers megamiss
// events when a worker thread is non-responsive.
TEST_P(OverloadIntegrationTest, WorkerWatchdogMegaMiss) {
config_helper_.addConfigModifier([](envoy::config::bootstrap::v3::Bootstrap& bootstrap) {
auto* watchdogs = bootstrap.mutable_watchdogs();
// Configure a short megamiss timeout for workers.
watchdogs->mutable_worker_watchdog()->mutable_megamiss_timeout()->set_nanos(100 * 1000 * 1000);
// Configure a long megamiss timeout for the main thread to avoid accidental triggers.
watchdogs->mutable_main_thread_watchdog()->mutable_megamiss_timeout()->set_seconds(60);
});

// Use BlockFilter to block the worker thread for 400ms, which is longer than the megamiss
// timeout.
config_helper_.prependFilter(
absl::StrCat("name: block-filter\ntyped_config: \n",
" \"@type\": type.googleapis.com/test.integration.filters.BlockFilterConfig\n",
" block_duration: 0.4s\n"));

initialize();

codec_client_ = makeHttpConnection(makeClientConnection((lookupPort("http"))));
auto response = codec_client_->makeHeaderOnlyRequest(default_request_headers_);

// Verify that the worker-specific megamiss counter is incremented.
test_server_->waitForCounterGe("server.worker_0.watchdog_mega_miss", 1);
// Verify that the global workers megamiss counter is incremented.
test_server_->waitForCounterGe("workers.watchdog_mega_miss", 1);

EXPECT_TRUE(response->waitForEndStream(std::chrono::seconds(20)));
EXPECT_TRUE(response->complete());
}

// Verifies that when the runtime guard is disabled, worker threads fallback to the main thread
// watchdog configuration.
TEST_P(OverloadIntegrationTest, WorkerWatchdogMegaMissDisabled) {
TestScopedRuntime scoped_runtime;
scoped_runtime.mergeValues({{"envoy.restart_features.worker_threads_watchdog_fix", "false"}});

config_helper_.addConfigModifier([](envoy::config::bootstrap::v3::Bootstrap& bootstrap) {
auto* watchdogs = bootstrap.mutable_watchdogs();
// Configure a short megamiss timeout for workers.
watchdogs->mutable_worker_watchdog()->mutable_megamiss_timeout()->set_nanos(100 * 1000 * 1000);
// Configure a long megamiss timeout for the main thread.
// Since the fix is disabled, workers should use this long timeout.
watchdogs->mutable_main_thread_watchdog()->mutable_megamiss_timeout()->set_seconds(60);
});

// Use BlockFilter to block the worker thread for 400ms.
config_helper_.prependFilter(
absl::StrCat("name: block-filter\ntyped_config: \n",
" \"@type\": type.googleapis.com/test.integration.filters.BlockFilterConfig\n",
" block_duration: 0.4s\n"));

initialize();

codec_client_ = makeHttpConnection(makeClientConnection((lookupPort("http"))));
auto response = codec_client_->makeHeaderOnlyRequest(default_request_headers_);

// Since workers are using the main thread watchdog config (60s timeout),
// a 400ms block should NOT trigger a megamiss.
// We wait a bit to be sure, then check the counter is still 0.
absl::SleepFor(absl::Milliseconds(600));

EXPECT_EQ(test_server_->counter("server.worker_0.watchdog_mega_miss")->value(), 0);
EXPECT_EQ(test_server_->counter("workers.watchdog_mega_miss")->value(), 0);

EXPECT_TRUE(response->waitForEndStream(std::chrono::seconds(20)));
EXPECT_TRUE(response->complete());
}

} // namespace Envoy
Loading