Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/erlang-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ jobs:
thrift-version: ${{ needs.setup.outputs.thrift-version }}
run-ct-with-compose: true
use-coveralls: true
cache-version: v9
cache-version: v10
upload-coverage: false
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,8 @@ distclean: clean-build-image

test: eunit common-test

dbg:
$(REBAR) ct -v --suite=apps/hellgate/test/hg_invoice_tests_SUITE.erl --group=all_non_destructive_tests --case=payment_big_cascade_success

cover-report:
$(REBAR) cover
4 changes: 3 additions & 1 deletion apps/hellgate/src/hellgate.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
limiter_proto,
opentelemetry_api,
opentelemetry_exporter,
opentelemetry
opentelemetry,
opentelemetry_experimental,
opentelemetry_api_experimental
]},
{env, []},
{modules, []},
Expand Down
61 changes: 59 additions & 2 deletions apps/hellgate/src/hellgate.erl
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,72 @@ get_prometheus_route() ->

%% Application start callback.
%% Calls ensure_otel_log_handler/0 first; only when it returns `ok' are the
%% metrics collectors set up and the root supervisor started. When it returns
%% `{error, Reason}' the failure is logged and returned to the caller, so the
%% supervisor is never started in that case.
-spec start(normal, any()) -> {ok, pid()} | {error, any()}.
start(_StartType, _StartArgs) ->
ok = setup_metrics(),
supervisor:start_link(?MODULE, []).
case ensure_otel_log_handler() of
ok ->
ok = setup_metrics(),
supervisor:start_link(?MODULE, []);
{error, Reason} ->
logger:error("Failed to add otel_logs handler: ~p", [Reason]),
{error, Reason}
end.

%% Application stop callback: gives the OTEL log exporter a chance to ship
%% buffered records before the application goes down. Crashes with `badmatch'
%% if flushing reports anything other than `ok'.
-spec stop(any()) -> ok.
stop(_State) ->
    ok = flush_otel_logs().

%%

%% Registers the woody ranch and hackney Prometheus collectors.
%% Each setup call is asserted to return `ok'; any other result crashes the caller.
setup_metrics() ->
    ok = woody_ranch_prometheus_collector:setup(),
    ok = woody_hackney_prometheus_collector:setup(),
    ok.

%% Makes sure a logger handler with id `otel_logs' is installed.
%% Returns `ok' when the handler is already present or was added successfully,
%% and `{error, {otel_log_handler_failed, Reason}}' when logger refuses to add it.
ensure_otel_log_handler() ->
    case logger:get_handler_config(otel_logs) of
        {ok, _Existing} ->
            ok;
        _NotInstalled ->
            add_otel_log_handler()
    end.

%% Builds the handler configuration from the `hellgate' application environment
%% (falling back to built-in defaults) and registers `otel_log_handler' with logger.
add_otel_log_handler() ->
    Env = fun(Key, Default) -> application:get_env(hellgate, Key, Default) end,
    ExporterOpts = #{
        protocol => http_protobuf,
        ssl_options => []
    },
    HandlerConfig = #{
        level => Env(otel_log_level, debug),
        config => #{
            exporter => {otel_exporter_logs_otlp, ExporterOpts},
            max_queue_size => Env(otel_log_max_queue_size, 2048),
            scheduled_delay_ms => Env(otel_log_scheduled_delay_ms, 1000),
            exporting_timeout_ms => Env(otel_log_exporting_timeout_ms, 300000)
        }
    },
    case logger:add_handler(otel_logs, otel_log_handler, HandlerConfig) of
        {error, {already_exist, _}} ->
            %% Someone else installed it between the lookup and the add: fine.
            ok;
        {error, Reason} ->
            {error, {otel_log_handler_failed, Reason}};
        ok ->
            ok
    end.

%% Extra time allowed on top of one batching cycle for the network export itself.
-define(FLUSH_EXPORT_OVERHEAD_MS, 700).
%% Upper bound on how long application stop may be delayed by the flush wait.
-define(FLUSH_MAX_WAIT_MS, 5000).

%% @doc Waits for buffered OTEL logs to be exported before the application stops.
%% otel_log_handler batches log records and sends them on a timer
%% (scheduled_delay_ms). Per the original author's note it exposes no explicit
%% flush API, so we wait for one full batching cycle plus a margin for the
%% network export (export overhead), capped at ?FLUSH_MAX_WAIT_MS.
%%
%% Returns `ok' immediately when no `otel_logs' handler is installed.
flush_otel_logs() ->
    case logger:get_handler_config(otel_logs) of
        {ok, HandlerCfg} ->
            _ = logger:info("Waiting for OTEL logs exporter to flush"),
            %% The batching delay is stored under the nested `config' key of the
            %% handler config (that is where ensure_otel_log_handler/0 puts it),
            %% not at the top level of the map returned by logger.
            DelayMs =
                case HandlerCfg of
                    #{config := #{scheduled_delay_ms := Ms}} when is_integer(Ms), Ms >= 0 ->
                        Ms;
                    _ ->
                        1000
                end,
            timer:sleep(erlang:min(?FLUSH_MAX_WAIT_MS, DelayMs + ?FLUSH_EXPORT_OVERHEAD_MS)),
            ok;
        _ ->
            ok
    end.
2 changes: 1 addition & 1 deletion apps/hellgate/test/hg_ct_helper.erl
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ start_app(party_client = AppName) ->
{
start_app(AppName, [
{services, #{
party_management => "http://party-management:8022/v1/processing/partymgmt"
party_management => <<"http://party-management:8022/v1/processing/partymgmt">>
}},
{woody, #{
% disabled | safe | aggressive
Expand Down
3 changes: 3 additions & 0 deletions apps/hellgate/test/hg_invoice_tests_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,9 @@ groups() ->

-spec init_per_suite(config()) -> config().
init_per_suite(C) ->
%% NOTE Comment out primary logger's level change to revert to default
%% verbosity of info level.
ok = logger:set_primary_config(level, debug),
% _ = dbg:tracer(),
% _ = dbg:p(all, c),
% _ = dbg:tpl({'hg_invoice_payment', 'p', '_'}, x),
Expand Down
18 changes: 18 additions & 0 deletions apps/hg_progressor/src/hg_progressor.erl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

-include_lib("mg_proto/include/mg_proto_state_processing_thrift.hrl").
-include_lib("progressor/include/progressor.hrl").
-include_lib("opentelemetry_api/include/opentelemetry.hrl").

%% automaton call wrapper
-export([call_automaton/2]).
Expand Down Expand Up @@ -132,12 +133,29 @@ process({CallType, BinArgs, Process}, #{ns := NS} = Options, BinCtx) ->
Args = marshal(args, {CallType, BinArgs, Machine}),
WoodyContext = hg_woody_service_wrapper:ensure_woody_deadline_set(WoodyContext0, Options),
ok = hg_context:save(hg_woody_service_wrapper:create_context(WoodyContext, Options)),
%% attach_otel_context выше восстанавливает родительский span из RPC; start_span использует текущий контекст
Tracer = opentelemetry:get_application_tracer(?MODULE),
SpanCtx = otel_tracer:start_span(Tracer, mk_span_name(Func, NS), #{kind => ?SPAN_KIND_INTERNAL}),
_ = otel_tracer:set_current_span(SpanCtx),
try
handle_result(hg_machine:handle_function(Func, {Args}, Options), LastEventID)
after
_ = otel_span:end_span(SpanCtx, undefined),
hg_context:cleanup()
end.

%% Builds the span name "<operation> <namespace>" as a binary.
%% Known processor functions map to short operation names; any other function
%% atom is used verbatim.
mk_span_name(Func, NS) ->
    <<(span_prefix(Func))/binary, " ", (ns_to_binary(NS))/binary>>.

%% Short operation label for a processor function.
span_prefix('ProcessSignal') -> <<"signal">>;
span_prefix('ProcessCall') -> <<"call">>;
span_prefix('ProcessRepair') -> <<"repair">>;
span_prefix(Func) -> atom_to_binary(Func).

%% Namespace as a binary; accepts an atom or an already-binary namespace.
ns_to_binary(NS) when is_binary(NS) -> NS;
ns_to_binary(NS) when is_atom(NS) -> atom_to_binary(NS).

%% Internal functions

decode_rpc_context(<<>>) ->
Expand Down
96 changes: 78 additions & 18 deletions compose.tracing.yaml
Original file line number Diff line number Diff line change
@@ -1,42 +1,102 @@
# UI: Grafana http://localhost:3000 (admin/admin)
services:
# OpenTelemetry Collector: single OTLP endpoint, fans out to Tempo + Loki
otel-collector:
image: otel/opentelemetry-collector-contrib:0.112.0
command: ["--config=/etc/otel/config.yaml"]
volumes:
- ./test/tracing/otel-collector-config.yaml:/etc/otel/config.yaml:ro
ports:
- "4317:4317" # OTLP gRPC
- "4318:4318" # OTLP HTTP
healthcheck:
test: ["CMD", "/otelcol-contrib", "--version"]
interval: 5s
timeout: 2s
retries: 20
start_period: 5s
depends_on:
tempo:
condition: service_healthy
loki:
condition: service_healthy

dmt:
environment: &otlp_enabled
OTEL_TRACES_EXPORTER: otlp
OTEL_LOGS_EXPORTER: otlp
OTEL_TRACES_SAMPLER: parentbased_always_off
OTEL_EXPORTER_OTLP_PROTOCOL: http_protobuf
OTEL_EXPORTER_OTLP_ENDPOINT: http://jaeger:4318
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318
OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: http://otel-collector:4318/v1/logs
OTEL_SERVICE_NAME: dmt

bender:
environment: *otlp_enabled
environment:
<<: *otlp_enabled
OTEL_SERVICE_NAME: bender

limiter:
environment: *otlp_enabled
environment:
<<: *otlp_enabled
OTEL_SERVICE_NAME: limiter

party-management:
environment: *otlp_enabled
environment:
<<: *otlp_enabled
OTEL_SERVICE_NAME: party-management

testrunner:
environment:
<<: *otlp_enabled
OTEL_SERVICE_NAME: hellgate_testrunner
OTEL_SERVICE_NAME: hellgate
OTEL_TRACES_SAMPLER: parentbased_always_on
depends_on:
jaeger:
otel-collector:
condition: service_healthy
grafana:
condition: service_started

jaeger:
image: jaegertracing/all-in-one:1.47
environment:
- COLLECTOR_OTLP_ENABLED=true
tempo:
image: grafana/tempo:2.6.1
command: ["-config.file=/etc/tempo.yaml"]
volumes:
- ./test/tracing/tempo.yaml:/etc/tempo.yaml:ro
ports:
- 3200:3200
healthcheck:
test: "/go/bin/all-in-one-linux status"
interval: 2s
timeout: 1s
test: ["CMD-SHELL", "wget -q -O- http://localhost:3200/ready || exit 1"]
interval: 5s
timeout: 2s
retries: 20
start_period: 5s

loki:
image: grafana/loki:3.1.1
command: ["-config.file=/etc/loki/config.yaml"]
volumes:
- ./test/tracing/loki.yaml:/etc/loki/config.yaml:ro
ports:
- 4317:4317 # OTLP gRPC receiver
- 4318:4318 # OTLP http receiver
- 5778:5778
- 14250:14250
- 16686:16686
- 3100:3100
healthcheck:
test: ["CMD-SHELL", "wget -q -O- http://localhost:3100/ready || exit 1"]
interval: 5s
timeout: 2s
retries: 20
start_period: 5s

grafana:
image: grafana/grafana:12.3.3
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
volumes:
- ./test/tracing/grafana/provisioning:/etc/grafana/provisioning:ro
ports:
- 3000:3000
depends_on:
loki:
condition: service_healthy
tempo:
condition: service_healthy
11 changes: 10 additions & 1 deletion config/sys.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
{storage, scoper_storage_logger}
]},

{opentelemetry, [
{processors, [{otel_batch_processor, #{scheduled_delay_ms => 1000}}]}
]},

{hg_proto, [
{services, #{
limiter => "http://limiter:8022/v1/limiter",
Expand All @@ -43,6 +47,11 @@
% Should be greater than any other timeouts
idle_timeout => infinity
}},
%% OTEL log handler configuration
{otel_log_level, info},
{otel_log_max_queue_size, 2048},
{otel_log_scheduled_delay_ms, 1000},
{otel_log_exporting_timeout_ms, 300000},
{scoper_event_handler_options, #{
event_handler_opts => #{
formatter_opts => #{
Expand Down Expand Up @@ -162,7 +171,7 @@

{party_client, [
{services, #{
party_management => "http://party-management:8022/v1/processing/partymgmt"
party_management => <<"http://party-management:8022/v1/processing/partymgmt">>
}},
{woody, #{
% disabled | safe | aggressive
Expand Down
28 changes: 24 additions & 4 deletions rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,32 @@
{prometheus, "4.11.0"},
{prometheus_cowboy, "0.1.9"},

%% OpenTelemetry deps
{opentelemetry_api, "1.4.0"},
{opentelemetry, "1.5.0"},
{opentelemetry_exporter, "1.8.0"},
%% OpenTelemetry deps.
{opentelemetry_api, "1.5.0"},
{opentelemetry, "1.7.0"},
{opentelemetry_exporter,
{git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git",
{branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}},
{opentelemetry_api_experimental, "0.5.1"},
{opentelemetry_experimental,
{git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git",
{branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}},
{eqwalizer_support,
{git_subdir, "https://github.com/whatsapp/eqwalizer.git", {branch, "main"}, "eqwalizer_support"}}
]}.

%% opentelemetry_experimental из git требует ~> 0.5.2 и старые otel (1.4/1.5),
%% но opentelemetry_exporter требует 1.5.0/1.7.0. Переопределяем на новые версии.
{overrides, [
{override, opentelemetry_experimental, [
{deps, [
{opentelemetry_api, "1.5.0"},
{opentelemetry, "1.7.0"},
{opentelemetry_api_experimental, "0.5.1"}
]}
]}
]}.

{xref_checks, [
% mandatory
undefined_function_calls,
Expand Down Expand Up @@ -89,7 +107,9 @@
{runtime_tools, load},
{tools, load},
{canal, load},
opentelemetry_exporter,
{opentelemetry, temporary},
opentelemetry_experimental,
logger_logstash_formatter,
sasl,
herd,
Expand Down
Loading
Loading