mirror of
https://github.com/bitwarden/server
synced 2025-12-31 07:33:43 +00:00
Refactor IntegrationHandlerResult to provide more detail around failures (#6736)
* Refactor IntegrationHandlerResult to provide more detail around failures * ServiceUnavailable now retryable, more explicit http status handling, more consistency with different handlers, additional xmldocs * Address PR feedback
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
namespace Bit.Core.AdminConsole.Models.Data.EventIntegrations;
|
||||
|
||||
/// <summary>
|
||||
/// Categories of event integration failures used for classification and retry logic.
|
||||
/// </summary>
|
||||
public enum IntegrationFailureCategory
|
||||
{
|
||||
/// <summary>
|
||||
/// Service is temporarily unavailable (503, upstream outage, maintenance).
|
||||
/// </summary>
|
||||
ServiceUnavailable,
|
||||
|
||||
/// <summary>
|
||||
/// Authentication failed (401, 403, invalid_auth, token issues).
|
||||
/// </summary>
|
||||
AuthenticationFailed,
|
||||
|
||||
/// <summary>
|
||||
/// Configuration error (invalid config, channel_not_found, etc.).
|
||||
/// </summary>
|
||||
ConfigurationError,
|
||||
|
||||
/// <summary>
|
||||
/// Rate limited (429, rate_limited).
|
||||
/// </summary>
|
||||
RateLimited,
|
||||
|
||||
/// <summary>
|
||||
/// Transient error (timeouts, 500, network errors).
|
||||
/// </summary>
|
||||
TransientError,
|
||||
|
||||
/// <summary>
|
||||
/// Permanent failure unrelated to authentication/config (e.g., unrecoverable payload/format issue).
|
||||
/// </summary>
|
||||
PermanentFailure
|
||||
}
|
||||
@@ -1,16 +1,84 @@
|
||||
namespace Bit.Core.AdminConsole.Models.Data.EventIntegrations;
|
||||
|
||||
/// <summary>
|
||||
/// Represents the result of an integration handler operation, including success status,
|
||||
/// failure categorization, and retry metadata. Use the <see cref="Succeed"/> factory method
|
||||
/// for successful operations or <see cref="Fail"/> for failures with automatic retry-ability
|
||||
/// determination based on the failure category.
|
||||
/// </summary>
|
||||
public class IntegrationHandlerResult
|
||||
{
|
||||
public IntegrationHandlerResult(bool success, IIntegrationMessage message)
|
||||
/// <summary>
|
||||
/// True if the integration send succeeded, false otherwise.
|
||||
/// </summary>
|
||||
public bool Success { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The integration message that was processed.
|
||||
/// </summary>
|
||||
public IIntegrationMessage Message { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional UTC date/time indicating when a failed operation should be retried.
|
||||
/// Will be used by the retry queue to delay re-sending the message.
|
||||
/// Usually set based on the Retry-After header from rate-limited responses.
|
||||
/// </summary>
|
||||
public DateTime? DelayUntilDate { get; private init; }
|
||||
|
||||
/// <summary>
|
||||
/// Category of the failure. Null for successful results.
|
||||
/// </summary>
|
||||
public IntegrationFailureCategory? Category { get; private init; }
|
||||
|
||||
/// <summary>
|
||||
/// Detailed failure reason or error message. Empty for successful results.
|
||||
/// </summary>
|
||||
public string? FailureReason { get; private init; }
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the operation is retryable.
|
||||
/// Computed from the failure category.
|
||||
/// </summary>
|
||||
public bool Retryable => Category switch
|
||||
{
|
||||
IntegrationFailureCategory.RateLimited => true,
|
||||
IntegrationFailureCategory.TransientError => true,
|
||||
IntegrationFailureCategory.ServiceUnavailable => true,
|
||||
IntegrationFailureCategory.AuthenticationFailed => false,
|
||||
IntegrationFailureCategory.ConfigurationError => false,
|
||||
IntegrationFailureCategory.PermanentFailure => false,
|
||||
null => false,
|
||||
_ => false
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Creates a successful result.
|
||||
/// </summary>
|
||||
public static IntegrationHandlerResult Succeed(IIntegrationMessage message)
|
||||
{
|
||||
return new IntegrationHandlerResult(success: true, message: message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a failed result with a failure category and reason.
|
||||
/// </summary>
|
||||
public static IntegrationHandlerResult Fail(
|
||||
IIntegrationMessage message,
|
||||
IntegrationFailureCategory category,
|
||||
string failureReason,
|
||||
DateTime? delayUntil = null)
|
||||
{
|
||||
return new IntegrationHandlerResult(success: false, message: message)
|
||||
{
|
||||
Category = category,
|
||||
FailureReason = failureReason,
|
||||
DelayUntilDate = delayUntil
|
||||
};
|
||||
}
|
||||
|
||||
private IntegrationHandlerResult(bool success, IIntegrationMessage message)
|
||||
{
|
||||
Success = success;
|
||||
Message = message;
|
||||
}
|
||||
|
||||
public bool Success { get; set; } = false;
|
||||
public bool Retryable { get; set; } = false;
|
||||
public IIntegrationMessage Message { get; set; }
|
||||
public DateTime? DelayUntilDate { get; set; }
|
||||
public string FailureReason { get; set; } = string.Empty;
|
||||
}
|
||||
|
||||
@@ -29,46 +29,87 @@ public abstract class IntegrationHandlerBase<T> : IIntegrationHandler<T>
|
||||
IntegrationMessage<T> message,
|
||||
TimeProvider timeProvider)
|
||||
{
|
||||
var result = new IntegrationHandlerResult(success: response.IsSuccessStatusCode, message);
|
||||
|
||||
if (response.IsSuccessStatusCode) return result;
|
||||
|
||||
switch (response.StatusCode)
|
||||
if (response.IsSuccessStatusCode)
|
||||
{
|
||||
case HttpStatusCode.TooManyRequests:
|
||||
case HttpStatusCode.RequestTimeout:
|
||||
case HttpStatusCode.InternalServerError:
|
||||
case HttpStatusCode.BadGateway:
|
||||
case HttpStatusCode.ServiceUnavailable:
|
||||
case HttpStatusCode.GatewayTimeout:
|
||||
result.Retryable = true;
|
||||
result.FailureReason = response.ReasonPhrase ?? $"Failure with status code: {(int)response.StatusCode}";
|
||||
|
||||
if (response.Headers.TryGetValues("Retry-After", out var values))
|
||||
{
|
||||
var value = values.FirstOrDefault();
|
||||
if (int.TryParse(value, out var seconds))
|
||||
{
|
||||
// Retry-after was specified in seconds. Adjust DelayUntilDate by the requested number of seconds.
|
||||
result.DelayUntilDate = timeProvider.GetUtcNow().AddSeconds(seconds).UtcDateTime;
|
||||
}
|
||||
else if (DateTimeOffset.TryParseExact(value,
|
||||
"r", // "r" is the round-trip format: RFC1123
|
||||
CultureInfo.InvariantCulture,
|
||||
DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
|
||||
out var retryDate))
|
||||
{
|
||||
// Retry-after was specified as a date. Adjust DelayUntilDate to the specified date.
|
||||
result.DelayUntilDate = retryDate.UtcDateTime;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
result.Retryable = false;
|
||||
result.FailureReason = response.ReasonPhrase ?? $"Failure with status code {(int)response.StatusCode}";
|
||||
break;
|
||||
return IntegrationHandlerResult.Succeed(message);
|
||||
}
|
||||
|
||||
return result;
|
||||
var category = ClassifyHttpStatusCode(response.StatusCode);
|
||||
var failureReason = response.ReasonPhrase ?? $"Failure with status code {(int)response.StatusCode}";
|
||||
|
||||
if (category is not (IntegrationFailureCategory.RateLimited
|
||||
or IntegrationFailureCategory.TransientError
|
||||
or IntegrationFailureCategory.ServiceUnavailable) ||
|
||||
!response.Headers.TryGetValues("Retry-After", out var values)
|
||||
)
|
||||
{
|
||||
return IntegrationHandlerResult.Fail(message: message, category: category, failureReason: failureReason);
|
||||
}
|
||||
|
||||
// Handle Retry-After header for rate-limited and retryable errors
|
||||
DateTime? delayUntil = null;
|
||||
var value = values.FirstOrDefault();
|
||||
if (int.TryParse(value, out var seconds))
|
||||
{
|
||||
// Retry-after was specified in seconds
|
||||
delayUntil = timeProvider.GetUtcNow().AddSeconds(seconds).UtcDateTime;
|
||||
}
|
||||
else if (DateTimeOffset.TryParseExact(value,
|
||||
"r", // "r" is the round-trip format: RFC1123
|
||||
CultureInfo.InvariantCulture,
|
||||
DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
|
||||
out var retryDate))
|
||||
{
|
||||
// Retry-after was specified as a date
|
||||
delayUntil = retryDate.UtcDateTime;
|
||||
}
|
||||
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
category,
|
||||
failureReason,
|
||||
delayUntil
|
||||
);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Classifies an <see cref="HttpStatusCode"/> as an <see cref="IntegrationFailureCategory"/> to drive
|
||||
/// retry behavior and operator-facing failure reporting.
|
||||
/// </summary>
|
||||
/// <param name="statusCode">The HTTP status code.</param>
|
||||
/// <returns>The corresponding <see cref="IntegrationFailureCategory"/>.</returns>
|
||||
protected static IntegrationFailureCategory ClassifyHttpStatusCode(HttpStatusCode statusCode)
|
||||
{
|
||||
var explicitCategory = statusCode switch
|
||||
{
|
||||
HttpStatusCode.Unauthorized => IntegrationFailureCategory.AuthenticationFailed,
|
||||
HttpStatusCode.Forbidden => IntegrationFailureCategory.AuthenticationFailed,
|
||||
HttpStatusCode.NotFound => IntegrationFailureCategory.ConfigurationError,
|
||||
HttpStatusCode.Gone => IntegrationFailureCategory.ConfigurationError,
|
||||
HttpStatusCode.MovedPermanently => IntegrationFailureCategory.ConfigurationError,
|
||||
HttpStatusCode.TemporaryRedirect => IntegrationFailureCategory.ConfigurationError,
|
||||
HttpStatusCode.PermanentRedirect => IntegrationFailureCategory.ConfigurationError,
|
||||
HttpStatusCode.TooManyRequests => IntegrationFailureCategory.RateLimited,
|
||||
HttpStatusCode.RequestTimeout => IntegrationFailureCategory.TransientError,
|
||||
HttpStatusCode.InternalServerError => IntegrationFailureCategory.TransientError,
|
||||
HttpStatusCode.BadGateway => IntegrationFailureCategory.TransientError,
|
||||
HttpStatusCode.GatewayTimeout => IntegrationFailureCategory.TransientError,
|
||||
HttpStatusCode.ServiceUnavailable => IntegrationFailureCategory.ServiceUnavailable,
|
||||
HttpStatusCode.NotImplemented => IntegrationFailureCategory.PermanentFailure,
|
||||
_ => (IntegrationFailureCategory?)null
|
||||
};
|
||||
|
||||
if (explicitCategory is not null)
|
||||
{
|
||||
return explicitCategory.Value;
|
||||
}
|
||||
|
||||
return (int)statusCode switch
|
||||
{
|
||||
>= 300 and <= 399 => IntegrationFailureCategory.ConfigurationError,
|
||||
>= 400 and <= 499 => IntegrationFailureCategory.ConfigurationError,
|
||||
>= 500 and <= 599 => IntegrationFailureCategory.ServiceUnavailable,
|
||||
_ => IntegrationFailureCategory.ServiceUnavailable
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,6 +85,17 @@ public class AzureServiceBusIntegrationListenerService<TConfiguration> : Backgro
|
||||
{
|
||||
// Non-recoverable failure or exceeded the max number of retries
|
||||
// Return false to indicate this message should be dead-lettered
|
||||
_logger.LogWarning(
|
||||
"Integration failure - non-recoverable error or max retries exceeded. " +
|
||||
"MessageId: {MessageId}, IntegrationType: {IntegrationType}, OrganizationId: {OrgId}, " +
|
||||
"FailureCategory: {Category}, Reason: {Reason}, RetryCount: {RetryCount}, MaxRetries: {MaxRetries}",
|
||||
message.MessageId,
|
||||
message.IntegrationType,
|
||||
message.OrganizationId,
|
||||
result.Category,
|
||||
result.FailureReason,
|
||||
message.RetryCount,
|
||||
_maxRetries);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,14 +106,32 @@ public class RabbitMqIntegrationListenerService<TConfiguration> : BackgroundServ
|
||||
{
|
||||
// Exceeded the max number of retries; fail and send to dead letter queue
|
||||
await _rabbitMqService.PublishToDeadLetterAsync(channel, message, cancellationToken);
|
||||
_logger.LogWarning("Max retry attempts reached. Sent to DLQ.");
|
||||
_logger.LogWarning(
|
||||
"Integration failure - max retries exceeded. " +
|
||||
"MessageId: {MessageId}, IntegrationType: {IntegrationType}, OrganizationId: {OrgId}, " +
|
||||
"FailureCategory: {Category}, Reason: {Reason}, RetryCount: {RetryCount}, MaxRetries: {MaxRetries}",
|
||||
message.MessageId,
|
||||
message.IntegrationType,
|
||||
message.OrganizationId,
|
||||
result.Category,
|
||||
result.FailureReason,
|
||||
message.RetryCount,
|
||||
_maxRetries);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fatal error (i.e. not retryable) occurred. Send message to dead letter queue without any retries
|
||||
await _rabbitMqService.PublishToDeadLetterAsync(channel, message, cancellationToken);
|
||||
_logger.LogWarning("Non-retryable failure. Sent to DLQ.");
|
||||
_logger.LogWarning(
|
||||
"Integration failure - non-retryable. " +
|
||||
"MessageId: {MessageId}, IntegrationType: {IntegrationType}, OrganizationId: {OrgId}, " +
|
||||
"FailureCategory: {Category}, Reason: {Reason}",
|
||||
message.MessageId,
|
||||
message.IntegrationType,
|
||||
message.OrganizationId,
|
||||
result.Category,
|
||||
result.FailureReason);
|
||||
}
|
||||
|
||||
// Message has been sent to retry or dead letter queues.
|
||||
|
||||
@@ -6,15 +6,6 @@ public class SlackIntegrationHandler(
|
||||
ISlackService slackService)
|
||||
: IntegrationHandlerBase<SlackIntegrationConfigurationDetails>
|
||||
{
|
||||
private static readonly HashSet<string> _retryableErrors = new(StringComparer.Ordinal)
|
||||
{
|
||||
"internal_error",
|
||||
"message_limit_exceeded",
|
||||
"rate_limited",
|
||||
"ratelimited",
|
||||
"service_unavailable"
|
||||
};
|
||||
|
||||
public override async Task<IntegrationHandlerResult> HandleAsync(IntegrationMessage<SlackIntegrationConfigurationDetails> message)
|
||||
{
|
||||
var slackResponse = await slackService.SendSlackMessageByChannelIdAsync(
|
||||
@@ -25,24 +16,61 @@ public class SlackIntegrationHandler(
|
||||
|
||||
if (slackResponse is null)
|
||||
{
|
||||
return new IntegrationHandlerResult(success: false, message: message)
|
||||
{
|
||||
FailureReason = "Slack response was null"
|
||||
};
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
IntegrationFailureCategory.TransientError,
|
||||
"Slack response was null"
|
||||
);
|
||||
}
|
||||
|
||||
if (slackResponse.Ok)
|
||||
{
|
||||
return new IntegrationHandlerResult(success: true, message: message);
|
||||
return IntegrationHandlerResult.Succeed(message);
|
||||
}
|
||||
|
||||
var result = new IntegrationHandlerResult(success: false, message: message) { FailureReason = slackResponse.Error };
|
||||
var category = ClassifySlackError(slackResponse.Error);
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
category,
|
||||
slackResponse.Error
|
||||
);
|
||||
}
|
||||
|
||||
if (_retryableErrors.Contains(slackResponse.Error))
|
||||
/// <summary>
|
||||
/// Classifies a Slack API error code string as an <see cref="IntegrationFailureCategory"/> to drive
|
||||
/// retry behavior and operator-facing failure reporting.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Slack responses commonly return an <c>error</c> string when <c>ok</c> is false. This method maps
|
||||
/// known Slack error codes to failure categories.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Any unrecognized error codes default to <see cref="IntegrationFailureCategory.TransientError"/> to avoid
|
||||
/// incorrectly marking new/unknown Slack failures as non-retryable.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="error">The Slack error code string (e.g. <c>invalid_auth</c>, <c>rate_limited</c>).</param>
|
||||
/// <returns>The corresponding <see cref="IntegrationFailureCategory"/>.</returns>
|
||||
private static IntegrationFailureCategory ClassifySlackError(string error)
|
||||
{
|
||||
return error switch
|
||||
{
|
||||
result.Retryable = true;
|
||||
}
|
||||
|
||||
return result;
|
||||
"invalid_auth" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"access_denied" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"token_expired" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"token_revoked" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"account_inactive" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"not_authed" => IntegrationFailureCategory.AuthenticationFailed,
|
||||
"channel_not_found" => IntegrationFailureCategory.ConfigurationError,
|
||||
"is_archived" => IntegrationFailureCategory.ConfigurationError,
|
||||
"rate_limited" => IntegrationFailureCategory.RateLimited,
|
||||
"ratelimited" => IntegrationFailureCategory.RateLimited,
|
||||
"message_limit_exceeded" => IntegrationFailureCategory.RateLimited,
|
||||
"internal_error" => IntegrationFailureCategory.TransientError,
|
||||
"service_unavailable" => IntegrationFailureCategory.ServiceUnavailable,
|
||||
"fatal_error" => IntegrationFailureCategory.ServiceUnavailable,
|
||||
_ => IntegrationFailureCategory.TransientError
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using Bit.Core.AdminConsole.Models.Data.EventIntegrations;
|
||||
using System.Text.Json;
|
||||
using Bit.Core.AdminConsole.Models.Data.EventIntegrations;
|
||||
using Microsoft.Rest;
|
||||
|
||||
namespace Bit.Core.Services;
|
||||
@@ -18,24 +19,48 @@ public class TeamsIntegrationHandler(
|
||||
channelId: message.Configuration.ChannelId
|
||||
);
|
||||
|
||||
return new IntegrationHandlerResult(success: true, message: message);
|
||||
return IntegrationHandlerResult.Succeed(message);
|
||||
}
|
||||
catch (HttpOperationException ex)
|
||||
{
|
||||
var result = new IntegrationHandlerResult(success: false, message: message);
|
||||
var statusCode = (int)ex.Response.StatusCode;
|
||||
result.Retryable = statusCode is 429 or >= 500 and < 600;
|
||||
result.FailureReason = ex.Message;
|
||||
|
||||
return result;
|
||||
var category = ClassifyHttpStatusCode(ex.Response.StatusCode);
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
category,
|
||||
ex.Message
|
||||
);
|
||||
}
|
||||
catch (ArgumentException ex)
|
||||
{
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
IntegrationFailureCategory.ConfigurationError,
|
||||
ex.Message
|
||||
);
|
||||
}
|
||||
catch (UriFormatException ex)
|
||||
{
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
IntegrationFailureCategory.ConfigurationError,
|
||||
ex.Message
|
||||
);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
IntegrationFailureCategory.PermanentFailure,
|
||||
ex.Message
|
||||
);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
var result = new IntegrationHandlerResult(success: false, message: message);
|
||||
result.Retryable = false;
|
||||
result.FailureReason = ex.Message;
|
||||
|
||||
return result;
|
||||
return IntegrationHandlerResult.Fail(
|
||||
message,
|
||||
IntegrationFailureCategory.TransientError,
|
||||
ex.Message
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user