Add heuristics for delayed notifications.

This commit is contained in:
Clark 2023-07-28 13:11:46 -04:00 committed by Greyson Parrelli
parent c012ead143
commit 66792f2d56
8 changed files with 189 additions and 14 deletions

View file

@ -153,6 +153,16 @@ class LocalMetricsDatabase private constructor(
writableDatabase.delete(TABLE_NAME, null, null)
}
fun getOldestMetricTime(eventName: String): Long {
readableDatabase.rawQuery("SELECT $CREATED_AT FROM $TABLE_NAME WHERE $EVENT_NAME = ? ORDER BY $CREATED_AT ASC", SqlUtil.buildArgs(eventName)).use { cursor ->
return if (cursor.moveToFirst()) {
cursor.getLong(0)
} else {
0
}
}
}
fun getMetrics(): List<EventMetrics> {
val db = readableDatabase
@ -213,7 +223,7 @@ class LocalMetricsDatabase private constructor(
}
}
private fun eventPercent(eventName: String, percent: Int): Long {
fun eventPercent(eventName: String, percent: Int): Long {
return percentile(EventTotals.VIEW_NAME, "$EVENT_NAME = '$eventName'", percent)
}

View file

@ -53,7 +53,12 @@ class FcmFetchForegroundService : Service() {
private var foregroundServiceState: State = State.STOPPED
fun startServiceIfNecessary(context: Context) {
/**
* Attempts to start the foreground service if it isn't already running.
*
* @return false if we failed to start the foreground service
*/
fun startServiceIfNecessary(context: Context): Boolean {
synchronized(this) {
when (foregroundServiceState) {
State.STOPPING -> foregroundServiceState = State.RESTARTING
@ -64,11 +69,13 @@ class FcmFetchForegroundService : Service() {
} catch (e: IllegalStateException) {
Log.e(TAG, "Failed to start foreground service", e)
State.STOPPED
return false
}
}
else -> Log.i(TAG, "Already started foreground service")
}
}
return true
}
fun stopServiceIfNecessary(context: Context) {

View file

@ -51,22 +51,21 @@ object FcmFetchManager {
@Volatile
private var highPriority = false
/**
* @return True if a service was successfully started, otherwise false.
*/
@JvmStatic
fun startBackgroundService(context: Context) {
Log.i(TAG, "Starting in the background.")
context.startService(Intent(context, FcmFetchBackgroundService::class.java))
SignalLocalMetrics.FcmServiceStartSuccess.onFcmStarted()
}
/**
* @return True if a service was successfully started, otherwise false.
*/
@JvmStatic
fun startForegroundService(context: Context) {
Log.i(TAG, "Starting in the foreground.")
FcmFetchForegroundService.startServiceIfNecessary(context)
if (FcmFetchForegroundService.startServiceIfNecessary(context)) {
SignalLocalMetrics.FcmServiceStartSuccess.onFcmStarted()
} else {
SignalLocalMetrics.FcmServiceStartFailure.onFcmFailedToStart()
}
}
private fun postMayHaveMessagesNotification(context: Context) {

View file

@ -16,6 +16,7 @@ import org.thoughtcrime.securesms.jobs.SubmitRateLimitPushChallengeJob;
import org.thoughtcrime.securesms.keyvalue.SignalStore;
import org.thoughtcrime.securesms.registration.PushChallengeRequest;
import org.thoughtcrime.securesms.util.NetworkUtil;
import org.thoughtcrime.securesms.util.SignalLocalMetrics;
import java.util.Locale;
@ -85,6 +86,7 @@ public class FcmReceiveService extends FirebaseMessagingService {
}
} catch (Exception e) {
Log.w(TAG, "Failed to start service.", e);
SignalLocalMetrics.FcmServiceStartFailure.onFcmFailedToStart();
}
FcmFetchManager.enqueueFetch(context, highPriority);

View file

@ -298,6 +298,8 @@ class IncomingMessageObserver(private val context: Application) {
val localReceiveMetric = SignalLocalMetrics.MessageReceive.start()
val result = MessageDecryptor.decrypt(context, bufferedProtocolStore, envelope, serverDeliveredTimestamp)
localReceiveMetric.onEnvelopeDecrypted()
SignalLocalMetrics.MessageLatency.onMessageReceived(envelope.serverTimestamp, serverDeliveredTimestamp)
when (result) {
is MessageDecryptor.Result.Success -> {
val job = PushProcessMessageJobV2.processOrDefer(messageContentProcessor, result, localReceiveMetric)

View file

@ -0,0 +1,112 @@
/*
* Copyright 2023 Signal Messenger, LLC
* SPDX-License-Identifier: AGPL-3.0-only
*/
package org.thoughtcrime.securesms.notifications
import org.signal.core.util.logging.Log
import org.thoughtcrime.securesms.database.LocalMetricsDatabase
import org.thoughtcrime.securesms.dependencies.ApplicationDependencies
import org.thoughtcrime.securesms.util.SignalLocalMetrics
/**
* Heuristic for estimating if a user has been experiencing issues with delayed notifications.
*
* This uses local metrics based off of message latency, failed service starts, and queue drain
* timeouts.
*
* This will enable us to attempt to improve notifications for users who are experiencing these issues.
*/
object SlowNotificationHeuristics {
private val TAG = Log.tag(SlowNotificationHeuristics::class.java)
fun isHavingDelayedNotifications(configuration: Configuration): Boolean {
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
val metrics = db.getMetrics()
val failedServiceStarts = hasRepeatedFailedServiceStarts(metrics, configuration.minimumEventAgeMs, configuration.minimumServiceEventCount, configuration.serviceStartFailurePercentage)
val failedQueueDrains = isFailingToDrainQueue(metrics, configuration.minimumEventAgeMs, configuration.weeklyFailedQueueDrains)
val longMessageLatency = hasLongMessageLatency(metrics, configuration.minimumEventAgeMs, configuration.messageLatencyPercentage, configuration.minimumMessageLatencyEvents, configuration.messageLatencyThreshold)
if (failedServiceStarts || failedQueueDrains || longMessageLatency) {
Log.w(TAG, "User seems to be having delayed notifications: failed-service-starts=$failedServiceStarts failedQueueDrains=$failedQueueDrains longMessageLatency=$longMessageLatency")
return true
}
return false
}
private fun hasRepeatedFailedServiceStarts(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, minimumEventCount: Int, failurePercentage: Float): Boolean {
if (!haveEnoughData(SignalLocalMetrics.FcmServiceStartSuccess.NAME, minimumEventAgeMs) && !haveEnoughData(SignalLocalMetrics.FcmServiceStartFailure.NAME, minimumEventAgeMs)) {
Log.d(TAG, "insufficient data for service starts")
return false
}
val successes = metrics.filter { it.name == SignalLocalMetrics.FcmServiceStartSuccess.NAME }
val failures = metrics.filter { it.name == SignalLocalMetrics.FcmServiceStartFailure.NAME }
if ((successes.size + failures.size) < minimumEventCount) {
Log.d(TAG, "insufficient service start events")
return false
}
if (failures.size / (failures.size + successes.size) >= failurePercentage) {
Log.w(TAG, "User often unable start FCM service. ${failures.size} failed : ${successes.size} successful")
return true
}
return false
}
private fun isFailingToDrainQueue(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, failureThreshold: Int): Boolean {
if (!haveEnoughData(SignalLocalMetrics.PushWebsocketFetch.SUCCESS_EVENT, minimumEventAgeMs) && !haveEnoughData(SignalLocalMetrics.PushWebsocketFetch.TIMEOUT_EVENT, minimumEventAgeMs)) {
Log.d(TAG, "insufficient data for failed queue drains")
return false
}
val failures = metrics.filter { it.name == SignalLocalMetrics.PushWebsocketFetch.TIMEOUT_EVENT }
if (failures.size < failureThreshold) {
return false
}
Log.w(TAG, "User has repeatedly failed to drain queue ${failures.size} events")
return true
}
private fun hasLongMessageLatency(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, percentage: Int, messageThreshold: Int, durationThreshold: Long): Boolean {
if (!haveEnoughData(SignalLocalMetrics.MessageLatency.NAME, minimumEventAgeMs)) {
Log.d(TAG, "insufficient data for message latency")
return false
}
val eventCount = metrics.count { it.name == SignalLocalMetrics.MessageLatency.NAME }
if (eventCount < messageThreshold) {
Log.d(TAG, "not enough messages for message latency")
return false
}
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
val averageLatency = db.eventPercent(SignalLocalMetrics.MessageLatency.NAME, percentage.coerceAtMost(100).coerceAtLeast(0))
val longMessageLatency = averageLatency > durationThreshold
if (longMessageLatency) {
Log.w(TAG, "User has high average message latency of $averageLatency ms over $eventCount events")
}
return longMessageLatency
}
private fun haveEnoughData(eventName: String, minimumEventAgeMs: Long): Boolean {
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
val oldestEvent = db.getOldestMetricTime(eventName)
return !(oldestEvent == 0L || oldestEvent > System.currentTimeMillis() - minimumEventAgeMs)
}
}
data class Configuration(
val minimumEventAgeMs: Long,
val minimumServiceEventCount: Int,
val serviceStartFailurePercentage: Float,
val weeklyFailedQueueDrains: Int,
val minimumMessageLatencyEvents: Int,
val messageLatencyThreshold: Long,
val messageLatencyPercentage: Int
)

View file

@ -87,14 +87,15 @@ object LocalMetrics {
*
* If an event with the provided ID does not exist, this is effectively a no-op.
*/
fun splitWithDuration(id: String, split: String, duration: Long) {
val time = System.currentTimeMillis()
@JvmOverloads
fun splitWithDuration(id: String, split: String, duration: Long, timeunit: TimeUnit = TimeUnit.MILLISECONDS) {
val time = SystemClock.elapsedRealtimeNanos()
executor.execute {
val lastTime: Long? = lastSplitTimeById[id]
val splitDoesNotExist: Boolean = eventsById[id]?.splits?.none { it.name == split } ?: true
if (lastTime != null && splitDoesNotExist) {
eventsById[id]?.splits?.add(LocalMetricsSplit(split, duration))
eventsById[id]?.splits?.add(LocalMetricsSplit(split, TimeUnit.NANOSECONDS.convert(duration, timeunit)))
lastSplitTimeById[id] = time
}
}

View file

@ -203,9 +203,51 @@ public final class SignalLocalMetrics {
}
}
public static final class MessageLatency {
public static final String NAME = "message-latency";
private static final String SPLIT_LATENCY = "latency";
public static void onMessageReceived(long serverReceiveTimestamp, long serverDeliverTimestamp) {
long latency = serverDeliverTimestamp - serverReceiveTimestamp;
String id = NAME + System.currentTimeMillis();
LocalMetrics.getInstance().start(id, NAME);
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_LATENCY, latency);
LocalMetrics.getInstance().end(id);
}
}
public static final class FcmServiceStartFailure {
public static final String NAME = "fcm-service-start-failure";
private static final String SPLIT_EVENT = "event";
public static void onFcmFailedToStart() {
String id = NAME + System.currentTimeMillis();
LocalMetrics.getInstance().start(id, NAME);
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_EVENT, 1);
LocalMetrics.getInstance().end(id);
}
}
public static final class FcmServiceStartSuccess {
public static final String NAME = "fcm-service-start-success";
private static final String SPLIT_EVENT = "event";
public static void onFcmStarted() {
String id = NAME + System.currentTimeMillis();
LocalMetrics.getInstance().start(id, NAME);
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_EVENT, 1);
LocalMetrics.getInstance().end(id);
}
}
public static final class PushWebsocketFetch {
private static final String SUCCESS_EVENT = "push-websocket-fetch";
private static final String TIMEOUT_EVENT = "timed-out-fetch";
public static final String SUCCESS_EVENT = "push-websocket-fetch";
public static final String TIMEOUT_EVENT = "timed-out-fetch";
private static final String SPLIT_BATCH_PROCESSED = "batches-processed";
private static final String SPLIT_PROCESS_TIME = "fetch-time";