Add heuristics for delayed notifications.
This commit is contained in:
parent
c012ead143
commit
66792f2d56
8 changed files with 189 additions and 14 deletions
|
@ -153,6 +153,16 @@ class LocalMetricsDatabase private constructor(
|
|||
writableDatabase.delete(TABLE_NAME, null, null)
|
||||
}
|
||||
|
||||
fun getOldestMetricTime(eventName: String): Long {
|
||||
readableDatabase.rawQuery("SELECT $CREATED_AT FROM $TABLE_NAME WHERE $EVENT_NAME = ? ORDER BY $CREATED_AT ASC", SqlUtil.buildArgs(eventName)).use { cursor ->
|
||||
return if (cursor.moveToFirst()) {
|
||||
cursor.getLong(0)
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun getMetrics(): List<EventMetrics> {
|
||||
val db = readableDatabase
|
||||
|
||||
|
@ -213,7 +223,7 @@ class LocalMetricsDatabase private constructor(
|
|||
}
|
||||
}
|
||||
|
||||
private fun eventPercent(eventName: String, percent: Int): Long {
|
||||
fun eventPercent(eventName: String, percent: Int): Long {
|
||||
return percentile(EventTotals.VIEW_NAME, "$EVENT_NAME = '$eventName'", percent)
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,12 @@ class FcmFetchForegroundService : Service() {
|
|||
|
||||
private var foregroundServiceState: State = State.STOPPED
|
||||
|
||||
fun startServiceIfNecessary(context: Context) {
|
||||
/**
|
||||
* Attempts to start the foreground service if it isn't already running.
|
||||
*
|
||||
* @return false if we failed to start the foreground service
|
||||
*/
|
||||
fun startServiceIfNecessary(context: Context): Boolean {
|
||||
synchronized(this) {
|
||||
when (foregroundServiceState) {
|
||||
State.STOPPING -> foregroundServiceState = State.RESTARTING
|
||||
|
@ -64,11 +69,13 @@ class FcmFetchForegroundService : Service() {
|
|||
} catch (e: IllegalStateException) {
|
||||
Log.e(TAG, "Failed to start foreground service", e)
|
||||
State.STOPPED
|
||||
return false
|
||||
}
|
||||
}
|
||||
else -> Log.i(TAG, "Already started foreground service")
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
fun stopServiceIfNecessary(context: Context) {
|
||||
|
|
|
@ -51,22 +51,21 @@ object FcmFetchManager {
|
|||
@Volatile
|
||||
private var highPriority = false
|
||||
|
||||
/**
|
||||
* @return True if a service was successfully started, otherwise false.
|
||||
*/
|
||||
@JvmStatic
|
||||
fun startBackgroundService(context: Context) {
|
||||
Log.i(TAG, "Starting in the background.")
|
||||
context.startService(Intent(context, FcmFetchBackgroundService::class.java))
|
||||
SignalLocalMetrics.FcmServiceStartSuccess.onFcmStarted()
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True if a service was successfully started, otherwise false.
|
||||
*/
|
||||
@JvmStatic
|
||||
fun startForegroundService(context: Context) {
|
||||
Log.i(TAG, "Starting in the foreground.")
|
||||
FcmFetchForegroundService.startServiceIfNecessary(context)
|
||||
if (FcmFetchForegroundService.startServiceIfNecessary(context)) {
|
||||
SignalLocalMetrics.FcmServiceStartSuccess.onFcmStarted()
|
||||
} else {
|
||||
SignalLocalMetrics.FcmServiceStartFailure.onFcmFailedToStart()
|
||||
}
|
||||
}
|
||||
|
||||
private fun postMayHaveMessagesNotification(context: Context) {
|
||||
|
|
|
@ -16,6 +16,7 @@ import org.thoughtcrime.securesms.jobs.SubmitRateLimitPushChallengeJob;
|
|||
import org.thoughtcrime.securesms.keyvalue.SignalStore;
|
||||
import org.thoughtcrime.securesms.registration.PushChallengeRequest;
|
||||
import org.thoughtcrime.securesms.util.NetworkUtil;
|
||||
import org.thoughtcrime.securesms.util.SignalLocalMetrics;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
|
@ -85,6 +86,7 @@ public class FcmReceiveService extends FirebaseMessagingService {
|
|||
}
|
||||
} catch (Exception e) {
|
||||
Log.w(TAG, "Failed to start service.", e);
|
||||
SignalLocalMetrics.FcmServiceStartFailure.onFcmFailedToStart();
|
||||
}
|
||||
|
||||
FcmFetchManager.enqueueFetch(context, highPriority);
|
||||
|
|
|
@ -298,6 +298,8 @@ class IncomingMessageObserver(private val context: Application) {
|
|||
val localReceiveMetric = SignalLocalMetrics.MessageReceive.start()
|
||||
val result = MessageDecryptor.decrypt(context, bufferedProtocolStore, envelope, serverDeliveredTimestamp)
|
||||
localReceiveMetric.onEnvelopeDecrypted()
|
||||
|
||||
SignalLocalMetrics.MessageLatency.onMessageReceived(envelope.serverTimestamp, serverDeliveredTimestamp)
|
||||
when (result) {
|
||||
is MessageDecryptor.Result.Success -> {
|
||||
val job = PushProcessMessageJobV2.processOrDefer(messageContentProcessor, result, localReceiveMetric)
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* Copyright 2023 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.thoughtcrime.securesms.notifications
|
||||
|
||||
import org.signal.core.util.logging.Log
|
||||
import org.thoughtcrime.securesms.database.LocalMetricsDatabase
|
||||
import org.thoughtcrime.securesms.dependencies.ApplicationDependencies
|
||||
import org.thoughtcrime.securesms.util.SignalLocalMetrics
|
||||
|
||||
/**
|
||||
* Heuristic for estimating if a user has been experiencing issues with delayed notifications.
|
||||
*
|
||||
* This uses local metrics based off of message latency, failed service starts, and queue drain
|
||||
* timeouts.
|
||||
*
|
||||
* This will enable us to attempt to improve notifications for users who are experiencing these issues.
|
||||
*/
|
||||
object SlowNotificationHeuristics {
|
||||
|
||||
private val TAG = Log.tag(SlowNotificationHeuristics::class.java)
|
||||
|
||||
fun isHavingDelayedNotifications(configuration: Configuration): Boolean {
|
||||
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
|
||||
|
||||
val metrics = db.getMetrics()
|
||||
|
||||
val failedServiceStarts = hasRepeatedFailedServiceStarts(metrics, configuration.minimumEventAgeMs, configuration.minimumServiceEventCount, configuration.serviceStartFailurePercentage)
|
||||
val failedQueueDrains = isFailingToDrainQueue(metrics, configuration.minimumEventAgeMs, configuration.weeklyFailedQueueDrains)
|
||||
val longMessageLatency = hasLongMessageLatency(metrics, configuration.minimumEventAgeMs, configuration.messageLatencyPercentage, configuration.minimumMessageLatencyEvents, configuration.messageLatencyThreshold)
|
||||
|
||||
if (failedServiceStarts || failedQueueDrains || longMessageLatency) {
|
||||
Log.w(TAG, "User seems to be having delayed notifications: failed-service-starts=$failedServiceStarts failedQueueDrains=$failedQueueDrains longMessageLatency=$longMessageLatency")
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
private fun hasRepeatedFailedServiceStarts(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, minimumEventCount: Int, failurePercentage: Float): Boolean {
|
||||
if (!haveEnoughData(SignalLocalMetrics.FcmServiceStartSuccess.NAME, minimumEventAgeMs) && !haveEnoughData(SignalLocalMetrics.FcmServiceStartFailure.NAME, minimumEventAgeMs)) {
|
||||
Log.d(TAG, "insufficient data for service starts")
|
||||
return false
|
||||
}
|
||||
|
||||
val successes = metrics.filter { it.name == SignalLocalMetrics.FcmServiceStartSuccess.NAME }
|
||||
val failures = metrics.filter { it.name == SignalLocalMetrics.FcmServiceStartFailure.NAME }
|
||||
|
||||
if ((successes.size + failures.size) < minimumEventCount) {
|
||||
Log.d(TAG, "insufficient service start events")
|
||||
return false
|
||||
}
|
||||
|
||||
if (failures.size / (failures.size + successes.size) >= failurePercentage) {
|
||||
Log.w(TAG, "User often unable start FCM service. ${failures.size} failed : ${successes.size} successful")
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
private fun isFailingToDrainQueue(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, failureThreshold: Int): Boolean {
|
||||
if (!haveEnoughData(SignalLocalMetrics.PushWebsocketFetch.SUCCESS_EVENT, minimumEventAgeMs) && !haveEnoughData(SignalLocalMetrics.PushWebsocketFetch.TIMEOUT_EVENT, minimumEventAgeMs)) {
|
||||
Log.d(TAG, "insufficient data for failed queue drains")
|
||||
return false
|
||||
}
|
||||
val failures = metrics.filter { it.name == SignalLocalMetrics.PushWebsocketFetch.TIMEOUT_EVENT }
|
||||
if (failures.size < failureThreshold) {
|
||||
return false
|
||||
}
|
||||
Log.w(TAG, "User has repeatedly failed to drain queue ${failures.size} events")
|
||||
return true
|
||||
}
|
||||
|
||||
private fun hasLongMessageLatency(metrics: List<LocalMetricsDatabase.EventMetrics>, minimumEventAgeMs: Long, percentage: Int, messageThreshold: Int, durationThreshold: Long): Boolean {
|
||||
if (!haveEnoughData(SignalLocalMetrics.MessageLatency.NAME, minimumEventAgeMs)) {
|
||||
Log.d(TAG, "insufficient data for message latency")
|
||||
return false
|
||||
}
|
||||
val eventCount = metrics.count { it.name == SignalLocalMetrics.MessageLatency.NAME }
|
||||
if (eventCount < messageThreshold) {
|
||||
Log.d(TAG, "not enough messages for message latency")
|
||||
return false
|
||||
}
|
||||
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
|
||||
val averageLatency = db.eventPercent(SignalLocalMetrics.MessageLatency.NAME, percentage.coerceAtMost(100).coerceAtLeast(0))
|
||||
|
||||
val longMessageLatency = averageLatency > durationThreshold
|
||||
if (longMessageLatency) {
|
||||
Log.w(TAG, "User has high average message latency of $averageLatency ms over $eventCount events")
|
||||
}
|
||||
return longMessageLatency
|
||||
}
|
||||
|
||||
private fun haveEnoughData(eventName: String, minimumEventAgeMs: Long): Boolean {
|
||||
val db = LocalMetricsDatabase.getInstance(ApplicationDependencies.getApplication())
|
||||
|
||||
val oldestEvent = db.getOldestMetricTime(eventName)
|
||||
|
||||
return !(oldestEvent == 0L || oldestEvent > System.currentTimeMillis() - minimumEventAgeMs)
|
||||
}
|
||||
}
|
||||
|
||||
data class Configuration(
|
||||
val minimumEventAgeMs: Long,
|
||||
val minimumServiceEventCount: Int,
|
||||
val serviceStartFailurePercentage: Float,
|
||||
val weeklyFailedQueueDrains: Int,
|
||||
val minimumMessageLatencyEvents: Int,
|
||||
val messageLatencyThreshold: Long,
|
||||
val messageLatencyPercentage: Int
|
||||
)
|
|
@ -87,14 +87,15 @@ object LocalMetrics {
|
|||
*
|
||||
* If an event with the provided ID does not exist, this is effectively a no-op.
|
||||
*/
|
||||
fun splitWithDuration(id: String, split: String, duration: Long) {
|
||||
val time = System.currentTimeMillis()
|
||||
@JvmOverloads
|
||||
fun splitWithDuration(id: String, split: String, duration: Long, timeunit: TimeUnit = TimeUnit.MILLISECONDS) {
|
||||
val time = SystemClock.elapsedRealtimeNanos()
|
||||
|
||||
executor.execute {
|
||||
val lastTime: Long? = lastSplitTimeById[id]
|
||||
val splitDoesNotExist: Boolean = eventsById[id]?.splits?.none { it.name == split } ?: true
|
||||
if (lastTime != null && splitDoesNotExist) {
|
||||
eventsById[id]?.splits?.add(LocalMetricsSplit(split, duration))
|
||||
eventsById[id]?.splits?.add(LocalMetricsSplit(split, TimeUnit.NANOSECONDS.convert(duration, timeunit)))
|
||||
lastSplitTimeById[id] = time
|
||||
}
|
||||
}
|
||||
|
|
|
@ -203,9 +203,51 @@ public final class SignalLocalMetrics {
|
|||
}
|
||||
}
|
||||
|
||||
public static final class MessageLatency {
|
||||
public static final String NAME = "message-latency";
|
||||
|
||||
private static final String SPLIT_LATENCY = "latency";
|
||||
|
||||
public static void onMessageReceived(long serverReceiveTimestamp, long serverDeliverTimestamp) {
|
||||
long latency = serverDeliverTimestamp - serverReceiveTimestamp;
|
||||
|
||||
String id = NAME + System.currentTimeMillis();
|
||||
LocalMetrics.getInstance().start(id, NAME);
|
||||
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_LATENCY, latency);
|
||||
LocalMetrics.getInstance().end(id);
|
||||
}
|
||||
}
|
||||
|
||||
public static final class FcmServiceStartFailure {
|
||||
public static final String NAME = "fcm-service-start-failure";
|
||||
|
||||
private static final String SPLIT_EVENT = "event";
|
||||
|
||||
public static void onFcmFailedToStart() {
|
||||
String id = NAME + System.currentTimeMillis();
|
||||
LocalMetrics.getInstance().start(id, NAME);
|
||||
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_EVENT, 1);
|
||||
LocalMetrics.getInstance().end(id);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static final class FcmServiceStartSuccess {
|
||||
public static final String NAME = "fcm-service-start-success";
|
||||
|
||||
private static final String SPLIT_EVENT = "event";
|
||||
|
||||
public static void onFcmStarted() {
|
||||
String id = NAME + System.currentTimeMillis();
|
||||
LocalMetrics.getInstance().start(id, NAME);
|
||||
LocalMetrics.getInstance().splitWithDuration(id, SPLIT_EVENT, 1);
|
||||
LocalMetrics.getInstance().end(id);
|
||||
}
|
||||
|
||||
}
|
||||
public static final class PushWebsocketFetch {
|
||||
private static final String SUCCESS_EVENT = "push-websocket-fetch";
|
||||
private static final String TIMEOUT_EVENT = "timed-out-fetch";
|
||||
public static final String SUCCESS_EVENT = "push-websocket-fetch";
|
||||
public static final String TIMEOUT_EVENT = "timed-out-fetch";
|
||||
|
||||
private static final String SPLIT_BATCH_PROCESSED = "batches-processed";
|
||||
private static final String SPLIT_PROCESS_TIME = "fetch-time";
|
||||
|
|
Loading…
Add table
Reference in a new issue