Skip to content

Commit

Permalink
[PLAT-16464][xCluster] Improve logging for the table bad state alert
Browse files Browse the repository at this point in the history
Summary: This diff adds logging to make it easier to debug alerts about tables that are in a bad replication status.

Test Plan: Created some tables in bad statuses in an xCluster config and made sure the proper message is logged.

Reviewers: #yba-api-review!, cwang, vbansal, nbhatia

Reviewed By: cwang

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D41304
  • Loading branch information
shahrooz1997 committed Jan 22, 2025
1 parent 79609c5 commit eae11f8
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 13 deletions.
1 change: 1 addition & 0 deletions managed/RUNTIME-FLAGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
| "Whether YBA supports transactional xCluster configs" | "yb.xcluster.transactional.enabled" | "GLOBAL" | "It indicates whether YBA should support transactional xCluster configs" | "Boolean" |
| "Enable disaster recovery" | "yb.xcluster.dr.enabled" | "GLOBAL" | "It indicates whether creating disaster recovery configs are enabled" | "Boolean" |
| "Enable xcluster/DR auto flag validation" | "yb.xcluster.enable_auto_flag_validation" | "GLOBAL" | "Enables checks for xcluster/disaster recovery validations for autoflags for xcluster/DR operations" | "Boolean" |
| "Whether to log information about gathering table statuses in xCluster" | "yb.xcluster.table_status_logging_enabled" | "GLOBAL" | "Whether to log information about gathering bad table statuses in xCluster; the logs can be huge and this gives you a leverage to disable it" | "Boolean" |
| "Enable YBC for xCluster" | "yb.xcluster.use_ybc" | "GLOBAL" | "Enable YBC to take backup and restore during xCluster bootstrap" | "Boolean" |
| "Whether installation of YugabyteDB version higher than YBA version is allowed" | "yb.allow_db_version_more_than_yba_version" | "GLOBAL" | "It indicates whether the installation of YugabyteDB with a version higher than YBA version is allowed on universe nodes" | "Boolean" |
| "Skip DB / YBA version comparison checks" | "yb.skip_version_checks" | "GLOBAL" | "Whether we should skip DB / YBA version comparison checks during upgrades, etc. Gives more flexibility, but user should be careful when enabling this." | "Boolean" |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,12 @@ private List<Metric> collectMetrics(XClusterConfig xClusterConfig) {
}

XClusterConfigTaskBase.updateReplicationDetailsFromDB(
xClusterUniverseService, ybClientService, tableHandler, xClusterConfig, dbSyncTimeoutMs);
xClusterUniverseService,
ybClientService,
tableHandler,
xClusterConfig,
dbSyncTimeoutMs,
this.confGetter);
Set<XClusterTableConfig> xClusterTableConfigs = xClusterConfig.getTableDetails();
xClusterTableConfigs.forEach(
tableConfig -> metricsList.add(buildMetricTemplate(xClusterConfig, tableConfig)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import com.yugabyte.yw.common.XClusterUniverseService;
import com.yugabyte.yw.common.XClusterUtil;
import com.yugabyte.yw.common.backuprestore.BackupHelper;
import com.yugabyte.yw.common.config.GlobalConfKeys;
import com.yugabyte.yw.common.config.RuntimeConfGetter;
import com.yugabyte.yw.common.config.UniverseConfKeys;
import com.yugabyte.yw.common.customer.config.CustomerConfigService;
import com.yugabyte.yw.common.gflags.GFlagsUtil;
Expand Down Expand Up @@ -90,6 +92,7 @@
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import lombok.Data;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -2178,7 +2181,8 @@ public static void updateReplicationDetailsFromDB(
YBClientService ybClientService,
UniverseTableHandler tableHandler,
XClusterConfig xClusterConfig,
long timeoutMs) {
long timeoutMs,
RuntimeConfGetter confGetter) {
Optional<Universe> targetUniverseOptional =
Objects.isNull(xClusterConfig.getTargetUniverseUUID())
? Optional.empty()
Expand All @@ -2194,7 +2198,7 @@ public static void updateReplicationDetailsFromDB(
ReplicationClusterData replicationClusterData;
try {
replicationClusterData =
collectReplicationClusterData(ybClientService, xClusterConfig, timeoutMs);
collectReplicationClusterData(ybClientService, xClusterConfig, timeoutMs, confGetter);
XClusterConfig.TableType tableType = xClusterConfig.getTableType();
replicationClusterData.sourceTableInfoList =
tableType.equals(XClusterConfig.TableType.YSQL)
Expand Down Expand Up @@ -2284,6 +2288,32 @@ public static void updateReplicationDetailsFromDB(
xClusterConfig.getUuid(),
e);
}

xClusterConfig
.getTableDetails()
.forEach(
tableConfig -> {
if (tableConfig.getStatus().getCode() != 4) {
if (tableConfig.getStatus().getCode() > 0) {
log.warn(
"In xCluster config {}, table {} is not in Running status",
xClusterConfig,
tableConfig);
} else {
log.error(
"In xCluster config {}, table {} is in bad status",
xClusterConfig,
tableConfig);
}
}
});

// Optional dump of the xCluster config together with all of its table details. It is emitted
// only when the global runtime flag behind GlobalConfKeys.xClusterTableStatusLoggingEnabled is
// turned on, so the output can be switched off when it becomes too large.
if (confGetter.getGlobalConf(GlobalConfKeys.xClusterTableStatusLoggingEnabled)) {
log.info(
"After adding source and target table info to the xCluster config {} : {}",
xClusterConfig,
xClusterConfig.getTableDetails());
}
}

/**
Expand Down Expand Up @@ -2453,7 +2483,10 @@ private static void addSourceAndTargetDbInfo(
* @return The ReplicationClusterData containing the collected data.
*/
private static ReplicationClusterData collectReplicationClusterData(
YBClientService ybClientService, XClusterConfig xClusterConfig, long timeoutMs) {
YBClientService ybClientService,
XClusterConfig xClusterConfig,
long timeoutMs,
RuntimeConfGetter confGetter) {
Universe sourceUniverse = Universe.getOrBadRequest(xClusterConfig.getSourceUniverseUUID());
Universe targetUniverse = Universe.getOrBadRequest(xClusterConfig.getTargetUniverseUUID());

Expand Down Expand Up @@ -2544,6 +2577,12 @@ private static ReplicationClusterData collectReplicationClusterData(
}

MoreExecutors.shutdownAndAwaitTermination(executorService, timeoutMs, TimeUnit.MILLISECONDS);
// Optional dump of the collected replication data (rendered through its toString()), keyed by
// the xCluster config UUID. It is emitted only when the global runtime flag behind
// GlobalConfKeys.xClusterTableStatusLoggingEnabled is turned on.
if (confGetter.getGlobalConf(GlobalConfKeys.xClusterTableStatusLoggingEnabled)) {
log.info(
"Replication cluster data collected for xCluster config {}: {}",
xClusterConfig.getUuid(),
data);
}
return data;
}

Expand All @@ -2570,7 +2609,7 @@ private static void addTransientTableConfigs(
throws Exception {
Set<String> sourceUniverseTableIds =
sourceTableInfoList.stream()
.map(tableInfo -> XClusterConfigTaskBase.getTableId(tableInfo))
.map(XClusterConfigTaskBase::getTableId)
.collect(Collectors.toSet());

// Update the status for tables that were previously being replicated but have been dropped from
Expand Down Expand Up @@ -3253,14 +3292,60 @@ public static GetUniverseReplicationInfoResponse getUniverseReplicationInfo(

/** Represents the data required for replication between clusters. */
@Data
@ToString
public static class ReplicationClusterData {

private List<MasterDdlOuterClass.ListTablesResponsePB.TableInfo> sourceTableInfoList;
private List<MasterDdlOuterClass.ListTablesResponsePB.TableInfo> targetTableInfoList;
private CatalogEntityInfo.SysClusterConfigEntryPB clusterConfig;
private Set<MasterTypes.NamespaceIdentifierPB> sourceNamespaceInfoList;
private Set<MasterTypes.NamespaceIdentifierPB> targetNamespaceInfoList;
private GetUniverseReplicationInfoResponse targetUniverseReplicationInfo;
@ToString.Exclude private GetUniverseReplicationInfoResponse targetUniverseReplicationInfo;

@ToString.Exclude
private GetXClusterOutboundReplicationGroupInfoResponse sourceUniverseReplicationInfo;

/**
 * Builds the text that the Lombok-generated {@code toString()} prints under the name {@code
 * targetUniverseReplicationInfo}.
 *
 * <p>Only the replication type, the table infos and the db-scoped infos of the response are
 * rendered; a missing component is printed as {@code null}.
 *
 * @return {@code null} when no target replication info is set, otherwise a string of the form
 *     {@code (replicationType=..., tableInfos=..., dbScopedInfos=...)}
 */
@ToString.Include(name = "targetUniverseReplicationInfo")
private String getTargetUniverseReplicationInfoString() {
  final GetUniverseReplicationInfoResponse info = targetUniverseReplicationInfo;
  if (info == null) {
    return null;
  }
  // String concatenation converts a null reference to the text "null", so no per-field null
  // branches are needed.
  return "(replicationType="
      + info.getReplicationType()
      + ", tableInfos="
      + info.getTableInfos()
      + ", dbScopedInfos="
      + info.getDbScopedInfos()
      + ")";
}

/**
 * Builds the text that the Lombok-generated {@code toString()} prints under the name {@code
 * sourceUniverseReplicationInfo}.
 *
 * <p>Only the namespace infos of the response are rendered; a missing value is printed as
 * {@code null}.
 *
 * @return {@code null} when no source replication info is set, otherwise a string of the form
 *     {@code (namespaceInfos=...)}
 */
@ToString.Include(name = "sourceUniverseReplicationInfo")
private String getSourceUniverseReplicationInfoString() {
  final GetXClusterOutboundReplicationGroupInfoResponse info = sourceUniverseReplicationInfo;
  if (info == null) {
    return null;
  }
  // String concatenation converts a null reference to the text "null".
  return "(namespaceInfos=" + info.getNamespaceInfos() + ")";
}
}

// --------------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,15 @@ public class GlobalConfKeys extends RuntimeConfigKeysModule {
+ " operations",
ConfDataType.BooleanType,
ImmutableList.of(ConfKeyTags.PUBLIC));
/**
 * Public, global-scope boolean runtime key {@code yb.xcluster.table_status_logging_enabled}.
 *
 * <p>Per its description, it controls whether information gathered about xCluster table
 * statuses is logged, so that potentially huge log output can be switched off.
 */
public static final ConfKeyInfo<Boolean> xClusterTableStatusLoggingEnabled =
new ConfKeyInfo<>(
"yb.xcluster.table_status_logging_enabled",
ScopeType.GLOBAL,
"Whether to log information about gathering table statuses in xCluster",
"Whether to log information about gathering bad table statuses in xCluster; the logs can"
+ " be huge and this gives you a leverage to disable it",
ConfDataType.BooleanType,
ImmutableList.of(ConfKeyTags.PUBLIC));
public static final ConfKeyInfo<Boolean> enableYbcForXCluster =
new ConfKeyInfo<>(
"yb.xcluster.use_ybc",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,8 @@ public Result get(UUID customerUUID, UUID drUUID, boolean syncWithDB) {
ybService,
tableHandler,
xClusterConfig,
confGetter.getGlobalConf(GlobalConfKeys.xclusterGetApiTimeoutMs));
confGetter.getGlobalConf(GlobalConfKeys.xclusterGetApiTimeoutMs),
this.confGetter);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,8 @@ public Result get(UUID customerUUID, UUID xclusterConfigUUID, boolean syncWithDB
ybService,
tableHandler,
xClusterConfig,
confGetter.getGlobalConf(GlobalConfKeys.xclusterGetApiTimeoutMs));
confGetter.getGlobalConf(GlobalConfKeys.xclusterGetApiTimeoutMs),
this.confGetter);
}

// Wrap XClusterConfig with lag metric data.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ public class XClusterTableConfig extends Model {
@ApiModelProperty(
value = "Stream ID if replication is setup; bootstrap ID if the table is bootstrapped",
example = "a9d2470786694dc4b34e0e58e592da9e")
@ToString.Include
private String streamId;

@ApiModelProperty(value = "YbaApi Internal. Whether replication is set up for this table")
@YbaApi(visibility = YbaApiVisibility.INTERNAL, sinceYBAVersion = "2.16.0.0")
@ToString.Include
private boolean replicationSetupDone;

@ApiModelProperty(
Expand Down Expand Up @@ -114,6 +116,7 @@ public class XClusterTableConfig extends Model {
value = "Status",
allowableValues =
"Validated, Running, Updating, Warning, Error, Bootstrapping, Failed, UnableToFetch")
@ToString.Include
private Status status;

@Transient
Expand Down Expand Up @@ -143,7 +146,7 @@ public enum Status {
ReplicationError("ReplicationError", -9); // Not stored in YBA DB.

private final String status;
private final int code;
@Getter private final int code;

Status(String status, int code) {
this.status = status;
Expand All @@ -155,10 +158,6 @@ public enum Status {
public String toString() {
return this.status;
}

public int getCode() {
return this.code;
}
}

// TODO move API response attributes out of the DB model
Expand Down
1 change: 1 addition & 0 deletions managed/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ yb {
wait_for_replication_drain_timeout = 2h
allow_multiple_configs = true
}
table_status_logging_enabled = false
dr {
enabled = true
}
Expand Down

0 comments on commit eae11f8

Please sign in to comment.