浏览代码

Ticket 467 - CLEANALLRUV abort task should be able to ignore down replicas

Bug Description:  The abort task previously would wait for all the replicas to be online,
                  and receive the extended op task.  This made it impossible to abort the
                  cleanallruv task, if you wanted to abort because a server was down.

Fix Description:  Changed the default behavior to ignore down replicas, and added a new attribute
                  to the abort task entry:  replica-certify-all: (yes,no).  Default is "no". If
                  set to "yes", then it will wait until all the replicas have received the abort task.

                  Also fixed a crash caused by running the abort task agsint a server that does not have
                  replication setup.  The crashed was caused by freeing uninitialized pointers.

https://fedorahosted.org/389/ticket/467

Reviewed by: richm(Thanks!)
Mark Reynolds 13 年之前
父节点
当前提交
ce408229bc

+ 1 - 0
ldap/servers/plugins/replication/repl5.h

@@ -642,6 +642,7 @@ typedef struct _cleanruv_data
 	CSN *maxcsn;
 	CSN *maxcsn;
 	char *repl_root;
 	char *repl_root;
 	Slapi_DN *sdn;
 	Slapi_DN *sdn;
+	char *certify;
 } cleanruv_data;
 } cleanruv_data;
 
 
 /* replutil.c */
 /* replutil.c */

+ 4 - 0
ldap/servers/plugins/replication/repl5_replica.c

@@ -1914,6 +1914,7 @@ done:
         char *ridstr = NULL;
         char *ridstr = NULL;
         char *repl_root;
         char *repl_root;
         char *token = NULL;
         char *token = NULL;
+        char *certify = NULL;
         ReplicaId rid;
         ReplicaId rid;
         int i;
         int i;
 
 
@@ -1935,6 +1936,7 @@ done:
             }
             }
 
 
             repl_root = ldap_utf8strtok_r(iter, ":", &iter);
             repl_root = ldap_utf8strtok_r(iter, ":", &iter);
+            certify = ldap_utf8strtok_r(iter, ":", &iter);
             stop_ruv_cleaning();
             stop_ruv_cleaning();
             maxcsn = replica_get_cleanruv_maxcsn(r, rid);
             maxcsn = replica_get_cleanruv_maxcsn(r, rid);
             delete_cleaned_rid(r, rid, maxcsn);
             delete_cleaned_rid(r, rid, maxcsn);
@@ -1966,6 +1968,7 @@ done:
                     data->payload = payload;
                     data->payload = payload;
                     data->repl_root = slapi_ch_strdup(repl_root);
                     data->repl_root = slapi_ch_strdup(repl_root);
                     data->sdn = slapi_sdn_dup(r->repl_root);
                     data->sdn = slapi_sdn_dup(r->repl_root);
+                    data->certify = slapi_ch_strdup(certify);
 
 
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
                             (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
                             (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
@@ -1976,6 +1979,7 @@ done:
                         slapi_sdn_free(&data->sdn);
                         slapi_sdn_free(&data->sdn);
                         ber_bvfree(data->payload);
                         ber_bvfree(data->payload);
                         slapi_ch_free_string(&data->repl_root);
                         slapi_ch_free_string(&data->repl_root);
+                        slapi_ch_free_string(&data->certify);
                         slapi_ch_free((void **)&data);
                         slapi_ch_free((void **)&data);
                     }
                     }
                 }
                 }

+ 50 - 18
ldap/servers/plugins/replication/repl5_replica_config.c

@@ -1262,7 +1262,8 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
         PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Invalid replica id (%d) for task - (%s)",
         PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Invalid replica id (%d) for task - (%s)",
             rid, slapi_sdn_get_dn(task_dn));
             rid, slapi_sdn_get_dn(task_dn));
         cleanruv_log(task, CLEANALLRUV_ID, "%s", returntext);
         cleanruv_log(task, CLEANALLRUV_ID, "%s", returntext);
-        rc = LDAP_OPERATIONS_ERROR;
+        *returncode = LDAP_OPERATIONS_ERROR;
+        rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
         goto out;
     }
     }
     /*
     /*
@@ -1270,7 +1271,9 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
      */
      */
     dn = slapi_sdn_new_dn_byval(base_dn);
     dn = slapi_sdn_new_dn_byval(base_dn);
     if((r = replica_get_replica_from_dn(dn)) == NULL){
     if((r = replica_get_replica_from_dn(dn)) == NULL){
-        *returncode = LDAP_OPERATIONS_ERROR ;
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Could not find replica from dn(%s)",slapi_sdn_get_dn(dn));
+        cleanruv_log(task, CLEANALLRUV_ID, "%s", returntext);
+        *returncode = LDAP_OPERATIONS_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
         goto out;
     }
     }
@@ -1281,9 +1284,7 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
 out:
 out:
     if(rc){
     if(rc){
         cleanruv_log(task, CLEANALLRUV_ID, "Task failed...(%d)", rc);
         cleanruv_log(task, CLEANALLRUV_ID, "Task failed...(%d)", rc);
-        *returncode = rc;
         slapi_task_finish(task, *returncode);
         slapi_task_finish(task, *returncode);
-        rc = SLAPI_DSE_CALLBACK_ERROR;
     } else {
     } else {
         rc = SLAPI_DSE_CALLBACK_OK;
         rc = SLAPI_DSE_CALLBACK_OK;
     }
     }
@@ -2182,18 +2183,19 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     Replica *replica;
     Replica *replica;
     ReplicaId rid;
     ReplicaId rid;
     cleanruv_data *data = NULL;
     cleanruv_data *data = NULL;
-    Slapi_DN *sdn;
+    Slapi_DN *sdn = NULL;
     Object *r;
     Object *r;
-    CSN *maxcsn;
+    CSN *maxcsn = NULL;
     const char *base_dn;
     const char *base_dn;
     const char *rid_str;
     const char *rid_str;
-    char *ridstr;
+    const char *certify_all;
+    char *ridstr = NULL;
     int rc = SLAPI_DSE_CALLBACK_OK;
     int rc = SLAPI_DSE_CALLBACK_OK;
 
 
     if(get_abort_cleanruv_task_count() >= CLEANRIDSIZ){
     if(get_abort_cleanruv_task_count() >= CLEANRIDSIZ){
         /* we are already running the maximum number of tasks */
         /* we are already running the maximum number of tasks */
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,
-    	    "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",CLEANRIDSIZ);
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",CLEANRIDSIZ);
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OPERATIONS_ERROR;
         *returncode = LDAP_OPERATIONS_ERROR;
         return SLAPI_DSE_CALLBACK_ERROR;
         return SLAPI_DSE_CALLBACK_ERROR;
     }
     }
@@ -2204,17 +2206,20 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
      *  Get our task settings
      *  Get our task settings
      */
      */
     if ((rid_str = fetch_attr(e, "replica-id", 0)) == NULL){
     if ((rid_str = fetch_attr(e, "replica-id", 0)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Missing required attr \"replica-id\"");
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Missing required attr \"replica-id\"");
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
         goto out;
     }
     }
     if ((base_dn = fetch_attr(e, "replica-base-dn", 0)) == NULL){
     if ((base_dn = fetch_attr(e, "replica-base-dn", 0)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Missing required attr \"replica-base-dn\"");
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Missing required attr \"replica-base-dn\"");
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
         goto out;
     }
     }
+    certify_all = fetch_attr(e, "replica-certify-all", 0);
     /*
     /*
      *  Check the rid
      *  Check the rid
      */
      */
@@ -2232,15 +2237,31 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
      */
      */
     sdn = slapi_sdn_new_dn_byval(base_dn);
     sdn = slapi_sdn_new_dn_byval(base_dn);
     if((r = replica_get_replica_from_dn(sdn)) == NULL){
     if((r = replica_get_replica_from_dn(sdn)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Failed to find replica from dn(%s)", base_dn);
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Failed to find replica from dn(%s)", base_dn);
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OPERATIONS_ERROR;
         *returncode = LDAP_OPERATIONS_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
         goto out;
     }
     }
+    /*
+     *  Check verify value
+     */
+    if(certify_all){
+        if(strcasecmp(certify_all,"yes") && strcasecmp(certify_all,"no")){
+            PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Invalid value for \"replica-certify-all\", the value "
+                "must be \"yes\" or \"no\".");
+            cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
+            *returncode = LDAP_OPERATIONS_ERROR;
+            rc = SLAPI_DSE_CALLBACK_ERROR;
+            goto out;
+        }
+    } else {
+        certify_all = "no";
+    }
     /*
     /*
      *  Create payload
      *  Create payload
      */
      */
-    ridstr = slapi_ch_smprintf("%d:%s", rid, base_dn);
+    ridstr = slapi_ch_smprintf("%d:%s:%s", rid, base_dn, certify_all);
     payload = create_ruv_payload(ridstr);
     payload = create_ruv_payload(ridstr);
 
 
     if(payload == NULL){
     if(payload == NULL){
@@ -2274,6 +2295,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     data->rid = rid;
     data->rid = rid;
     data->repl_root = slapi_ch_strdup(base_dn);
     data->repl_root = slapi_ch_strdup(base_dn);
     data->sdn = NULL;
     data->sdn = NULL;
+    data->certify = slapi_ch_strdup(certify_all);
 
 
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
                 (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
                 (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
@@ -2286,7 +2308,6 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     }
     }
 
 
 out:
 out:
-
     csn_free(&maxcsn);
     csn_free(&maxcsn);
     slapi_ch_free_string(&ridstr);
     slapi_ch_free_string(&ridstr);
     slapi_sdn_free(&sdn);
     slapi_sdn_free(&sdn);
@@ -2321,7 +2342,7 @@ replica_abort_task_thread(void *arg)
          * to timing issues, we need to wait to grab the replica obj until we get here.
          * to timing issues, we need to wait to grab the replica obj until we get here.
          */
          */
         if((data->repl_obj = replica_get_replica_from_dn(data->sdn)) == NULL){
         if((data->repl_obj = replica_get_replica_from_dn(data->sdn)) == NULL){
-            cleanruv_log(data->task, ABORT_CLEANALLRUV_ID, "Failed to get replica from dn (%s).", slapi_sdn_get_dn(data->sdn));
+            cleanruv_log(data->task, ABORT_CLEANALLRUV_ID, "Failed to get replica object from dn (%s).", slapi_sdn_get_dn(data->sdn));
             goto done;
             goto done;
         }
         }
         if(data->replica == NULL && data->repl_obj){
         if(data->replica == NULL && data->repl_obj){
@@ -2335,6 +2356,10 @@ replica_abort_task_thread(void *arg)
      */
      */
     while(agmt_not_notified && !slapi_is_shutting_down()){
     while(agmt_not_notified && !slapi_is_shutting_down()){
         agmt_obj = agmtlist_get_first_agreement_for_replica (data->replica);
         agmt_obj = agmtlist_get_first_agreement_for_replica (data->replica);
+        if(agmt_obj == NULL){
+        	agmt_not_notified = 0;
+        	break;
+        }
         while (agmt_obj){
         while (agmt_obj){
             agmt = (Repl_Agmt*)object_get_data (agmt_obj);
             agmt = (Repl_Agmt*)object_get_data (agmt_obj);
             if(!agmt_is_enabled(agmt) || get_agmt_agreement_type(agmt) == REPLICA_TYPE_WINDOWS){
             if(!agmt_is_enabled(agmt) || get_agmt_agreement_type(agmt) == REPLICA_TYPE_WINDOWS){
@@ -2342,8 +2367,14 @@ replica_abort_task_thread(void *arg)
                 continue;
                 continue;
             }
             }
             if(replica_cleanallruv_send_abort_extop(agmt, data->task, data->payload)){
             if(replica_cleanallruv_send_abort_extop(agmt, data->task, data->payload)){
-                agmt_not_notified = 1;
-                break;
+                if(strcasecmp(data->certify,"yes") == 0){
+                    /* we are verifying all the replicas receive the abort task */
+                    agmt_not_notified = 1;
+                    break;
+                } else {
+                    /* we do not care if we could not reach a replica, just continue as if we did */
+                    agmt_not_notified = 0;
+                }
             } else {
             } else {
                 /* success */
                 /* success */
                 agmt_not_notified = 0;
                 agmt_not_notified = 0;
@@ -2373,7 +2404,7 @@ replica_abort_task_thread(void *arg)
 done:
 done:
     if(agmt_not_notified){
     if(agmt_not_notified){
         /* failure */
         /* failure */
-    	cleanruv_log(data->task, ABORT_CLEANALLRUV_ID,"Abort task failed, will resume the task at the next server startup.");
+        cleanruv_log(data->task, ABORT_CLEANALLRUV_ID,"Abort task failed, will resume the task at the next server startup.");
     } else {
     } else {
         /*
         /*
          *  Clean up the config
          *  Clean up the config
@@ -2390,6 +2421,7 @@ done:
         ber_bvfree(data->payload);
         ber_bvfree(data->payload);
     }
     }
     slapi_ch_free_string(&data->repl_root);
     slapi_ch_free_string(&data->repl_root);
+    slapi_ch_free_string(&data->certify);
     slapi_sdn_free(&data->sdn);
     slapi_sdn_free(&data->sdn);
     slapi_ch_free((void **)&data);
     slapi_ch_free((void **)&data);
 }
 }

+ 5 - 0
ldap/servers/plugins/replication/repl_extop.c

@@ -1455,6 +1455,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	char *extop_oid;
 	char *extop_oid;
 	char *repl_root;
 	char *repl_root;
 	char *payload = NULL;
 	char *payload = NULL;
+	char *certify_all;
 	char *iter;
 	char *iter;
 	int rc = 0;
 	int rc = 0;
 
 
@@ -1475,6 +1476,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	}
 	}
 	rid = atoi(ldap_utf8strtok_r(payload, ":", &iter));
 	rid = atoi(ldap_utf8strtok_r(payload, ":", &iter));
 	repl_root = ldap_utf8strtok_r(iter, ":", &iter);
 	repl_root = ldap_utf8strtok_r(iter, ":", &iter);
+	certify_all = ldap_utf8strtok_r(iter, ":", &iter);
 
 
 	if(!is_cleaned_rid(rid) || is_task_aborted(rid)){
 	if(!is_cleaned_rid(rid) || is_task_aborted(rid)){
 		/* This replica has already been aborted, or was never cleaned, or already finished cleaning */
 		/* This replica has already been aborted, or was never cleaned, or already finished cleaning */
@@ -1521,6 +1523,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	data->payload = slapi_ch_bvdup(extop_payload);
 	data->payload = slapi_ch_bvdup(extop_payload);
 	data->rid = rid;
 	data->rid = rid;
 	data->repl_root = slapi_ch_strdup(repl_root);
 	data->repl_root = slapi_ch_strdup(repl_root);
+	data->certify = slapi_ch_strdup(certify_all);
 	/*
 	/*
 	 *  Stop the cleaning, and delete the rid
 	 *  Stop the cleaning, and delete the rid
 	 */
 	 */
@@ -1541,6 +1544,8 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 		}
 		}
 		slapi_log_error( SLAPI_LOG_REPL, repl_plugin_name, "Abort cleanAllRUV task: unable to create abort "
 		slapi_log_error( SLAPI_LOG_REPL, repl_plugin_name, "Abort cleanAllRUV task: unable to create abort "
 			"thread.  Aborting task.\n");
 			"thread.  Aborting task.\n");
+		slapi_ch_free_string(&data->repl_root);
+		slapi_ch_free_string(&data->certify);
 		rc = LDAP_OPERATIONS_ERROR;
 		rc = LDAP_OPERATIONS_ERROR;
 	}
 	}