Browse Source

Ticket 467 - CLEANALLRUV abort task should be able to ignore down replicas

Bug Description:  The abort task previously would wait for all the replicas to be online,
                  and receive the extended op task.  This made it impossible to abort the
                  cleanallruv task, if you wanted to abort because a server was down.

Fix Description:  Changed the default behavior to ignore down replicas, and added a new attribute
                  to the abort task entry:  replica-certify-all: (yes,no).  Default is "no". If
                  set to "yes", then it will wait until all the replicas have received the abort task.

                  Also fixed a crash caused by running the abort task agsint a server that does not have
                  replication setup.  The crashed was caused by freeing uninitialized pointers.

https://fedorahosted.org/389/ticket/467

Reviewed by: richm(Thanks!)
Mark Reynolds 13 years ago
parent
commit
ce408229bc

+ 1 - 0
ldap/servers/plugins/replication/repl5.h

@@ -642,6 +642,7 @@ typedef struct _cleanruv_data
 	CSN *maxcsn;
 	char *repl_root;
 	Slapi_DN *sdn;
+	char *certify;
 } cleanruv_data;
 
 /* replutil.c */

+ 4 - 0
ldap/servers/plugins/replication/repl5_replica.c

@@ -1914,6 +1914,7 @@ done:
         char *ridstr = NULL;
         char *repl_root;
         char *token = NULL;
+        char *certify = NULL;
         ReplicaId rid;
         int i;
 
@@ -1935,6 +1936,7 @@ done:
             }
 
             repl_root = ldap_utf8strtok_r(iter, ":", &iter);
+            certify = ldap_utf8strtok_r(iter, ":", &iter);
             stop_ruv_cleaning();
             maxcsn = replica_get_cleanruv_maxcsn(r, rid);
             delete_cleaned_rid(r, rid, maxcsn);
@@ -1966,6 +1968,7 @@ done:
                     data->payload = payload;
                     data->repl_root = slapi_ch_strdup(repl_root);
                     data->sdn = slapi_sdn_dup(r->repl_root);
+                    data->certify = slapi_ch_strdup(certify);
 
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
                             (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
@@ -1976,6 +1979,7 @@ done:
                         slapi_sdn_free(&data->sdn);
                         ber_bvfree(data->payload);
                         slapi_ch_free_string(&data->repl_root);
+                        slapi_ch_free_string(&data->certify);
                         slapi_ch_free((void **)&data);
                     }
                 }

+ 50 - 18
ldap/servers/plugins/replication/repl5_replica_config.c

@@ -1262,7 +1262,8 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
         PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Invalid replica id (%d) for task - (%s)",
             rid, slapi_sdn_get_dn(task_dn));
         cleanruv_log(task, CLEANALLRUV_ID, "%s", returntext);
-        rc = LDAP_OPERATIONS_ERROR;
+        *returncode = LDAP_OPERATIONS_ERROR;
+        rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
     }
     /*
@@ -1270,7 +1271,9 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
      */
     dn = slapi_sdn_new_dn_byval(base_dn);
     if((r = replica_get_replica_from_dn(dn)) == NULL){
-        *returncode = LDAP_OPERATIONS_ERROR ;
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Could not find replica from dn(%s)",slapi_sdn_get_dn(dn));
+        cleanruv_log(task, CLEANALLRUV_ID, "%s", returntext);
+        *returncode = LDAP_OPERATIONS_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
     }
@@ -1281,9 +1284,7 @@ replica_cleanall_ruv_task(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter,
 out:
     if(rc){
         cleanruv_log(task, CLEANALLRUV_ID, "Task failed...(%d)", rc);
-        *returncode = rc;
         slapi_task_finish(task, *returncode);
-        rc = SLAPI_DSE_CALLBACK_ERROR;
     } else {
         rc = SLAPI_DSE_CALLBACK_OK;
     }
@@ -2182,18 +2183,19 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     Replica *replica;
     ReplicaId rid;
     cleanruv_data *data = NULL;
-    Slapi_DN *sdn;
+    Slapi_DN *sdn = NULL;
     Object *r;
-    CSN *maxcsn;
+    CSN *maxcsn = NULL;
     const char *base_dn;
     const char *rid_str;
-    char *ridstr;
+    const char *certify_all;
+    char *ridstr = NULL;
     int rc = SLAPI_DSE_CALLBACK_OK;
 
     if(get_abort_cleanruv_task_count() >= CLEANRIDSIZ){
         /* we are already running the maximum number of tasks */
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,
-    	    "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",CLEANRIDSIZ);
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",CLEANRIDSIZ);
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OPERATIONS_ERROR;
         return SLAPI_DSE_CALLBACK_ERROR;
     }
@@ -2204,17 +2206,20 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
      *  Get our task settings
      */
     if ((rid_str = fetch_attr(e, "replica-id", 0)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Missing required attr \"replica-id\"");
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Missing required attr \"replica-id\"");
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
     }
     if ((base_dn = fetch_attr(e, "replica-base-dn", 0)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Missing required attr \"replica-base-dn\"");
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Missing required attr \"replica-base-dn\"");
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OBJECT_CLASS_VIOLATION;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
     }
+    certify_all = fetch_attr(e, "replica-certify-all", 0);
     /*
      *  Check the rid
      */
@@ -2232,15 +2237,31 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
      */
     sdn = slapi_sdn_new_dn_byval(base_dn);
     if((r = replica_get_replica_from_dn(sdn)) == NULL){
-        cleanruv_log(task, ABORT_CLEANALLRUV_ID,"Failed to find replica from dn(%s)", base_dn);
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Failed to find replica from dn(%s)", base_dn);
+        cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
         *returncode = LDAP_OPERATIONS_ERROR;
         rc = SLAPI_DSE_CALLBACK_ERROR;
         goto out;
     }
+    /*
+     *  Check verify value
+     */
+    if(certify_all){
+        if(strcasecmp(certify_all,"yes") && strcasecmp(certify_all,"no")){
+            PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE, "Invalid value for \"replica-certify-all\", the value "
+                "must be \"yes\" or \"no\".");
+            cleanruv_log(task, ABORT_CLEANALLRUV_ID, "%s", returntext);
+            *returncode = LDAP_OPERATIONS_ERROR;
+            rc = SLAPI_DSE_CALLBACK_ERROR;
+            goto out;
+        }
+    } else {
+        certify_all = "no";
+    }
     /*
      *  Create payload
      */
-    ridstr = slapi_ch_smprintf("%d:%s", rid, base_dn);
+    ridstr = slapi_ch_smprintf("%d:%s:%s", rid, base_dn, certify_all);
     payload = create_ruv_payload(ridstr);
 
     if(payload == NULL){
@@ -2274,6 +2295,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     data->rid = rid;
     data->repl_root = slapi_ch_strdup(base_dn);
     data->sdn = NULL;
+    data->certify = slapi_ch_strdup(certify_all);
 
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
                 (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
@@ -2286,7 +2308,6 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter
     }
 
 out:
-
     csn_free(&maxcsn);
     slapi_ch_free_string(&ridstr);
     slapi_sdn_free(&sdn);
@@ -2321,7 +2342,7 @@ replica_abort_task_thread(void *arg)
          * to timing issues, we need to wait to grab the replica obj until we get here.
          */
         if((data->repl_obj = replica_get_replica_from_dn(data->sdn)) == NULL){
-            cleanruv_log(data->task, ABORT_CLEANALLRUV_ID, "Failed to get replica from dn (%s).", slapi_sdn_get_dn(data->sdn));
+            cleanruv_log(data->task, ABORT_CLEANALLRUV_ID, "Failed to get replica object from dn (%s).", slapi_sdn_get_dn(data->sdn));
             goto done;
         }
         if(data->replica == NULL && data->repl_obj){
@@ -2335,6 +2356,10 @@ replica_abort_task_thread(void *arg)
      */
     while(agmt_not_notified && !slapi_is_shutting_down()){
         agmt_obj = agmtlist_get_first_agreement_for_replica (data->replica);
+        if(agmt_obj == NULL){
+        	agmt_not_notified = 0;
+        	break;
+        }
         while (agmt_obj){
             agmt = (Repl_Agmt*)object_get_data (agmt_obj);
             if(!agmt_is_enabled(agmt) || get_agmt_agreement_type(agmt) == REPLICA_TYPE_WINDOWS){
@@ -2342,8 +2367,14 @@ replica_abort_task_thread(void *arg)
                 continue;
             }
             if(replica_cleanallruv_send_abort_extop(agmt, data->task, data->payload)){
-                agmt_not_notified = 1;
-                break;
+                if(strcasecmp(data->certify,"yes") == 0){
+                    /* we are verifying all the replicas receive the abort task */
+                    agmt_not_notified = 1;
+                    break;
+                } else {
+                    /* we do not care if we could not reach a replica, just continue as if we did */
+                    agmt_not_notified = 0;
+                }
             } else {
                 /* success */
                 agmt_not_notified = 0;
@@ -2373,7 +2404,7 @@ replica_abort_task_thread(void *arg)
 done:
     if(agmt_not_notified){
         /* failure */
-    	cleanruv_log(data->task, ABORT_CLEANALLRUV_ID,"Abort task failed, will resume the task at the next server startup.");
+        cleanruv_log(data->task, ABORT_CLEANALLRUV_ID,"Abort task failed, will resume the task at the next server startup.");
     } else {
         /*
          *  Clean up the config
@@ -2390,6 +2421,7 @@ done:
         ber_bvfree(data->payload);
     }
     slapi_ch_free_string(&data->repl_root);
+    slapi_ch_free_string(&data->certify);
     slapi_sdn_free(&data->sdn);
     slapi_ch_free((void **)&data);
 }

+ 5 - 0
ldap/servers/plugins/replication/repl_extop.c

@@ -1455,6 +1455,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	char *extop_oid;
 	char *repl_root;
 	char *payload = NULL;
+	char *certify_all;
 	char *iter;
 	int rc = 0;
 
@@ -1475,6 +1476,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	}
 	rid = atoi(ldap_utf8strtok_r(payload, ":", &iter));
 	repl_root = ldap_utf8strtok_r(iter, ":", &iter);
+	certify_all = ldap_utf8strtok_r(iter, ":", &iter);
 
 	if(!is_cleaned_rid(rid) || is_task_aborted(rid)){
 		/* This replica has already been aborted, or was never cleaned, or already finished cleaning */
@@ -1521,6 +1523,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 	data->payload = slapi_ch_bvdup(extop_payload);
 	data->rid = rid;
 	data->repl_root = slapi_ch_strdup(repl_root);
+	data->certify = slapi_ch_strdup(certify_all);
 	/*
 	 *  Stop the cleaning, and delete the rid
 	 */
@@ -1541,6 +1544,8 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
 		}
 		slapi_log_error( SLAPI_LOG_REPL, repl_plugin_name, "Abort cleanAllRUV task: unable to create abort "
 			"thread.  Aborting task.\n");
+		slapi_ch_free_string(&data->repl_root);
+		slapi_ch_free_string(&data->certify);
 		rc = LDAP_OPERATIONS_ERROR;
 	}