Browse Source

Resolves: bug 488866
Bug Description: crash in reliab15 test
Reviewed by: nkinder (Thanks!)
Fix Description: My earlier fix was for the case where the result reader thread itself disconnects. However, there is still a problem if the update sender thread disconnects the connection out from under the reader thread. The result reader thread needs to call conn_connected() to check that the connection is still connected before it attempts to access conn->ld. I also improved the error messages so that I can tell whether an error came from the update sender thread or from the result reader thread.
Platforms tested: RHEL5
Flag Day: no
Doc impact: no

Rich Megginson 16 years ago
parent
commit
0e53c2ae9d

+ 15 - 1
ldap/servers/plugins/replication/repl5_connection.c

@@ -306,6 +306,11 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda
 
 
 			while (1) 
 			while (1) 
 			{
 			{
+				if (!conn_connected(conn)) {
+					rc = -1;
+					return_value = CONN_NOT_CONNECTED;
+					break;
+				}
 				rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res);
 				rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res);
 				if (0 != rc)
 				if (0 != rc)
 				{
 				{
@@ -344,11 +349,20 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda
 				conn->last_ldap_error = LDAP_TIMEOUT;
 				conn->last_ldap_error = LDAP_TIMEOUT;
 				return_value = CONN_TIMEOUT;
 				return_value = CONN_TIMEOUT;
 			}
 			}
+			else if ((-1 == rc) && (CONN_NOT_CONNECTED == return_value))
+			{
+				/* must not access conn->ld if disconnected in another thread */
+				/* the other thread that actually did the conn_disconnect() */
+				/* will set the status and error info */
+				slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name,
+								"%s: Connection disconnected by another thread\n",
+								agmt_get_long_name(conn->agmt));
+			}
 			else if (-1 == rc)
 			else if (-1 == rc)
 			{
 			{
 				/* Error */
 				/* Error */
 				char *s = NULL;
 				char *s = NULL;
-		
+
 				rc = ldap_get_lderrno(conn->ld, NULL, &s);
 				rc = ldap_get_lderrno(conn->ld, NULL, &s);
 				conn->last_ldap_errmsg = s;
 				conn->last_ldap_errmsg = s;
 				conn->last_ldap_error = rc;
 				conn->last_ldap_error = rc;

+ 4 - 4
ldap/servers/plugins/replication/repl5_inc_protocol.c

@@ -1798,7 +1798,7 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
 							agmt_inc_last_update_changecount (prp->agmt, csn_get_replicaid(entry.op->csn), 1 /*skipped*/);
 							agmt_inc_last_update_changecount (prp->agmt, csn_get_replicaid(entry.op->csn), 1 /*skipped*/);
 						}
 						}
 						slapi_log_error(finished ? SLAPI_LOG_FATAL : slapi_log_urp, repl_plugin_name,
 						slapi_log_error(finished ? SLAPI_LOG_FATAL : slapi_log_urp, repl_plugin_name,
-							"%s: Consumer failed to replay change (uniqueid %s, CSN %s): %s. %s.\n",
+							"%s: Failed to send update operation to consumer (uniqueid %s, CSN %s): %s. %s.\n",
 							agmt_get_long_name(prp->agmt),
 							agmt_get_long_name(prp->agmt),
 							entry.op->target_address.uniqueid, csn_str,
 							entry.op->target_address.uniqueid, csn_str,
 							ldap_err2string(error),
 							ldap_err2string(error),
@@ -1811,7 +1811,7 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
 						return_value = UPDATE_CONNECTION_LOST;
 						return_value = UPDATE_CONNECTION_LOST;
 						finished = 1;
 						finished = 1;
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
-							"%s: Consumer failed to replay change (uniqueid %s, CSN %s): "
+							"%s: Failed to send update operation to consumer (uniqueid %s, CSN %s): "
 							"%s. Will retry later.\n",
 							"%s. Will retry later.\n",
 							agmt_get_long_name(prp->agmt),
 							agmt_get_long_name(prp->agmt),
 							entry.op->target_address.uniqueid, csn_str,
 							entry.op->target_address.uniqueid, csn_str,
@@ -1822,7 +1822,7 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
 						return_value = UPDATE_TIMEOUT;
 						return_value = UPDATE_TIMEOUT;
 						finished = 1;
 						finished = 1;
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
-							"%s: Consumer timed out to replay change (uniqueid %s, CSN %s): "
+							"%s: Timed out sending update operation to consumer (uniqueid %s, CSN %s): "
 							"%s.\n",
 							"%s.\n",
 							agmt_get_long_name(prp->agmt),
 							agmt_get_long_name(prp->agmt),
 							entry.op->target_address.uniqueid, csn_str,
 							entry.op->target_address.uniqueid, csn_str,
@@ -1837,7 +1837,7 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
 						return_value = UPDATE_TRANSIENT_ERROR;
 						return_value = UPDATE_TRANSIENT_ERROR;
 						finished = 1;
 						finished = 1;
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
-							"%s: Failed to replay change (uniqueid %s, CSN %s): "
+							"%s: Failed to send update operation to consumer (uniqueid %s, CSN %s): "
 							"Local error. Will retry later.\n",
 							"Local error. Will retry later.\n",
 							agmt_get_long_name(prp->agmt),
 							agmt_get_long_name(prp->agmt),
 							entry.op->target_address.uniqueid, csn_str);
 							entry.op->target_address.uniqueid, csn_str);