[Linux-ha-dev] client sending messages stuck heartbeat

Guochun Shi gshi at ncsa.uiuc.edu
Fri Oct 8 02:03:25 MDT 2004


hi,

I think I found a bug.
With CVS head, replace lib/hbclient/api_test.c with the attached sender.c and compile it.

1. start heartbeat in both machines (A and B)
2. start api_test in one machine (A)
     api_test will keep sending ordered messages to the cluster, one per second,
     so this is not a flow control problem

After some time (~2 minutes on my machines), B declared A dead and took over A's resources.
There is nothing wrong in A's log.

3. kill api_test on A; A then starts to "wake up" and finds itself declared dead, and then both machines restart.
It seems the master process was stuck somewhere in code related to the client.

here is ha.cf 

debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility     local7
keepalive 2
deadtime 10
warntime 10
initdead 20
udpport 694
bcast   eth0            # Linux
bcast   eth1            # Linux
auto_failback off
node    posic066
node    posic067
apiauth ping gid=haclient uid=gshi,root
apiauth ccm  gid=haclient uid=root
apiauth evms gid=haclient uid=root
apiauth ipfail gid=haclient uid=gshi,root

and haresources

posic066  141.142.61.111
posic067 141.142.61.112


-Guochun
-------------- next part --------------
/* $Id: api_test.c,v 1.5 2004/05/17 15:12:08 lars Exp $ */
/* 
 * api_test: Test program for testing the heartbeat API
 *
 * Copyright (C) 2000 Alan Robertson <alanr at unix.sh>
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <portability.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <clplumbing/cl_log.h>
#include <clplumbing/cl_signal.h>
#include <clplumbing/cl_malloc.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <stdarg.h>
#include <syslog.h>
#include <hb_api_core.h>
#include <hb_api.h>

/*
 * A heartbeat API test program...
 */

/*
 * Status callbacks that main() registers with the heartbeat API object
 * (set_nstatus_callback, set_ifstatus_callback, set_cstatus_callback).
 * Each one logs the status change it is handed; the trailing void*
 * argument is the private pointer given at registration (NULL in main()).
 */
void NodeStatus(const char * node, const char * status, void * private);
void LinkStatus(const char * node, const char *, const char *, void*);
void ClientStatus(const char * node, const char *, const char *, void*);
/* Signal handler that main() installs for SIGINT; it sets quitnow. */
void gotsig(int nsig);

/*
 * NodeStatus: node status callback.
 *
 * Logs, at LOG_NOTICE, that 'node' now has status 'status'.
 * 'private' is the opaque pointer supplied at registration; it is not used.
 */
void
NodeStatus(const char * node, const char * status, void * private)
{
	(void)private;

	cl_log(LOG_NOTICE,
		"Status update: Node %s now has status %s\n",
		node, status);
}

/*
 * LinkStatus: link (interface) status callback.
 *
 * Logs, at LOG_NOTICE, that link 'lnk' of 'node' now has status 'status'.
 * 'private' is the opaque pointer supplied at registration; it is not used.
 */
void
LinkStatus(const char * node, const char * lnk, const char * status,
	void * private)
{
	(void)private;

	cl_log(LOG_NOTICE,
		"Link Status update: Link %s/%s now has status %s\n",
		node, lnk, status);
}

/*
 * ClientStatus: client status callback.
 *
 * Logs, at LOG_INFO, that client 'client' on 'node' now has status 'status'.
 * 'private' is the opaque pointer supplied at registration; it is not used.
 */
void
ClientStatus(const char * node, const char * client, const char * status,
	void * private)
{
	(void)private;

	cl_log(LOG_INFO,
		"Status update: Client %s/%s now has status [%s]\n",
		node, client, status);
}

/*
 * Quit flag: 0 until gotsig() runs, 1 afterwards.
 *
 * It is written from a signal handler and polled by the message-reading
 * loop in main(), so it is declared volatile sig_atomic_t: that is the
 * only object type the C standard guarantees can be safely written from
 * an asynchronous signal handler and observed by the interrupted code.
 */
volatile sig_atomic_t quitnow = 0;

/*
 * gotsig: signal handler that requests termination.
 *
 * nsig is the number of the delivered signal; it is ignored.
 * The handler only sets quitnow, which keeps it async-signal-safe.
 */
void
gotsig(int nsig)
{
	(void)nsig;
	quitnow = 1;
}

/*
 * Names of the configuration parameters that main() queries through
 * get_parameter() and treats as mandatory: a NULL result for any of
 * these is logged as an error.
 */
const char * mandparms[] = {
	KEY_HBVERSION,
	KEY_HOPS,
	KEY_KEEPALIVE,
	KEY_DEADTIME,
	KEY_DEADPING,
	KEY_WARNTIME,
	KEY_INITDEAD,
	KEY_BAUDRATE,
	KEY_UDPPORT,
	KEY_AUTOFAIL,
	KEY_GEN_METH,
	KEY_REALTIME,
	KEY_DEBUGLEVEL,
	KEY_NORMALPOLL
};

/*
 * Names of the configuration parameters that main() queries through
 * get_parameter() and treats as optional: a NULL result for any of
 * these is silently skipped.
 */
const char * optparms[] = {
	KEY_LOGFILE,
	KEY_DBGFILE,
	KEY_FACILITY,
	KEY_RT_PRIO,
	KEY_WATCHDOG
};



/*
 * main: sign on to heartbeat as client "ping" and send one ordered
 * cluster message per second, forever.
 *
 * Steps on the reachable path:
 *   1. Configure logging and create the heartbeat API object.
 *   2. Sign on as "ping" and register the node, link and client
 *      status callbacks.
 *   3. Issue a client_status() request with a timeout of -1.
 *   4. Build a message of type "ping" and send it with
 *      send_ordered_clustermsg() once a second in an endless loop.
 *
 * The send loop never terminates, so everything that follows it (the
 * original api_test body: parameter dump, node/interface walk, ping
 * exchange, sign-off) is unreachable in this variant.  It is kept
 * unchanged so the file can be turned back into the full test by
 * removing the loop.
 *
 * Exit status: 1-11 for the individual API failures (as logged), 12 if
 * the API object cannot be created, 13 if the ping message cannot be
 * built.  The process otherwise runs until it is killed.
 */
int
main(int argc, char ** argv)
{
	struct ha_msg*	reply;
	struct ha_msg*	pingreq = NULL;
	unsigned	fmask;
	ll_cluster_t*	hb;
	const char *	node;
	const char *	intf;
	int		msgcount=0;
	char *		ctmp;
	const char *	cval;
	int		j;
	const char *	cstatus;
	int		timeout = 100; /* milliseconds */

	cl_log_set_entity(argv[0]);
	cl_log_enable_stderr(TRUE);
	cl_log_set_facility(LOG_USER);
	hb = ll_cluster_new("heartbeat");
	/* Every call below goes through hb->llc_ops, so hb must be valid */
	if (hb == NULL) {
		cl_log(LOG_ERR, "Cannot create heartbeat API object\n");
		exit(12);
	}
	cl_log(LOG_INFO, "PID=%ld\n", (long)getpid());
	cl_log(LOG_INFO, "Signing in with heartbeat\n");
	if (hb->llc_ops->signon(hb, "ping")!= HA_OK) {
		cl_log(LOG_ERR, "Cannot sign on with heartbeat\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(1);
	}

	if (hb->llc_ops->set_nstatus_callback(hb, NodeStatus, NULL) !=HA_OK){
		cl_log(LOG_ERR, "Cannot set node status callback\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(2);
	}

	if (hb->llc_ops->set_ifstatus_callback(hb, LinkStatus, NULL)!=HA_OK){
		cl_log(LOG_ERR, "Cannot set if status callback\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(3);
	}

	if (hb->llc_ops->set_cstatus_callback(hb, ClientStatus, NULL)!=HA_OK){
		cl_log(LOG_ERR, "Cannot set client status callback\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(4);
	}

/* Async get client status information in the cluster */
	hb->llc_ops->client_status(hb, NULL , NULL, -1); 

	/* Build the ping message once; the same message is sent every second */
	pingreq = ha_msg_new(0);
	if (pingreq == NULL) {
		cl_log(LOG_ERR, "Cannot create ping message\n");
		exit(13);
	}
	if (ha_msg_add(pingreq, F_TYPE, "ping") != HA_OK) {
		cl_log(LOG_ERR, "Cannot add type field to ping message\n");
		exit(13);
	}

	/*
	 * Endless send loop: this is the reproduction case, it only ends
	 * when the process is killed.  A failed send is now reported
	 * instead of being silently dropped.
	 */
	while (1){
		sleep(1);
		cl_log(LOG_INFO, "Sleeping...\n");

		if (hb->llc_ops->send_ordered_clustermsg(hb, pingreq) == HA_OK) {
			cl_log(LOG_INFO, "Sent ping request to cluster\n");
		}else{
			cl_log(LOG_ERR, "PING request FAIL to cluster\n");
			cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		}
	}

	/* NOTE: everything from here to the end of main() is unreachable. */

#if 0
	fmask = LLC_FILTER_RAW;
#else
	fmask = LLC_FILTER_DEFAULT;
#endif

#if 1	

	/* This isn't necessary -- you don't need this call - it's just for testing... */
	cl_log(LOG_INFO, "Setting message filter mode\n");
	if (hb->llc_ops->setfmode(hb, fmask) != HA_OK) {
		cl_log(LOG_ERR, "Cannot set filter mode\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(4);
	}

	/* Dump the mandatory parameters; a missing one is an error */
	for (j=0; j < DIMOF(mandparms); ++j) {
		if ((ctmp = hb->llc_ops->get_parameter(hb, mandparms[j])) != NULL) {
			cl_log(LOG_INFO, "Parameter %s is [%s]"
			,	mandparms[j]
			,	ctmp);
			cl_free(ctmp); ctmp = NULL;
		}else{
			cl_log(LOG_ERR, "Mandantory Parameter %s is not available!"
			,	mandparms[j]);
		}
	}
	/* Dump the optional parameters; missing ones are skipped */
	for (j=0; j < DIMOF(optparms); ++j) {
		if ((ctmp = hb->llc_ops->get_parameter(hb, optparms[j])) != NULL) {
			cl_log(LOG_INFO, "Optional Parameter %s is [%s]"
			,	optparms[j]
			,	ctmp);
			cl_free(ctmp); ctmp = NULL;
		}
	}
	if ((cval = hb->llc_ops->get_resources(hb)) == NULL) {
		cl_perror("Cannot get resource status");
		cl_log(LOG_ERR, "REASON: %s\n"
		,	hb->llc_ops->errmsg(hb));
	}else{
		cl_log(LOG_INFO, "Current resource status: %s", cval);
	}


	/* Walk all nodes, and for each node all of its interfaces */
	cl_log(LOG_INFO, "Starting node walk\n");
	if (hb->llc_ops->init_nodewalk(hb) != HA_OK) {
		cl_log(LOG_ERR, "Cannot start node walk\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(5);
	}
	while((node = hb->llc_ops->nextnode(hb))!= NULL) {
		cl_log(LOG_INFO, "Cluster node: %s: status: %s\n", node
		,	hb->llc_ops->node_status(hb, node));
		if (hb->llc_ops->init_ifwalk(hb, node) != HA_OK) {
			cl_log(LOG_ERR, "Cannot start if walk\n");
			cl_log(LOG_ERR, "REASON: %s\n"
			,	hb->llc_ops->errmsg(hb));
			exit(6);
		}
		while ((intf = hb->llc_ops->nextif(hb))) {
			cl_log(LOG_INFO, "\tnode %s: intf: %s ifstatus: %s\n"
			,	node, intf
			,	hb->llc_ops->if_status(hb, node, intf));
		}
		if (hb->llc_ops->end_ifwalk(hb) != HA_OK) {
			cl_log(LOG_ERR, "Cannot end if walk\n");
			cl_log(LOG_ERR, "REASON: %s\n"
			,	hb->llc_ops->errmsg(hb));
			exit(7);
		}
		/* A NULL result is reported as "timeout" */
		cstatus = hb->llc_ops->client_status(hb, node, "ping", timeout);
		cl_log(LOG_INFO, "%s/api_test status: [%s]\n", node
		,	cstatus == NULL ? "timeout" : cstatus);
	}
	if (hb->llc_ops->end_nodewalk(hb) != HA_OK) {
		cl_log(LOG_ERR, "Cannot end node walk\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(8);
	}

	CL_SIGINTERRUPT(SIGINT, 1);
	CL_SIGNAL(SIGINT, gotsig);

#if 0
	/* This is not necessary either ;-) */
	cl_log(LOG_INFO, "Setting message signal\n");
	if (hb->llc_ops->setmsgsignal(hb, 0) != HA_OK) {
		cl_log(LOG_ERR, "Cannot set message signal\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(9);
	}

#endif
	pingreq = ha_msg_new(0);
	ha_msg_add(pingreq, F_TYPE, "ping");
	cl_log(LOG_INFO, "Sleeping...\n");
	sleep(2);

	if (hb->llc_ops->sendclustermsg(hb, pingreq) == HA_OK) {
		cl_log(LOG_INFO, "Sent ping request to cluster\n");
	}else{
		cl_log(LOG_ERR, "PING request FAIL to cluster\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
	}

	/* Read messages until readmsg() returns NULL or gotsig() sets quitnow */
	cl_log(LOG_INFO, "Waiting for messages...\n");
	errno = 0;
	for(; !quitnow && (reply=hb->llc_ops->readmsg(hb, 1)) != NULL;) {
		const char *	type;
		const char *	orig;
		++msgcount;
		if ((type = ha_msg_value(reply, F_TYPE)) == NULL) {
			type = "?";
		}
		if ((orig = ha_msg_value(reply, F_ORIG)) == NULL) {
			orig = "?";
		}
		cl_log(LOG_NOTICE, "Got message %d of type [%s] from [%s]\n"
		,	msgcount, type, orig);
		if (strcasecmp(type, T_APICLISTAT) == 0) {
			cl_log_message(reply);
			cl_log(LOG_NOTICE, "%s", hb->llc_ops->errmsg(hb));
		}

		/* Answer each "ping" with ten "pingreply" messages to its sender */
		if (strcmp(type, "ping") ==0) {
			struct ha_msg*	pingreply = ha_msg_new(4);
			int	count;

			ha_msg_add(pingreply, F_TYPE, "pingreply");

			for (count=0; count < 10; ++count) {
				if (hb->llc_ops->sendnodemsg(hb, pingreply, orig)
				==	HA_OK) {
					cl_log(LOG_INFO
					,	"Sent ping reply(%d) to [%s]\n"
					,	count, orig);
				}else{
					cl_log(LOG_ERR, "PING %d FAIL to [%s]\n"
					,	count, orig);
				}
			}
			ha_msg_del(pingreply); pingreply=NULL;
		}
		ha_msg_del(reply); reply=NULL;
	}

#endif 
	if (!quitnow) {
		cl_log(LOG_ERR, "read_hb_msg returned NULL");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
	}
	if (hb->llc_ops->signoff(hb) != HA_OK) {
		cl_log(LOG_ERR, "Cannot sign off from heartbeat.\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(10);
	}
	if (hb->llc_ops->delete(hb) != HA_OK) {
		cl_log(LOG_ERR, "Cannot delete API object.\n");
		cl_log(LOG_ERR, "REASON: %s\n", hb->llc_ops->errmsg(hb));
		exit(11);
	}

	return 0;
}


More information about the Linux-HA-Dev mailing list