[Linux-HA] RE: Failover of resource

Andrew Beekhof beekhof at gmail.com
Fri Jul 13 04:52:35 MDT 2007


On 7/13/07, Taldevkar, Chetan <chetan.taldevkar at patni.com> wrote:
>
> Message: 4
>
> Hi Andrew,
>
> Please ignore my earlier 2 mails as they were with node 1 and node 2
> having same score (1500). The below cib.xml is the result of exact
> forced_failedover configuration given on the site.
>
> 1. I tried with default_resource_failure_stickiness = -INFINITY and
> default_resource_stickiness = 0. The failover happens to another node.
> But in case second node fails it does not fail over to first node. Is
> this due to failcount on node one?


yes


> How to take care of this?


it's a juggling act really.

if you want a single resource to move from one node to the other, then you
need to set default_resource_failure_stickiness > default_resource_stickiness

but if you want it to move back again, you
need default_resource_failure_stickiness < rsc_location.score

in fact, the number of times it can ping-pong between the nodes is
(rsc_location.score / default_resource_failure_stickiness)


now having said that, things get a whole lot trickier when groups are
involved because some values start being multiplied by the number of resources
in the group.

which is not ideal and i hope to find the time to fix it one of these days



> 2. This is the output from cibadmin -Q for forced_failover. It
> continues to run monitor part of the script on failed node even after
> failing over to another node. The crm_failcount -G -U on failed node
> return failcount as 3.
>
> <cib generated="true" admin_epoch="0" epoch="1" num_updates="23"
> have_quorum="true" ignore_dtd="false" num_peers="2" ccm_transition="2"
> cib_feature_revision="1.3"
> dc_uuid="1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553">
>    <configuration>
>      <crm_config>
>        <cluster_property_set id="cib-bootstrap-options">
>          <attributes>
>            <nvpair id="symmetric_cluster" name="symmetric_cluster"
> value="true"/>
>            <nvpair id="no_quorum_policy" name="no_quorum_policy"
> value="stop"/>
>            <nvpair id="default_resource_stickiness"
> name="default_resource_stickiness" value="500"/>
>            <nvpair id="default_resource_failure_stickiness"
> name="default_resource_failure_stickiness" value="-1001"/>
>            <nvpair id="stonith_enabled" name="stonith_enabled"
> value="false"/>
>            <nvpair id="stop_orphan_resources"
> name="stop_orphan_resources" value="true"/>
>            <nvpair id="stop_orphan_actions" name="stop_orphan_actions"
> value="true"/>
>            <nvpair id="remove_after_stop" name="remove_after_stop"
> value="true"/>
>            <nvpair id="is_managed_default" name="is_managed_default"
> value="true"/>
>            <nvpair id="short_resource_names" name="short_resource_names"
> value="true"/>
>          </attributes>
>        </cluster_property_set>
>      </crm_config>
>      <nodes>
>        <node id="1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553"
> uname="wabtecwl1.patni.com" type="normal"/>
>        <node id="5426e37c-9469-40a3-813c-eebeb0b7c6a0"
> uname="wabtectestconfig.patni.com" type="normal"/>
>      </nodes>
>      <resources>
>        <group id="group_org" collocated="true" ordered="true">
>          <primitive class="ocf" id="IPaddr_1" provider="heartbeat"
> type="IPaddr">
>            <operations/>
>            <instance_attributes id="i1">
>              <attributes>
>                <nvpair id="id1" name="ip" value="172.20.1.94"/>
>                <nvpair id="mask1" name="netmask" value="24"/>
>                <nvpair id="nic1" name="nic" value="eth0"/>
>              </attributes>
>            </instance_attributes>
>          </primitive>
>          <primitive id="res_ttsvc" class="ocf" type="ttsvc.sh"
> provider="heartbeat">
>            <instance_attributes id="res_ttsvc_instance_attrs">
>              <attributes/>
>            </instance_attributes>
>            <operations>
>              <op id="tt_start_1" name="start" description="begin op"
> timeout="3s" start_delay="0" disabled="false"/>
>              <op id="tt_status_1" name="monitor" description="check
> state" interval="2s" timeout="3s" start_delay="0" disabled="false"/>
>              <op id="tt_stop_1" name="stop" description="stop status
> check" timeout="2s" start_delay="0" disabled="false"/>
>            </operations>
>          </primitive>
>        </group>
>      </resources>
>      <constraints>
>        <rsc_location id="place_testconfig" rsc="group_org">
>          <rule id="prefered_testconfig" score="1500">
>            <expression id="e1" attribute="#uname" operation="eq"
> value="wabtectestconfig.patni.com"/>
>          </rule>
>        </rsc_location>
>        <rsc_location id="place_wl1config" rsc="group_org">
>          <rule id="prefered_wl1config" score="1000">
>            <expression id="e2" attribute="#uname" operation="eq"
> value="wabtectwl1.patni.com"/>
>          </rule>
>        </rsc_location>
>      </constraints>
>    </configuration>
>    <status>
>      <node_state id="1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553"
> uname="wabtecwl1.patni.com" crmd="online"
> crm-debug-origin="do_update_resource" shutdown="0" in_ccm="true"
> ha="active" join="member" expected="member">
>        <lrm id="1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553">
>          <lrm_resources>
>            <lrm_resource id="IPaddr_1" type="IPaddr" class="ocf"
> provider="heartbeat">
>              <lrm_rsc_op id="IPaddr_1_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource"
> transition_key="3:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:7;3:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="2" crm_feature_set="1.0.7" rc_code="7" op_status="0"
> interval="0" op_digest="8d3445b5418271954723b32be9490d16"/>
>            </lrm_resource>
>            <lrm_resource id="res_ttsvc" type="ttsvc.sh" class="ocf"
> provider="heartbeat">
>              <lrm_rsc_op id="res_ttsvc_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource"
> transition_key="4:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="4:1;4:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="3" crm_feature_set="1.0.7" rc_code="1" op_status="4"
> interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>              <lrm_rsc_op id="res_ttsvc_stop_0" operation="stop"
> crm-debug-origin="do_update_resource"
> transition_key="1:1:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:0;1:1:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="4" crm_feature_set="1.0.7" rc_code="0" op_status="0"
> interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>            </lrm_resource>
>          </lrm_resources>
>        </lrm>
>        <transient_attributes id="1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553">
>          <instance_attributes
> id="status-1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553">
>            <attributes>
>              <nvpair
> id="status-1c3fdfbd-ee55-47e3-a8c2-52f34a5c5553-probe_complete"
> name="probe_complete" value="true"/>
>            </attributes>
>          </instance_attributes>
>        </transient_attributes>
>      </node_state>
>      <node_state id="5426e37c-9469-40a3-813c-eebeb0b7c6a0"
> uname="wabtectestconfig.patni.com" crmd="online"
> crm-debug-origin="do_update_resource" in_ccm="true" ha="active"
> join="member" expected="member" shutdown="0">
>        <lrm id="5426e37c-9469-40a3-813c-eebeb0b7c6a0">
>          <lrm_resources>
>            <lrm_resource id="IPaddr_1" type="IPaddr" class="ocf"
> provider="heartbeat">
>              <lrm_rsc_op id="IPaddr_1_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource"
> transition_key="6:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:7;6:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="2" crm_feature_set="1.0.7" rc_code="7" op_status="0"
> interval="0" op_digest="8d3445b5418271954723b32be9490d16"/>
>              <lrm_rsc_op id="IPaddr_1_start_0" operation="start"
> crm-debug-origin="do_update_resource"
> transition_key="4:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:0;4:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="5" crm_feature_set="1.0.7" rc_code="0" op_status="0"
> interval="0" op_digest="8d3445b5418271954723b32be9490d16"/>
>            </lrm_resource>
>            <lrm_resource id="res_ttsvc" type="ttsvc.sh" class="ocf"
> provider="heartbeat">
>              <lrm_rsc_op id="res_ttsvc_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource"
> transition_key="7:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:0;7:0:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="3" crm_feature_set="1.0.7" rc_code="0" op_status="0"
> interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>              <lrm_rsc_op id="res_ttsvc_stop_0" operation="stop"
> crm-debug-origin="do_update_resource"
> transition_key="2:4:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:0;2:4:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="9" crm_feature_set="1.0.7" rc_code="0" op_status="0"
> interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>              <lrm_rsc_op id="res_ttsvc_start_0" operation="start"
> crm-debug-origin="do_update_resource"
> transition_key="5:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="0:0;5:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="6" crm_feature_set="1.0.7" rc_code="0" op_status="0"
> interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>              <lrm_rsc_op id="res_ttsvc_monitor_2000" operation="monitor"
> crm-debug-origin="do_update_resource"
> transition_key="6:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> transition_magic="2:-1;6:2:acf06009-6ac9-48b2-b58f-04a04080b561"
> call_id="7" crm_feature_set="1.0.7" rc_code="-1" op_status="2"
> interval="2000" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
>            </lrm_resource>
>          </lrm_resources>
>        </lrm>
>        <transient_attributes id="5426e37c-9469-40a3-813c-eebeb0b7c6a0">
>          <instance_attributes
> id="status-5426e37c-9469-40a3-813c-eebeb0b7c6a0">
>            <attributes>
>              <nvpair
> id="status-5426e37c-9469-40a3-813c-eebeb0b7c6a0-probe_complete"
> name="probe_complete" value="true"/>
>              <nvpair
> id="status-5426e37c-9469-40a3-813c-eebeb0b7c6a0-fail-count-res_ttsvc"
> name="fail-count-res_ttsvc" value="2"/>
>            </attributes>
>          </instance_attributes>
>        </transient_attributes>
>      </node_state>
>    </status>
> </cib>
>
>
> Thanks,
> Chetan
>
> http://www.patni.com
> World-Wide Partnerships. World-Class Solutions.
> _____________________________________________________________________
>
> This e-mail message may contain proprietary, confidential or legally
> privileged information for the sole use of the person or entity to
> whom this message was originally addressed. Any review, e-transmission
> dissemination or other use of or taking of any action in reliance upon
> this information by persons or entities other than the intended
> recipient is prohibited. If you have received this e-mail in error
> kindly delete  this e-mail from your records. If it appears that this
> mail has been forwarded to you without proper authority, please notify
> us immediately at netadmin at patni.com and delete this mail.
> _____________________________________________________________________
>
> _______________________________________________
> Linux-HA mailing list
> Linux-HA at lists.linux-ha.org
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
>



More information about the Linux-HA mailing list