Node failures related to memcached, but memcached is not used


#1

Hello,

We use couchbase 3.1.2 on our project.

We run 2 distinct clusters of 5 machines, with 8 vCPUs/16 GB RAM.

On cluster 1 there are no issues; everything is fine.
On cluster 2, we lost a huge amount of data 3 times in one week. For some reason several nodes start failing; as we have only one replica, when two or more nodes are down we are in big trouble.

We are currently increasing the number of replicas to 2 and adding 3 nodes, but I would like to find out why Couchbase fails.

We receive this kind of mail from the email alerts when it fails; after that, the node is marked as down in the Couchbase console:

Couchbase Server alert: auto_failover_node

couchbase@cb03
ven. 11/12/2015 12:00
À :
Nicolas Raby;
Rudy Pons;
Sergiu Niculas;

Node (‘ns_1@10.xx.xx.xx’) was automatically failovered.
[{last_heard,{1449,831631,197010}},
{stale_slow_status,{1449,831596,183176}},
{now,{1449,831621,192185}},
{active_buckets,[“bucket_name”]},
{ready_buckets,[]},
{status_latency,9374},
{outgoing_replications_safeness_level,[{“bucket_name”,green}]},
{incoming_replications_conf_hashes,[{“bucket_name”,
[{'ns_1@10.32.xx.xx’,54319253},
{'ns_1@10.32.xx.xx’,106088567},
{'ns_1@10.32.xx.xx’,128028598},
{'ns_1@10.32.xx.xx’,131975223},
{'ns_1@10.32.xx.xx’,11642442},
{'ns_1@10.32.xx.xx’,52056934},
{'ns_1@10.32.xx.xx’,12953452}]}]},
{local_tasks,[[{pid,<<"<0.9106.3660>">>},
{changes_done,115111},
{design_documents,[<<"_design/sns_ids">>]},
{indexer_type,main},
{initial_build,false},
{progress,98},
{set,<<“bucket_name”>>},
{signature,<<“a1d9422beabbf2c0bf94a3f85f1e5e9d”>>},
{started_on,1449831574},
{total_changes,117152},
{type,indexer},
{updated_on,1449831594}],
[{pid,<<"<0.19984.3662>">>},
{changes_done,11482},
{design_documents,[<<"_design/user">>]},
{indexer_type,main},
{initial_build,false},
{progress,9},
{set,<<“bucket_name”>>},
{signature,<<“e752e0f1f0c044a63cd1ea20d67fab95”>>},
{started_on,1449831574},
{total_changes,117820},
{type,indexer},
{updated_on,1449831592}],
[{pid,<<"<0.9189.3663>">>},
{changes_done,1},
{design_documents,[<<"_design/sns_ids">>]},
{indexer_type,replica},
{initial_build,false},
{progress,0},
{set,<<“bucket_name”>>},
{signature,<<“a1d9422beabbf2c0bf94a3f85f1e5e9d”>>},
{started_on,1449831586},
{total_changes,456873},
{type,indexer},
{updated_on,1449831587}],
[{pid,<<"<0.650.3665>">>},
{changes_done,1},
{design_documents,[<<"_design/user">>]},
{indexer_type,replica},
{initial_build,false},
{progress,0},
{set,<<“bucket_name”>>},
{signature,<<“e752e0f1f0c044a63cd1ea20d67fab95”>>},
{started_on,1449831586},
{total_changes,456908},
{type,indexer},
{updated_on,1449831587}],
[{type,xdcr},
{id,<<“cf32d4c44dc997409ef2b23db69d6dc0/bucket_name/bucket_name”>>},
{errors,[<<“2015-12-11 10:53:56 [Vb Rep] Error replicating vbucket 536. Please see logs for details.”>>,
<<“2015-12-11 10:53:58 [Vb Rep] Error replicating vbucket 537. Please see logs for details.”>>,
<<“2015-12-11 10:53:59 [Vb Rep] Error replicating vbucket 535. Please see logs for details.”>>,
<<“2015-12-11 10:54:21 [Vb Rep] Error replicating vbucket 534. Please see logs for details.”>>,
<<“2015-12-11 10:54:34 [Vb Rep] Error replicating vbucket 201. Please see logs for details.”>>,
<<“2015-12-11 10:54:34 [Vb Rep] Error replicating vbucket 201. Please see logs for details.”>>,
<<“2015-12-11 10:54:34 [Vb Rep] Error replicating vbucket 532. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 527. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 530. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 531. Please see logs for details.”>>]},
{changes_left,0},
{docs_checked,9119334},
{docs_written,7819168},
{active_vbreps,0},
{max_vbreps,16},
{waiting_vbreps,0},
{time_working,90681.835416},
{time_committing,0.0},
{time_working_rate,0.39431698160248835},
{num_checkpoints,4659},
{num_failedckpts,27},
{wakeups_rate,67.02578827876921},
{worker_batches_rate,67.87961997658793},
{rate_replication,61.47588224294756},
{bandwidth_usage,1117272.076032005},
{rate_doc_checks,67.87961997658793},
{rate_doc_opt_repd,7.684485280368445},
{meta_latency_aggr,81.03759335582326},
{meta_latency_wt,60.19513469621949},
{docs_latency_aggr,102.04057237461696},
{docs_latency_wt,61.47588224294756}],
[{type,xdcr},
{id,<<“6e8ef4879cf23a41b3a86d6147656fde/bucket_name/bucket_name”>>},
{errors,[<<“2015-12-11 10:49:32 [Vb Rep] Error replicating vbucket 918. Please see logs for details.”>>,
<<“2015-12-11 10:49:33 [Vb Rep] Error replicating vbucket 971. Please see logs for details.”>>,
<<“2015-12-11 10:49:39 [Vb Rep] Error replicating vbucket 970. Please see logs for details.”>>,
<<“2015-12-11 10:53:11 [Vb Rep] Error replicating vbucket 933. Please see logs for details.”>>,
<<“2015-12-11 10:53:27 [Vb Rep] Error replicating vbucket 913. Please see logs for details.”>>,
<<“2015-12-11 10:53:40 [Vb Rep] Error replicating vbucket 902. Please see logs for details.”>>,
<<“2015-12-11 10:54:30 [Vb Rep] Error replicating vbucket 533. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 527. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 530. Please see logs for details.”>>,
<<“2015-12-11 10:54:49 [Vb Rep] Error replicating vbucket 531. Please see logs for details.”>>]},
{changes_left,0},
{docs_checked,14737191},
{docs_written,14600864},
{active_vbreps,0},
{max_vbreps,16},
{waiting_vbreps,0},
{time_working,729678.670095},
{time_committing,0.0},
{time_working_rate,2.1916942920140117},
{num_checkpoints,10416},
{num_failedckpts,0},
{wakeups_rate,67.0352323518009},
{worker_batches_rate,70.02406436748629},
{rate_replication,70.02406436748629},
{bandwidth_usage,1776084.3909528907},
{rate_doc_checks,70.02406436748629},
{rate_doc_opt_repd,7.685568040333861},
{meta_latency_aggr,625.6440932993802},
{meta_latency_wt,62.33849632715243},
{docs_latency_aggr,1361.243900647808},
{docs_latency_wt,70.02406436748629}]]},
{memory,[{total,915102720},
{processes,589867264},
{processes_used,583246272},
{system,325235456},
{atom,686993},
{atom_used,671435},
{binary,184067032},
{code,16585082},
{ets,113973368}]},
{system_memory_data,[{system_total_memory,16840982528},
{free_swap,4183306240},
{total_swap,4194299904},
{cached_memory,5769744384},
{buffered_memory,5541888},
{free_memory,164978688},
{total_memory,16840982528}]},
{node_storage_conf,[{db_path,"/opt/couchbase/var/lib/couchbase/data"},
{index_path,"/opt/couchbase/var/lib/couchbase/data"}]},
{statistics,[{wall_clock,{93083603,4992}},
{context_switches,{2969258813,0}},
{garbage_collection,{662268859,1699977302322,0}},
{io,{{input,1177303557062},{output,972124959535}}},
{reductions,{994321513862,45981179}},
{run_queue,0},
{runtime,{266307170,12090}},
{run_queues,{0,0,0,0,0,0,0,0}}]},
{system_stats,[{cpu_utilization_rate,72.04574332909785},
{swap_total,4194299904},
{swap_used,10993664},
{mem_total,16840982528},
{mem_free,5940170752}]},
{interesting_stats,[{cmd_get,193.80619380619382},
{couch_docs_actual_disk_size,36469613566},
{couch_docs_data_size,12147570786},
{couch_views_actual_disk_size,472475565},
{couch_views_data_size,13667288},
{curr_items,1312139},
{curr_items_tot,3132800},
{ep_bg_fetched,0.999000999000999},
{get_hits,134.86513486513488},
{mem_used,5697327240},
{ops,206.7932067932068},
{vb_replica_curr_items,1820661}]},
{per_bucket_interesting_stats,[{“bucket_name”,
[{cmd_get,193.80619380619382},
{couch_docs_actual_disk_size,36469613566},
{couch_docs_data_size,12147570786},
{couch_views_actual_disk_size,472475565},
{couch_views_data_size,13667288},
{curr_items,1312139},
{curr_items_tot,3132800},
{ep_bg_fetched,0.999000999000999},
{get_hits,134.86513486513488},
{mem_used,5697327240},
{ops,206.7932067932068},
{vb_replica_curr_items,1820661}]}]},
{processes_stats,[{<<“proc/(main)beam.smp/cpu_utilization”>>,0},
{<<“proc/(main)beam.smp/major_faults”>>,0},
{<<“proc/(main)beam.smp/major_faults_raw”>>,315},
{<<“proc/(main)beam.smp/mem_resident”>>,1518813184},
{<<“proc/(main)beam.smp/mem_share”>>,10297344},
{<<“proc/(main)beam.smp/mem_size”>>,6348353536},
{<<“proc/(main)beam.smp/minor_faults”>>,4116},
{<<“proc/(main)beam.smp/minor_faults_raw”>>,332347974},
{<<“proc/(main)beam.smp/page_faults”>>,4116},
{<<“proc/(main)beam.smp/page_faults_raw”>>,332348289},
{<<“proc/beam.smp/cpu_utilization”>>,0},
{<<“proc/beam.smp/major_faults”>>,0},
{<<“proc/beam.smp/major_faults_raw”>>,0},
{<<“proc/beam.smp/mem_resident”>>,28741632},
{<<“proc/beam.smp/mem_share”>>,2621440},
{<<“proc/beam.smp/mem_size”>>,1006260224},
{<<“proc/beam.smp/minor_faults”>>,0},
{<<“proc/beam.smp/minor_faults_raw”>>,10460},
{<<“proc/beam.smp/page_faults”>>,0},
{<<“proc/beam.smp/page_faults_raw”>>,10460},
{<<“proc/memcached/cpu_utilization”>>,250},
{<<“proc/memcached/major_faults”>>,7},
{<<“proc/memcached/major_faults_raw”>>,171},
{<<“proc/memcached/mem_resident”>>,8743337984},
{<<“proc/memcached/mem_share”>>,6537216},
{<<“proc/memcached/mem_size”>>,9148088320},
{<<“proc/memcached/minor_faults”>>,1},
{<<“proc/memcached/minor_faults_raw”>>,2228534},
{<<“proc/memcached/page_faults”>>,8},
{<<“proc/memcached/page_faults_raw”>>,2228705}]},
{cluster_compatibility_version,196608},
{version,[{lhttpc,“1.3.0”},
{os_mon,“2.2.14”},
{public_key,“0.21”},
{asn1,“2.0.4”},
{couch,“2.1.1r-462-g80ef126”},
{kernel,“2.16.4”},
{syntax_tools,“1.6.13”},
{xmerl,“1.3.6”},
{ale,“3.1.2-1815-rel-enterprise”},
{couch_set_view,“2.1.1r-462-g80ef126”},
{compiler,“4.9.4”},
{inets,“5.9.8”},
{mapreduce,“1.0.0”},
{couch_index_merger,“2.1.1r-462-g80ef126”},
{ns_server,“3.1.2-1815-rel-enterprise”},
{oauth,“7d85d3ef”},
{crypto,“3.2”},
{ssl,“5.3.3”},
{sasl,“2.3.4”},
{couch_view_parser,“1.0.0”},
{mochiweb,“2.4.2”},
{stdlib,“1.19.4”}]},
{supported_compat_version,[3,0]},
{advertised_version,[3,1,1]},
{system_arch,“x86_64-unknown-linux-gnu”},
{wall_clock,93083},
{memory_data,{16840982528,16629751808,{<13702.410.0>,354846904}}},
{disk_data,[{"/",4386244,52},
{"/dev/shm",4028832,0},
{"/boot",487652,16},
{"/home",999320,1},
{"/tmp",1999184,1},
{"/var",2934032,41},
{"/opt/couchbase",262110212,37}]},
{meminfo,<<“MemTotal: 16446272 kB\nMemFree: 162848 kB\nBuffers: 5412 kB\nCached: 5632764 kB\nSwapCached: 2688 kB\nActive: 11125364 kB\nInactive: 4593564 kB\nActive(anon): 8563036 kB\nInactive(anon): 1519668 kB\nActive(file): 2562328 kB\nInactive(file): 3073896 kB\nUnevictable: 0 kB\nMlocked: 0 kB\nSwapTotal: 4095996 kB\nSwapFree: 4085260 kB\nDirty: 233992 kB\nWriteback: 0 kB\nAnonPages: 10080344 kB\nMapped: 16248 kB\nShmem: 300 kB\nSlab: 295800 kB\nSReclaimable: 215564 kB\nSUnreclaim: 80236 kB\nKernelStack: 3168 kB\nPageTables: 26740 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 12319132 kB\nCommitted_AS: 11439464 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 291964 kB\nVmallocChunk: 34359441832 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 0 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 10240 kB\nDirectMap2M: 16766976 kB\n”>>}]

The issue seems to be related to memcached, but we use only 1 bucket, and it runs on Couchbase, not memcached.

Any idea what's going on here?