Skip to content

CCR Rolling upgrade failure #39258

Closed
Closed
@benwtrent

Description

@benwtrent

https://elasticsearch-ci.elastic.co/job/elastic+elasticsearch+master+release-tests/478/console

https://elasticsearch-ci.elastic.co/job/elastic+elasticsearch+master+bwc-tests/474/console

Both builds failed to the same problem

[2019-02-21T15:10:42,812][DEBUG][o.e.a.a.c.h.TransportClusterHealthAction] [upgraded-node-follower-0] no known master node, scheduling a retry
|    [2019-02-21T15:10:43,748][INFO ][o.e.c.c.JoinHelper       ] [upgraded-node-follower-0] failed to join {node-1}{7iLNddDJSOOR1K39-oLSdQ}{SwgTxJ6FSzyhjJ3BbrBbxg}{127.0.0.1}{127.0.0.1:41039}{testattr=test, xpack.installed=true} with JoinRequest{sourceNode={upgraded-node-follower-0}{HxHXwQW9SJOeTYWF1sFi4w}{Q4vSZxxZTz24xXlw_m0nuw}{127.0.0.1}{127.0.0.1:39352}{upgraded=true, xpack.installed=true, testattr=test}, optionalJoin=Optional[Join{term=4, lastAcceptedTerm=3, lastAcceptedVersion=24, sourceNode={upgraded-node-follower-0}{HxHXwQW9SJOeTYWF1sFi4w}{Q4vSZxxZTz24xXlw_m0nuw}{127.0.0.1}{127.0.0.1:39352}{upgraded=true, xpack.installed=true, testattr=test}, targetNode={node-1}{7iLNddDJSOOR1K39-oLSdQ}{SwgTxJ6FSzyhjJ3BbrBbxg}{127.0.0.1}{127.0.0.1:41039}{testattr=test, xpack.installed=true}}]}
|    org.elasticsearch.transport.RemoteTransportException: [node-1][127.0.0.1:41039][internal:cluster/coordination/join]
|    Caused by: java.lang.IllegalStateException: failure when sending a validation request to node
|    	at org.elasticsearch.cluster.coordination.Coordinator$3.onFailure(Coordinator.java:476) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.cluster.coordination.JoinHelper$5.handleException(JoinHelper.java:289) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1118) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.TcpTransport.lambda$handleException$24(TcpTransport.java:1001) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:681) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) ~[?:?]
|    	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) ~[?:?]
|    	at java.lang.Thread.run(Thread.java:834) [?:?]
|    Caused by: org.elasticsearch.transport.RemoteTransportException: [upgraded-node-follower-0][127.0.0.1:39352][internal:cluster/coordination/join/validate]
|    Caused by: java.lang.IllegalArgumentException: No byte size unit found for id [12]
|    	at org.elasticsearch.common.unit.ByteSizeUnit.fromId(ByteSizeUnit.java:264) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.unit.ByteSizeUnit.readFrom(ByteSizeUnit.java:302) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.unit.ByteSizeValue.<init>(ByteSizeValue.java:52) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.xpack.ccr.action.ShardFollowTask.readFrom(ShardFollowTask.java:104) ~[?:?]
|    	at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:46) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:39) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.persistent.PersistentTasksCustomMetaData$PersistentTask.<init>(PersistentTasksCustomMetaData.java:351) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.io.stream.StreamInput.readMap(StreamInput.java:497) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.persistent.PersistentTasksCustomMetaData.<init>(PersistentTasksCustomMetaData.java:536) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:46) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:39) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.cluster.metadata.MetaData.readFrom(MetaData.java:913) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.cluster.ClusterState.readFrom(ClusterState.java:798) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.discovery.zen.MembershipAction$ValidateJoinRequest.readFrom(MembershipAction.java:174) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.common.io.stream.Streamable.lambda$newWriteableReader$0(Streamable.java:51) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.RequestHandlerRegistry.newRequest(RequestHandlerRegistry.java:56) ~[elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.TcpTransport.handleRequest(TcpTransport.java:1032) [elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.TcpTransport.messageReceived(TcpTransport.java:922) [elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.TcpTransport.inboundMessage(TcpTransport.java:753) [elasticsearch-8.0.0.jar:8.0.0]
|    	at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:53) [transport-netty4-client-8.0.0.jar:8.0.0]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:323) [netty-codec-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:297) [netty-codec-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:241) [netty-handler-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1434) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:965) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:656) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:556) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:510) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:470) [netty-transport-4.1.32.Final.jar:4.1.32.Final]
|    	at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:909) [netty-common-4.1.32.Final.jar:4.1.32.Final]
|    	at java.lang.Thread.run(Thread.java:834) ~[?:?]

For oneThirdUpgradedTestCluster I never see a node-1 even attempt to build, So I am assuming it is trying to connected to the old cluster. From what I can tell both node-1 and the new node are "followers".

Node-1 logs have numerous validation failures coming in.

{"type": "server", "timestamp": "2019-02-21T11:14:05,264+0000", "level": "WARN", "component": "o.e.c.c.Coordinator", "cluster.name": "follower", "node.name": "node-1", "cluster.uuid": "eiHra7ZjTjaQb0VqGeFd5w", "node.id": "sVwc7t_jRLuANxZMCGQDig",  "message": "failed to validate incoming join request from node [{upgraded-node-follower-0}{CQyeIV-NTbOQr-4Xno4nRw}{KfuZ7VnbTe2wNld7X1ah0Q}{127.0.0.1}{127.0.0.1:46360}{testattr=test, upgraded=true, xpack.installed=true}]"

Full stack trace

stacktrace: [org.elasticsearch.transport.RemoteTransportException: [upgraded-node-follower-0][127.0.0.1:46360][internal:cluster/coordination/join/validate],
Caused by: java.lang.IllegalArgumentException: No byte size unit found for id [12],
at org.elasticsearch.common.unit.ByteSizeUnit.fromId(ByteSizeUnit.java:264) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.unit.ByteSizeUnit.readFrom(ByteSizeUnit.java:302) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.unit.ByteSizeValue.<init>(ByteSizeValue.java:52) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.xpack.ccr.action.ShardFollowTask.readFrom(ShardFollowTask.java:104) ~[?:?],
at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:46) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:39) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.persistent.PersistentTasksCustomMetaData$PersistentTask.<init>(PersistentTasksCustomMetaData.java:351) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.io.stream.StreamInput.readMap(StreamInput.java:497) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.persistent.PersistentTasksCustomMetaData.<init>(PersistentTasksCustomMetaData.java:536) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:46) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput.readNamedWriteable(NamedWriteableAwareStreamInput.java:39) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.cluster.metadata.MetaData.readFrom(MetaData.java:913) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.cluster.ClusterState.readFrom(ClusterState.java:798) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.discovery.zen.MembershipAction$ValidateJoinRequest.readFrom(MembershipAction.java:174) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.common.io.stream.Streamable.lambda$newWriteableReader$0(Streamable.java:51) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.transport.RequestHandlerRegistry.newRequest(RequestHandlerRegistry.java:56) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.transport.TcpTransport.handleRequest(TcpTransport.java:1032) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.transport.TcpTransport.messageReceived(TcpTransport.java:922) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.transport.TcpTransport.inboundMessage(TcpTransport.java:753) ~[elasticsearch-7.1.0-SNAPSHOT.jar:7.1.0-SNAPSHOT],
at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:53) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?],
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:323) ~[?:?],
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:297) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?],
at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:241) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?],
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1434) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?],
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?],
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:965) ~[?:?],
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) ~[?:?],
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:656) ~[?:?],
at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:556) ~[?:?],
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:510) ~[?:?],
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:470) ~[?:?],
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:909) ~[?:?],
at java.lang.Thread.run(Thread.java:834) [?:?]]

Previous to these validation failures apparently node-0 left the cluster (not sure if normal or not)

{"type": "server", "timestamp": "2019-02-21T11:13:46,771+0000", "level": "INFO", "component": "o.e.c.s.MasterService", "cluster.name": "follower", "node.name": "node-1", "cluster.uuid": "eiHra7ZjTjaQb0VqGeFd5w", "node.id": "sVwc7t_jRLuANxZMCGQDig",  "message": "node-left[{node-0}{CQyeIV-NTbOQr-4Xno4nRw}{xILDEmPpQs2RsY8vr9CRjw}{127.0.0.1}{127.0.0.1:32782}{testattr=test, xpack.installed=true} disconnected], term: 1, version: 25, reason: removed {{node-0}{CQyeIV-NTbOQr-4Xno4nRw}{xILDEmPpQs2RsY8vr9CRjw}{127.0.0.1}{127.0.0.1:32782}{testattr=test, xpack.installed=true},}"  }

Metadata

Metadata

Assignees

No one assigned

    Labels

    :Distributed Indexing/CCRIssues around the Cross Cluster State Replication features>test-failureTriaged test failures from CI

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions