天天看點

容易誤解的NPE問題描述問題排查總結

問題描述

騎手交易風控金額審批通過後,結算失敗,風控的同學檢視堆棧後發現有兩類異常,一類是正常的空指針,一類是看不懂的空指針。正常的空指針異常就不再列舉了,直接來看下面這個不好了解的NPE

content:  9672833181742924 [DubboServerHandler-localhost:port-thread-199] com.....riskcontrol.trade.unit.dubbo.client.RiderTradeClient:addBalanceLogForAbnormalBack:46 --- riderTradeCostSettleOutmodedProvider#addBalanceLogForAbnormalBack異常
java.lang.NullPointerException: null
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) ~[?:1.8.0_201]
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) ~[?:1.8.0_201]
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) ~[?:1.8.0_201]
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423) ~[?:1.8.0_201]
	at com.alibaba.com.caucho.hessian.io.JavaDeserializer.instantiate(JavaDeserializer.java:305) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.JavaDeserializer.readObject(JavaDeserializer.java:194) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.SerializerFactory.readObject(SerializerFactory.java:524) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObjectInstance(Hessian2Input.java:2743) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObject(Hessian2Input.java:2683) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObject(Hessian2Input.java:2657) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.common.serialize.support.hessian.Hessian2ObjectInput.readObject(Hessian2ObjectInput.java:76) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DecodeableRpcResult.decode(DecodeableRpcResult.java:92) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DecodeableRpcResult.decode(DecodeableRpcResult.java:109) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DubboCodec.decodeBody(DubboCodec.java:90) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.exchange.codec.ExchangeCodec.decode(ExchangeCodec.java:119) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.exchange.codec.ExchangeCodec.decode(ExchangeCodec.java:80) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DubboCountCodec.decode(DubboCountCodec.java:46) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.transport.netty.NettyCodecAdapter$InternalDecoder.messageReceived(NettyCodecAdapter.java:134) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:80) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:274) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:261) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:349) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:44) ~[netty-3.2.5.Final.jar!/:?]
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_201]
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_201]
	at java.lang.Thread.run(Thread.java:748) [?:1.8.0_201]
           

本地方法NPE?不能夠啊!!!

問題排查

根據調用方提供的資訊,檢視服務提供者的日志,報錯日志如下:

content:  [DubboServerHandler-localhost:port-thread-494] 
com.....common.exception.filter.dubbo.BusinessExceptionFilter:invoke:90 
Got unchecked and undeclared exception which called by remoteHost. 
service: com.....billing.settle.provider.outmoded.RiderTradeCostSettleOutmodedProvider, 
method: addBalanceLogForAbnormalBack, args:[23,"607623443302600704",6286.77], 
exception: java.lang.NullPointerException: null ===> java.lang.NullPointerException
           

可以看到該NPE異常是沒有堆棧的,也就是說由于大量的NPE觸發了JVM的堆棧優化,不再輸出堆棧,該優化參數:OmitStackTraceInFastThrow。相關源碼如下:

// src/share/vm/opto/graphKit.cpp:515
if (treat_throw_as_hot
  && (!StackTraceInThrowable || OmitStackTraceInFastThrow)) {
// If the throw is local, we use a pre-existing instance and
// punt on the backtrace.  This would lead to a missing backtrace
// (a repeat of 4292742) if the backtrace object is ever asked
// for its backtrace.
// Fixing this remaining case of 4292742 requires some flavor of
// escape analysis.  Leave that for the future.
ciInstance* ex_obj = NULL;
switch (reason) {
case Deoptimization::Reason_null_check:
  ex_obj = env()->NullPointerException_instance();
  break;
case Deoptimization::Reason_div0_check:
  ex_obj = env()->ArithmeticException_instance();
  break;
case Deoptimization::Reason_range_check:
  ex_obj = env()->ArrayIndexOutOfBoundsException_instance();
  break;
case Deoptimization::Reason_class_check:
  if (java_bc() == Bytecodes::_aastore) {
    ex_obj = env()->ArrayStoreException_instance();
  } else {
    ex_obj = env()->ClassCastException_instance();
  }
  break;
}
if (failing()) { stop(); return; }  // exception allocation might fail
if (ex_obj != NULL) {
  ...
  // Clear the detail message of the preallocated exception object.
  // Weblogic sometimes mutates the detail message of exceptions
  // using reflection.
  ...
}
           

那麼用戶端看到的異常堆棧從何而來?

問題推斷1:dubbo用戶端将用戶端目前執行的堆棧填充至NPE異常

服務端故意抛出無堆棧的NPE,使用dubbo用戶端調用服務端,異常不會被填充,說明不是dubbo内部自動填充的異常。那麼問題在哪?因為公司封裝了很多業務異常,對于dubbo服務抛出的業務異常要進行定制化處理,于是加了業務異常過濾器,那麼是不是該過濾器導緻的呢?

dubbo調用鍊簡單回顧下:

RegistryProtocol引用服務提供者-》ProtocolFilterWrapper将調用者Invoker使用過濾鍊包裝-》調用者DubboInvoker

服務端調用目标方法出現異常,将異常包裝為RpcResult響應用戶端:com.alibaba.dubbo.rpc.protocol.AbstractInvoker#invoke

try {
    return doInvoke(invocation);
} catch (InvocationTargetException e) { // biz exception
    Throwable te = e.getTargetException();
    if (te == null) {
        return new RpcResult(e);
    } else {
        if (te instanceof RpcException) {
            ((RpcException) te).setCode(RpcException.BIZ_EXCEPTION);
        }
        return new RpcResult(te);
    }
} catch (RpcException e) {
    if (e.isBiz()) {
        return new RpcResult(e);
    } else {
        throw e;
    }
} catch (Throwable e) {
    return new RpcResult(e);
}
           

過濾器:服務端調用同樣存在業務異常過濾器,源碼與用戶端類似,過濾器輸出的日志結合源碼可以确認綁定的異常為NPE異常

過濾器擷取調用結果RpcResult進行業務處理,此時我們服務端抛出了NPE異常,也就是RpcResult綁定的異常類型為NPE,檢視我們業務異常處理過濾器代碼如下:

public Result invoke(Invoker<?> invoker, Invocation invocation)
        throws RpcException {
    try {
        ...
        Result result = invoker.invoke(invocation);
		...
        if (result.hasException()
                && GenericService.class != invoker.getInterface()) {
            try {
                Throwable exception = result.getException();
                // 如果是checked異常,直接抛出
                if (!(exception instanceof RuntimeException)
                        && (exception instanceof Exception)) {
                    return result;
                }
                // 在方法簽名上有聲明,直接抛出
                ...
				// 業務異常直接抛出
                if(exception instanceof BusinessException){
                    return result;
                }
                ...
                // 異常類和接口類在同一jar包裡,直接抛出
                ...
                // 是JDK自帶的異常,直接抛出
                String className = exception.getClass().getName();
                if (className.startsWith("java.")
                        || className.startsWith("javax.")) {
                    return result;
                }
                // 是Dubbo本身的異常,直接抛出
                if (exception instanceof RpcException) {
                    return result;
                }
                // 否則,包裝成RuntimeException抛給用戶端
                return new RpcResult(new RuntimeException(
                        StringUtils.toString(exception)));
            } catch (Throwable e) {
                ...
                return result;
            }
        }
        return result;
    } catch (RuntimeException e) {
        ...
        throw e;
    }
}
           

并沒有填充異常堆棧的代碼。那麼是否日志列印輸出時自動填充,同樣适用自己寫的案例嘗試log.error輸出也依然是沒有堆棧,故不是該原因

問題推斷2:hessian序列化時填充的目前堆棧

代碼成功複現,為什麼前面dubbo抛出NPE沒能複現呢?因為我當時本地執行代碼時使用單線程循環調用,下面案例代碼如果不使用線程池調用也是無法觸發JVM的異常堆棧優化,由于沒有觸發JVM的異常堆棧優化,是以就沒能複現咯

import com.alibaba.com.caucho.hessian.io.HessianInput;
import com.alibaba.com.caucho.hessian.io.HessianOutput;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * @author 會灰翔的灰機
 * @date 2020/10/11
 */
public class NpeThief extends Thread {

    private Set<Integer> lens = new HashSet<>();
    private static int count = 0;
    private Throwable throwable;
    @Override
    public void run() {
        try{
            System.out.println(this.getClass().getSimpleName()+"--"+(++count));
            String str = null;
            // 制造空指針NPE
            System.out.println(str.length());
        }catch (Throwable e){
            lens.add(e.getStackTrace().length);
            if (throwable == null && e.getStackTrace().length==0) {
                throwable=e;
            }
        }
    }
    public static void main(String[] args) throws InterruptedException, IOException {
        NpeThief npeThief = new NpeThief();
        ExecutorService executorService = Executors.newFixedThreadPool(20);
        for (int i=0; i<8000; i++) {
            executorService.execute(npeThief);
            // 稍微sleep一下, 是為了不要讓異常抛出太快, 導緻控制台輸出太快, 把有異常棧資訊沖掉, 隻留下fast throw方式抛出的異常
            Thread.sleep(2);
        }
        System.out.println(npeThief.lens);
        System.out.println(npeThief.throwable.getStackTrace().length);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        //Hessian的序列化輸出
        HessianOutput ho = new HessianOutput(os);
        Throwable old = npeThief.throwable;
        ho.writeObject(old);
        old.printStackTrace();

        byte[] data = os.toByteArray();

        ByteArrayInputStream is = new ByteArrayInputStream(data);
        //Hessian的反序列化讀取對象
        HessianInput hi = new HessianInput(is);
        //前面的ho.writeObject(old);必須有,不然報錯HessianProtocolException: unknown code for readObject at end of file
        Throwable throwable = (Throwable) hi.readObject();
        throwable.printStackTrace();
    }
}

           

總結

  1. 異常的堆棧并非服務端的堆棧,因為服務端日志可以看到堆棧已經被JVM優化掉了,堆棧實際是用戶端側當時的堆棧
  2. 問題是hessian反序列化+JVM優化異常堆棧導緻,簡單的處理方式就是關閉JVM的異常堆棧優化功能即可:-XX:-OmitStackTraceInFastThrow