【Android 内存优化】 native内存泄漏监控方案源码分析

文章目录

前言
使用效果
使用api
JNI的动态注册
- native方法
- 动态注册
hook的实现
- android_dlopen_ext和dl_iterate_phdr
native监控的实现
- nativeGetLeakAllocs
总结

前言

Android的native泄漏怎么检测？下面通过研究开源项目KOOM来一探究竟。

使用效果

未触发泄漏前的日志：

0 bytes in 0 allocations unreachable out of 13587448 bytes in 36227 allocations

特意触发泄漏，再次触发：

sweeping done
folding related leaks
folding done
unreachable memory detection done
208233696 bytes in 268 allocations unreachable out of 306475224 bytes in 59154 allocations
LeakRecordMap size: 0

可以看到，泄漏的检测精准到了字节级别。

KOOM文档里面写着，native的实现是使用了Google官方给出的libmemunreachable 动态库。

参考地址：https://android.googlesource.com/platform/system/memory/libmemunreachable/+/master/README.md

使用api

 // Typical call sequence: start the monitor, ask it to check for leaks,
 // then stop it. One call per line.
 LeakMonitor.INSTANCE.start()
 LeakMonitor.INSTANCE.checkLeaks()
 LeakMonitor.INSTANCE.stop()

简单几行代码即可完成，但是别忘了，我们的目的是研究它是怎么实现的。

来看代码。

JNI的动态注册

native方法的定义，简单看方法名可以大概知道，有安装监控器、卸载监控器、设置阈值和获取存储泄漏数据的容器。
但是一眼看不出来nativeGetAllocIndex的作用。

native方法

  /**
   * Native entry point registered under the JNI signature
   * `([Ljava/lang/String;[Ljava/lang/String;Z)Z`.
   *
   * @param selectedList names passed to the native side as the "selected" list
   * @param ignoreList names passed to the native side as the "ignore" list
   * @param enableLocalSymbolic flag forwarded to the native side
   * @return the Boolean result reported by the native implementation
   */
  @JvmStatic
  private external fun nativeInstallMonitor(
    selectedList: Array<String>,
    ignoreList: Array<String>,
    enableLocalSymbolic: Boolean
  ): Boolean

  /** Native entry point registered under the JNI signature `()V`. */
  @JvmStatic
  private external fun nativeUninstallMonitor()

  /** Native entry point registered under the JNI signature `(I)V`; passes [size] to native code. */
  @JvmStatic
  private external fun nativeSetMonitorThreshold(size: Int)

  /** Native entry point registered under the JNI signature `()J`. */
  @JvmStatic
  private external fun nativeGetAllocIndex(): Long

  /** Native entry point registered under the JNI signature `(Ljava/util/Map;)V`. */
  @JvmStatic
  private external fun nativeGetLeakAllocs(leakRecordMap: Map<String, LeakRecord>)

来看看native层的实现：

动态注册

jni_leak_monitor.cpp

// JNI registration table: each entry is {Java method name, JNI type signature,
// pointer to the C++ function that implements it}.
static const JNINativeMethod kLeakMonitorMethods[] = {
    {"nativeInstallMonitor", "([Ljava/lang/String;[Ljava/lang/String;Z)Z",
     reinterpret_cast<void *>(InstallMonitor)},
    {"nativeUninstallMonitor", "()V",
     reinterpret_cast<void *>(UninstallMonitor)},
    {"nativeSetMonitorThreshold", "(I)V",
     reinterpret_cast<void *>(SetMonitorThreshold)},
    {"nativeGetAllocIndex", "()J", reinterpret_cast<void *>(GetAllocIndex)},
    {"nativeGetLeakAllocs", "(Ljava/util/Map;)V",
     reinterpret_cast<void *>(GetLeakAllocs)}};

// Library load hook: obtains a JNIEnv, looks up the class named by
// kLeakMonitorFullyName and registers kLeakMonitorMethods on it.
// Returns JNI_VERSION_1_4 on success and JNI_ERR if GetEnv or RegisterNatives
// fails.
extern "C" JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *reserved) {
  JNIEnv *env = nullptr;
  if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_4) != JNI_OK) {
    ALOGE("GetEnv Fail!");
    return JNI_ERR;
  }

  jclass leak_monitor;
  FIND_CLASS(leak_monitor, kLeakMonitorFullyName);

// The macro must sit on its own line; otherwise the RegisterNatives call below
// becomes part of the macro body and is never executed.
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
  if (env->RegisterNatives(leak_monitor, kLeakMonitorMethods,
                           NELEM(kLeakMonitorMethods)) != JNI_OK) {
    ALOGE("RegisterNatives Fail!");
    return JNI_ERR;
  }
  return JNI_VERSION_1_4;
}

这里进行了JNI函数的动态注册，至于动态注册的优点，就是可以提高一点代码性能，无需通过重复查表来链接对应的native方法。

JNINativeMethod 的结构如下：

    // 方法1{"methodName1", "methodSignature1", (void*) methodPointer1},// 方法2{"methodName2", "methodSignature2", (void*) methodPointer2},// ... };

我们找到了对应的native方法，先来看看InstallMonitor方法的实现：

jni_leak_monitor.cpp

static bool InstallMonitor(JNIEnv *env, jclass clz, jobjectArray selected_array,jobjectArray ignore_array,jboolean enable_local_symbolic) {jclass leak_record;FIND_CLASS(leak_record, kLeakRecordFullyName);g_leak_record.global_ref =reinterpret_cast<jclass>(env->NewGlobalRef(leak_record));if (!CheckedClean(env, g_leak_record.global_ref)) {return false;}GET_METHOD_ID(g_leak_record.construct_method, leak_record, "<init>","(JILjava/lang/String;[Lcom/kwai/koom/nativeoom/leakmonitor/""FrameInfo;)V");jclass frame_info;FIND_CLASS(frame_info, kFrameInfoFullyName);g_frame_info.global_ref =reinterpret_cast<jclass>(env->NewGlobalRef(frame_info));if (!CheckedClean(env, g_frame_info.global_ref)) {return false;}GET_METHOD_ID(g_frame_info.construct_method, frame_info, "<init>","(JLjava/lang/String;)V");g_enable_local_symbolic = enable_local_symbolic;auto array_to_vector =[](JNIEnv *env, jobjectArray jobject_array) -> std::vector<std::string> {std::vector<std::string> ret;int length = env->GetArrayLength(jobject_array);if (length <= 0) {return ret;}for (jsize i = 0; i < length; i++) {auto str = reinterpret_cast<jstring>(env->GetObjectArrayElement(jobject_array, i));const char *data = env->GetStringUTFChars(str, nullptr);ret.emplace_back(data);env->ReleaseStringUTFChars(str, data);}return std::move(ret);};std::vector<std::string> selected_so = array_to_vector(env, selected_array);std::vector<std::string> ignore_so = array_to_vector(env, ignore_array);return CheckedClean(env, LeakMonitor::GetInstance().Install(&selected_so, &ignore_so));
}

这里代码比较长，我们只需获取重点信息。

首先定义了leak_record和frame_info两个jclass变量，通过FIND_CLASS宏查找对应的Java类，并用NewGlobalRef创建全局引用。

使用GET_METHOD_ID宏获取leak_record和frame_info类的构造方法ID，用于后续实例化对象。

定义了一个lambda函数array_to_vector，该函数接受一个JNIEnv指针和一个jobjectArray对象作为参数，将其转换为C++的字符串向量。

接着把向量传递给LeakMonitor的Install方法：

hook的实现

leak_monitor.cpp

bool LeakMonitor::Install(std::vector<std::string> *selected_list,std::vector<std::string> *ignore_list) {KCHECK(!has_install_monitor_);// Reinstall can't hook againif (has_install_monitor_) {return true;}memory_analyzer_ = std::make_unique<MemoryAnalyzer>();if (!memory_analyzer_->IsValid()) {ALOGE("memory_analyzer_ NOT Valid");return false;}std::vector<const std::string> register_pattern = {"^/data/.*\\.so$"};std::vector<const std::string> ignore_pattern = {".*/libkoom-native.so$",".*/libxhook_lib.so$"};if (ignore_list != nullptr) {for (std::string &item : *ignore_list) {ignore_pattern.push_back(".*/" + item + ".so$");}}if (selected_list != nullptr && !selected_list->empty()) {// only hook the so in selected listregister_pattern.clear();for (std::string &item : *selected_list) {register_pattern.push_back("^/data/.*/" + item + ".so$");}}std::vector<std::pair<const std::string, void *const>> hook_entries = {std::make_pair("malloc", reinterpret_cast<void *>(WRAP(malloc))),std::make_pair("realloc", reinterpret_cast<void *>(WRAP(realloc))),std::make_pair("calloc", reinterpret_cast<void *>(WRAP(calloc))),std::make_pair("memalign", reinterpret_cast<void *>(WRAP(memalign))),std::make_pair("posix_memalign",reinterpret_cast<void *>(WRAP(posix_memalign))),std::make_pair("free", reinterpret_cast<void *>(WRAP(free)))};if (HookHelper::HookMethods(register_pattern, ignore_pattern, hook_entries)) {has_install_monitor_ = true;return true;}HookHelper::UnHookMethods();live_alloc_records_.Clear();memory_analyzer_.reset(nullptr);ALOGE("%s Fail", __FUNCTION__);return false;
}

这个函数前面通过正则表达式定义了需要hook和不需要hook的动态库。

后续就是把需要hook的系统内存管理函数put到容器里面，接着传给HookHelper去实现hook。

hook_helper.cpp

bool HookHelper::HookMethods(std::vector<const std::string> &register_pattern,std::vector<const std::string> &ignore_pattern,std::vector<std::pair<const std::string, void *const>> &methods) {if (register_pattern.empty() || methods.empty()) {ALOGE("Hook nothing");return false;}register_pattern_ = std::move(register_pattern);ignore_pattern_ = std::move(ignore_pattern);methods_ = std::move(methods);DlopenCb::GetInstance().AddCallback(Callback);return HookImpl();
}void HookHelper::Callback(std::set<std::string> &, int, std::string &) {HookImpl();
}

做了一些数据准备工作，顺便加了个回调，便于后续的hook操作。
来看下HookImpl


bool HookHelper::HookImpl() {pthread_mutex_lock(&DlopenCb::hook_mutex);xhook_clear();for (auto &pattern : register_pattern_) {for (auto &method : methods_) {if (xhook_register(pattern.c_str(), method.first.c_str(), method.second,nullptr) != EXIT_SUCCESS) {ALOGE("xhook_register pattern %s method %s fail", pattern.c_str(),method.first.c_str());pthread_mutex_unlock(&DlopenCb::hook_mutex);return false;}}}for (auto &pattern : ignore_pattern_) {for (auto &method : methods_) {if (xhook_ignore(pattern.c_str(), method.first.c_str()) != EXIT_SUCCESS) {ALOGE("xhook_ignore pattern %s method %s fail", pattern.c_str(),method.first.c_str());pthread_mutex_unlock(&DlopenCb::hook_mutex);return false;}}}int ret = xhook_refresh(0);pthread_mutex_unlock(&DlopenCb::hook_mutex);return ret == 0;
}

这里就是hook的调用了，使用了爱奇艺的开源框架xhook。

看下实现类：
dlopencb.cpp

// dl_iterate_phdr callback.
//
// `data` must point to a std::pair of two std::set<std::string> pointers:
// first = libraries already seen, second = libraries newly discovered in this
// pass. A library name is added to both sets when it is non-null, accepted by
// hookDlopen() and not yet present in the first set.
// Always returns 0, so the iteration never stops early.
int Callback(struct dl_phdr_info *info, size_t size, void *data) {
  using LibSet = std::set<std::string>;
  auto *sets = static_cast<std::pair<LibSet *, LibSet *> *>(data);
  LibSet *known_libs = sets->first;
  LibSet *new_libs = sets->second;

  const char *lib_name = info->dlpi_name;
  if (lib_name == nullptr || !hookDlopen(lib_name)) {
    return 0;
  }

  const bool first_time_seen = known_libs->insert(lib_name).second;
  if (first_time_seen) {
    new_libs->insert(lib_name);
  }
  return 0;
}

Callback 函数是一个回调函数，它用于迭代动态链接器的程序头部信息。它的功能如下：

接受三个参数：struct dl_phdr_info* info，size_t size，void* data。
将 data 转换为 std::pair<std::set<std::string> *, std::set<std::string> *> 类型的指针。
从 pair 中获取 origin（原始共享库集合）和 add（新增共享库集合）。
判断动态链接库的名称是否非空，并且是否需要 hookDlopen。如果是，则将其添加到 origin 集合，并且添加到 add 集合中。

dlopencb.cpp

void DlopenCb::Refresh(int source, std::string &loadLibName) {
//一开始输出日志，表示刷新操作开始。XH_LOG_INFO("Refresh start %d", source);//接着创建一个空的 addLibs 集合，用于存储新增的共享库。std::set<std::string> addLibs;pthread_mutex_lock(&add_lib_mutex);//获取 hooked_libs 和 addLibs 的指针对，并调用 dl_iterate_phdr 函数进行迭代，每次迭代调用 Callback 函数。auto callbackData =make_pair(&hooked_libs, &addLibs);dl_iterate_phdr(Callback, &callbackData);pthread_mutex_unlock(&add_lib_mutex);//如果 addLibs 集合不为空，则对 hook_mutex 进行加锁，清除现有的 xhook 钩子，并根据新增的共享库重新注册钩子。if (!addLibs.empty()) {pthread_mutex_lock(&hook_mutex);xhook_clear();//根据调试模式进行设置。if (is_debug) {xhook_enable_sigsegv_protection(0);xhook_enable_debug(1);} else {xhook_enable_sigsegv_protection(1);}for (const auto &lib : addLibs) {auto lib_ctr = lib.c_str();xhook_register(lib_ctr, "android_dlopen_ext", (void *) (HookDlopenExt), nullptr);
//      xhook_register(lib_ctr, "dlopen", (void *) (HookDlopen), nullptr);
//输出日志，表示新增的共享库已添加。XH_LOG_INFO("Refresh new lib added %s", lib_ctr);}//刷新 xhook 钩子。xhook_refresh(0);pthread_mutex_unlock(&hook_mutex);// notifyXH_LOG_INFO("Refresh hooked");pthread_mutex_lock(&callback_mutex);//对回调函数进行通知，传递新增的共享库信息。for (auto &callback:callbacks) {callback(addLibs, source, loadLibName);}pthread_mutex_unlock(&callback_mutex);} else {//如果 addLibs 集合为空，则输出日志，表示没有发现新增的共享库。XH_LOG_INFO("Refresh no lib found");}
}

文件前面还有这样一段条件编译的函数定义：

// Keeps the build from failing on old API levels; the feature cannot actually
// be enabled below API 21.
#if __ANDROID_API__ < 21
// Fallback definition: ignores all arguments and always returns 0 (a null
// handle).
void* android_dlopen_ext(const char* __filename, int __flags, const android_dlextinfo* __info) {return 0;
}
// Fallback definition: never invokes __callback and always returns 0.
int dl_iterate_phdr(int (*__callback)(struct dl_phdr_info*, size_t, void*), void* __data) {return 0;
}
#endif

这段代码是一个条件编译，它检查了当前的 Android API 版本是否低于 21。如果当前的 Android API 版本低于 21，则定义了两个函数 android_dlopen_ext 和 dl_iterate_phdr，但这两个函数的实现只是简单地返回了 0，所以在 API 21 以下的版本中这两个函数不支持。

至于怎么理解这两个函数，下面分别来看。

android_dlopen_ext和dl_iterate_phdr


void* android_dlopen_ext(const char* __filename, int __flags, const android_dlextinfo* __info) :

这是一个函数声明，它声明了一个名为 android_dlopen_ext 的函数，该函数用于以扩展方式动态加载共享库（动态链接库）。

参数说明如下：

__filename：这是一个指向要加载的共享库文件名的 C 字符串。
__flags：这是一个整数，用于指定加载共享库的选项标志。
__info：这是一个指向 android_dlextinfo 结构体的指针，该结构体用于传递扩展加载选项的详细信息。如果不需要传递额外信息，可以传入 nullptr。

函数返回一个 void* 类型的指针，该指针通常用于表示加载的共享库的句柄或者标识符。在上面针对 API 21 以下的兼容实现中，函数总是返回 0，表示加载失败或出错。

android_dlopen_ext 函数的具体实现通常由 Android 系统提供，它是 Android 平台上 dlopen 函数的一个扩展版本，用于支持更多的加载选项和功能。

int dl_iterate_phdr(int (*__callback)(struct dl_phdr_info*, size_t, void*), void* __data)

这是一个函数声明，它声明了一个名为 dl_iterate_phdr 的函数，该函数是用来迭代动态链接器的程序头部信息的。这个函数通常在操作系统中用于获取运行时链接器（Runtime Linker）加载的动态链接库的信息。

参数说明如下：

__callback: 这是一个函数指针，指向一个函数，该函数用于处理迭代过程中获取的动态链接库的信息。它接受三个参数：
- struct dl_phdr_info*: 这是一个结构体指针，用于存储动态链接库的程序头部信息。
- size_t: 这是一个表示结构体的大小的参数。
- void*: 这是一个指向用户自定义数据的指针，可以在回调函数中使用。
__data: 这是一个指向用户自定义数据的指针，会传递给回调函数 __callback。

native监控的实现

我们终于来到了重点部分。

// WRAP(x) yields the name of the replacement for function x,
// e.g. WRAP(malloc) -> mallocMonitor.
#define WRAP(x) x##Monitor
// HOOK(ret_type, function, params...) opens the definition of that
// replacement, e.g.
//   HOOK(void *, malloc, size_t size) { ... }
// expands to
//   static ALWAYS_INLINE void *mallocMonitor(size_t size) { ... }
// The backslash must be the last character on its line to act as a line
// continuation.
#define HOOK(ret_type, function, ...) \
  static ALWAYS_INLINE ret_type WRAP(function)(__VA_ARGS__)

这里通过宏定义了HOOK函数把前面的系统内存分配函数进行了hook，这里：

  // Hook table: each pair maps an allocator symbol name to the WRAP()-named
  // replacement function, stored as an untyped pointer.
  std::vector<std::pair<const std::string, void *const>> hook_entries = {std::make_pair("malloc", reinterpret_cast<void *>(WRAP(malloc))),std::make_pair("realloc", reinterpret_cast<void *>(WRAP(realloc))),std::make_pair("calloc", reinterpret_cast<void *>(WRAP(calloc))),std::make_pair("memalign", reinterpret_cast<void *>(WRAP(memalign))),std::make_pair("posix_memalign",reinterpret_cast<void *>(WRAP(posix_memalign))),std::make_pair("free", reinterpret_cast<void *>(WRAP(free)))};

先看下hook了malloc的实现代码：

// Replacement for malloc: performs the real allocation, reports the resulting
// address and requested size to LeakMonitor::OnMonitor(), applies
// CLEAR_MEMORY to the block and returns it to the caller.
HOOK(void *, malloc, size_t size) {
  auto result = malloc(size);
  // OnMonitor() takes a uintptr_t, so convert to the unsigned pointer-sized
  // type directly instead of going through intptr_t.
  LeakMonitor::GetInstance().OnMonitor(reinterpret_cast<uintptr_t>(result),
                                       size);
  CLEAR_MEMORY(result, size);
  return result;
}

执行了OnMonitor函数：

// Allocation notification entry point.
//
// Records the allocation through RegisterAlloc() only when the monitor is
// installed, the address is non-zero and the size is at least the current
// threshold (read with relaxed ordering); otherwise does nothing.
ALWAYS_INLINE void LeakMonitor::OnMonitor(uintptr_t address, size_t size) {
  if (!has_install_monitor_) {
    return;
  }
  if (!address) {
    return;
  }
  if (size < alloc_threshold_.load(std::memory_order_relaxed)) {
    return;
  }
  RegisterAlloc(address, size);
}

这里判断了一下阈值，假如达到阈值则执行RegisterAlloc：

// Creates an AllocRecord for the block at `address` and stores it in
// live_alloc_records_.
//
// The record holds the CONFUSE()-transformed address, the size, a running
// allocation index, the calling thread's name and a backtrace captured with
// StackTrace::FastUnwind(). Calls with a zero address or zero size are
// ignored.
ALWAYS_INLINE void LeakMonitor::RegisterAlloc(uintptr_t address, size_t size) {
  if (!address || !size) {
    return;
  }

  thread_local ThreadInfo thread_info;

  auto record = std::make_shared<AllocRecord>();
  record->address = CONFUSE(address);
  record->size = size;
  record->index = alloc_index_++;
  memcpy(record->thread_name, thread_info.name, kMaxThreadNameLen);

  // Capture the call stack straight into the record.
  record->num_backtraces =
      StackTrace::FastUnwind(record->backtrace, kMaxBacktraceSize);

  live_alloc_records_.Put(CONFUSE(address), std::move(record));
}

看下FastUnwind方法：

// Walks the chain of frame records starting at the current frame and writes
// the adjusted return address of each frame into `buf`.
//
// buf:         output array; at most num_entries slots are written.
// num_entries: capacity of buf.
// Returns the number of frames visited, capped at kMaxBacktraceSize. This can
// exceed num_entries, in which case the extra frames are counted but not
// stored.
KWAI_EXPORT size_t StackTrace::FastUnwind(uintptr_t *buf, size_t num_entries) {
  pthread_once(&once_control_tls, fast_unwind_init);

  auto current = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
  auto stack_limit = get_thread_stack_top();

  // When running on an alternate signal stack, bound the walk by that stack.
  stack_t alt_stack;
  if (sigaltstack(nullptr, &alt_stack) == 0 &&
      (alt_stack.ss_flags & SS_ONSTACK)) {
    stack_limit =
        reinterpret_cast<uintptr_t>(alt_stack.ss_sp) + alt_stack.ss_size;
  }

  size_t depth = 0;
  while (depth < kMaxBacktraceSize) {
    auto *record = reinterpret_cast<frame_record *>(current);
    if (depth < num_entries) {
      buf[depth] = GetAdjustPC(record->return_addr);
    }
    ++depth;

    // Follow the link only if it moves past the current record, stays below
    // the stack limit and is pointer-aligned.
    const auto next = record->next_frame;
    const bool moves_forward = next >= current + sizeof(frame_record);
    const bool inside_stack = next < stack_limit;
    const bool aligned = next % sizeof(void *) == 0;
    if (!(moves_forward && inside_stack && aligned)) {
      break;
    }
    current = next;
  }
  return depth;
}

FastUnwind从当前栈帧出发，沿着栈帧链逐帧回溯，把每一帧的返回地址写入buf，并返回回溯到的帧数。结合前面的方法可以知道，就是获取调用栈的信息，然后put到live_alloc_records_里面。

nativeGetLeakAllocs

分析到这里，我们回头看下外面调用的nativeGetLeakAllocs方法，它最终会执行到下面这段代码：

std::vector<std::shared_ptr<AllocRecord>> LeakMonitor::GetLeakAllocs() {KCHECK(has_install_monitor_);auto unreachable_allocs = memory_analyzer_->CollectUnreachableMem();std::vector<std::shared_ptr<AllocRecord>> live_allocs;std::vector<std::shared_ptr<AllocRecord>> leak_allocs;// Collect live memory blocksauto collect_func = [&](std::shared_ptr<AllocRecord> &alloc_info) -> void {live_allocs.push_back(alloc_info);};live_alloc_records_.Dump(collect_func);auto is_leak = [&](decltype(unreachable_allocs)::value_type &unreachable,std::shared_ptr<AllocRecord> &live) -> bool {auto live_start = CONFUSE(live->address);auto live_end = live_start + live->size;auto unreachable_start = unreachable.first;auto unreachable_end = unreachable_start + unreachable.second;// TODO whyreturn live_start == unreachable_start ||live_start >= unreachable_start && live_end <= unreachable_end;};// Check leak allocation (unreachable && not free)for (auto &live : live_allocs) {for (auto &unreachable : unreachable_allocs) {if (is_leak(unreachable, live)) {leak_allocs.push_back(live);// Just remove leak allocation(never be free)// live->address has been confused, we need to revert it firstUnregisterAlloc(CONFUSE(live->address));}}}return leak_allocs;
}