insmod 入口函數
本文用到的 busybox 版本為 1.34.1,Linux 內核版本為 4.14.294
insmod_main()
函數是 insmod 命令的入口函數,該函數首先通過函數參數獲取被加載模塊的名字并存入局部指針變量 filename,然后調用 bb_init_module()
函數進行后續操作。
/*
 * Entry point of the insmod applet.
 *
 * Takes the first remaining argument as the module file name, turns the
 * arguments that follow it into a module option string, and passes both
 * to bb_init_module().  bb_show_usage() is invoked when no file name is
 * given.  A non-zero result from bb_init_module() is reported with
 * bb_error_msg() and is also the value returned to the caller.
 */
int insmod_main(int argc UNUSED_PARAM, char **argv)
{
	int rc;
	char *filename;
	char *module_opts;

	/* Compat note:
	 * 2.6 style insmod has no options and required filename
	 * (not module name - .ko can't be omitted).
	 * 2.4 style insmod can take module name without .o
	 * and performs module search in default directories
	 * or in $MODPATH.
	 */
	IF_FEATURE_2_4_MODULES(
		getopt32(argv, INSMOD_OPTS INSMOD_ARGS);
		argv += optind - 1;
	);

	/* Step past the current entry; the next one must be the module file */
	argv++;
	filename = *argv;
	if (filename == NULL)
		bb_show_usage();

	/* Everything after the file name becomes the module's option string */
	module_opts = parse_cmdline_module_options(argv, /*quote_spaces:*/ 0);
	rc = bb_init_module(filename, module_opts);
	if (rc != 0)
		bb_error_msg("can't insert '%s': %s", filename, moderror(rc));

	return rc;
}
模塊參數解析函數
parse_cmdline_module_options()
函數會解析模塊加載時傳給模塊的參數,通過 while
循環挨個解析模塊后面傳給模塊的參數,并將解析出來的參數值 val
存入指針變量 options
指向的內存空間,最后返回該內存空間的首地址。
/*
 * Build the module option string from the command line.
 *
 * argv points at the entry *before* the first option; every following
 * entry up to the NULL terminator is appended to the result, each one
 * followed by a single space.
 *
 * When quote_spaces is non-zero and an entry has the form var=val with
 * a space somewhere in val, only the value is wrapped in double quotes
 * (var="val with spaces"); the variable name itself is never checked.
 *
 * Returns a heap buffer obtained from xzalloc()/xrealloc(); it is an
 * empty string when there are no options.  The trailing space after the
 * last option is intentionally left in place.
 */
char* FAST_FUNC parse_cmdline_module_options(char **argv, int quote_spaces)
{
	char *buf = xzalloc(1);
	int used = 0;

	for (argv++; *argv != NULL; argv++) {
		const char *arg = *argv;
		/* Points at '=' if present, otherwise at the terminating NUL */
		const char *value = strchrnul(arg, '=');
		const char *format = "%.*s%s ";

		/* Worst case adds two quotes, one space and the NUL to strlen(arg) */
		buf = xrealloc(buf, used + 2 + strlen(arg) + 2);

		/*
		 * modprobe (module-init-tools version 3.11.1) compat:
		 * quote only value:
		 * var="val with spaces", not "var=val with spaces"
		 */
		if (quote_spaces && *value != '\0') {
			/* var=val form: step over '=' so it is printed with the name */
			value++;
			if (strchr(value, ' ') != NULL)
				format = "%.*s\"%s\" ";
		}

		used += sprintf(buf + used, format, (int)(value - arg), arg, value);
	}

	return buf;
}
映射模塊文件
bb_init_module()
函數首先判斷模塊有沒有參數傳入,調用 try_to_mmap_module()
函數完成后續映射工作,該函數接收兩個參數:被加載模塊的名字(filename),模塊的大小(image_size)作為出參參數傳入。最后調用 init_module()
函數,init_module()
函數是系統調用函數,對應的內核函數是 sys_init_module()
函數,進入到內核空間。傳入的參數分別是:模塊內存空間首地址(image),模塊大小(image_size),模塊參數內存空間首地址(options)。
/*
 * Load the module stored in `filename`, passing it `options`
 * (a NULL `options` is treated as an empty string).
 *
 * Order of attempts:
 *   1. bb_init_module_24() when 2.4 module support is enabled and the
 *      running kernel is older than 2.6.0;
 *   2. finit_module() on the opened file, when that syscall is known;
 *   3. init_module() on an in-memory image, obtained either from
 *      try_to_mmap_module() or, failing that, from
 *      xmalloc_open_zipped_read_close().
 *
 * Returns 0 when finit_module() succeeds, -errno when the image cannot
 * be read at all, and otherwise the errno value left behind by
 * init_module() (0 if it did not set one).
 */
int FAST_FUNC bb_init_module(const char *filename, const char *options)
{
	/* Upper bound on the file size accepted for mapping */
	size_t image_size = INT_MAX - 4095;
	char *image;
	bool mmaped;
	int rc;

	if (options == NULL)
		options = "";

//TODO: audit bb_init_module_24 to match error code convention
#if ENABLE_FEATURE_2_4_MODULES
	if (get_linux_version_code() < KERNEL_VERSION(2,6,0))
		return bb_init_module_24(filename, options);
#endif

	/*
	 * First we try finit_module if available. Some kernels are configured
	 * to only allow loading of modules off of secure storage (like a read-
	 * only rootfs) which needs the finit_module call. If it fails, we fall
	 * back to normal module loading to support compressed modules.
	 */
# ifdef __NR_finit_module
	{
		int fd = open(filename, O_RDONLY | O_CLOEXEC);
		if (fd >= 0) {
			rc = (finit_module(fd, options, 0) != 0);
			close(fd);
			if (!rc)
				return 0;
		}
	}
# endif

	image = try_to_mmap_module(filename, &image_size);
	mmaped = (image != NULL);
	if (!mmaped) {
		/* Mapping was not possible: read the file into heap memory instead */
		errno = ENOMEM; /* may be changed by e.g. open errors below */
		image = xmalloc_open_zipped_read_close(filename, &image_size);
		if (image == NULL)
			return -errno;
	}

	/* The result is taken from errno, so clear it before the call */
	errno = 0;
	init_module(image, image_size, options);
	rc = errno;

	/* Release the image the same way it was obtained */
	if (mmaped)
		munmap(image, image_size);
	else
		free(image);

	return rc;
}
try_to_mmap_module()
函數首先打開模塊文件獲取模塊文件描述符 fd
,然后通過 fstat()
函數獲取模塊文件的詳細信息,判斷模塊文件的大小 st_size
是否超過了設定的文件最大值,調用 mmap_read()
函數以只讀的方式將模塊文件的內容映射進內存空間,并返回該內存空間的首地址,通過 *(uint32_t*)image != SWAP_BE32(0x7f454C46)
檢查模塊文件是否符合 ELF 標準格式,最后將內存空間的首地址 image
返回。通過 try_to_mmap_module()
函數我們就獲取了模塊文件內容在內存空間的地址。
/*
 * Try to map a module file into memory instead of reading it.
 *
 * filename:     path of the module file, opened read-only with xopen().
 * image_size_p: on entry, the largest file size the caller accepts;
 *               on success, overwritten with the actual file size.
 *               Left untouched on failure.
 *
 * Returns the address of the mapping, or NULL when the file cannot be
 * stat'ed, is larger than the caller's limit, cannot be mapped, or does
 * not start with the ELF magic number (e.g. a compressed module).
 * The file descriptor is always closed before returning.
 */
void* FAST_FUNC try_to_mmap_module(const char *filename, size_t *image_size_p)
{
	/* We have user reports of failure to load 3MB module
	 * on a 16MB RAM machine. Apparently even a transient
	 * memory spike to 6MB during module load
	 * is too big for that system. */
	void *image;
	struct stat st;
	int fd;

	fd = xopen(filename, O_RDONLY);
	image = NULL;
	/* If fstat() fails, st is not filled in: do not look at st.st_size,
	 * just report "cannot map" so the caller can use its fallback. */
	/* st.st_size is off_t, we can't just pass it to mmap */
	if (fstat(fd, &st) == 0 && st.st_size <= *image_size_p) {
		size_t image_size = st.st_size;
		image = mmap_read(fd, image_size);
		if (image == MAP_FAILED) {
			image = NULL;
		} else if (*(uint32_t*)image != SWAP_BE32(0x7f454C46)) {
			/* No ELF signature. Compressed module? */
			munmap(image, image_size);
			image = NULL;
		} else {
			/* Success. Report the size */
			*image_size_p = image_size;
		}
	}
	close(fd);
	return image;
}
從 init_module()
開始調用關系會進入到 Linux 內核源碼。
init_module()
其實是一個宏定義,最終會調用到 __NR_init_module
系統調用號對應的系統調用函數,即 sys_init_module()
,該對應關系位于 Linux 內核源碼 include/uapi/asm-generic/unistd.h
文件中。關于 Linux 系統調用的知識,后面會專門寫個文章分析 Linux 系統調用的實現機制,并手寫一個內核沒有的系統調用。
/* init_module(...) expands to a raw syscall() invocation using number __NR_init_module */
#define init_module(mod, len, opts) syscall(__NR_init_module, mod, len, opts)
/* System call number used for init_module */
#define __NR_init_module 105
/* Pairs that number with sys_init_module (presumably the syscall table entry) */
__SYSCALL(__NR_init_module, sys_init_module)
而 sys_init_module()
函數是由宏定義 SYSCALL_DEFINE3
展開形成的,該定義位于文件 include/linux/syscalls.h
中
/*
 * Define a system call with three arguments.  The name is prefixed with
 * an underscore and handed to SYSCALL_DEFINEx together with the
 * (type, name) argument list.
 */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

/* Emit SYSCALL_METADATA for the call, followed by its function definitions. */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/*
 * Produce the functions that make up one system call:
 *   sys<name>  - declared with the __SC_DECL parameter list and made an
 *                alias of SyS<name>;
 *   SyS<name>  - takes the __SC_LONG parameter list, calls SYSC<name>
 *                with the arguments run through __SC_CAST, then applies
 *                __SC_TEST and __PROTECT and returns the result;
 *   SYSC<name> - static inline; the macro ends with its header, so the
 *                brace-enclosed body written after the macro invocation
 *                becomes the body of this function.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
SYSCALL_DEFINE3
的實現位于 kernel/module.c
文件中,該函數首先調用 may_init_module()
函數判斷用戶是否有加載模塊的權限,調用 copy_module_from_user()
函數將模塊文件的內容從用戶空間內存地址拷貝到內核空間內存地址,具體實現后面會分析,最后調用 load_module()
函數,細節詳見下面分析。
/*
 * init_module system call: load a module from an image held in user memory.
 *
 * umod:  user-space address of the module image
 * len:   size of that image in bytes
 * uargs: user-space address of the module parameter string
 *
 * Returns the error from may_init_module() or copy_module_from_user()
 * if either of them fails; otherwise the result of load_module().
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
		unsigned long, len, const char __user *, uargs)
{
	int err;
	struct load_info info = { };

	/* Refuse early if the caller may not load modules */
	err = may_init_module();
	if (err)
		return err;

	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
	       umod, len, uargs);

	/* Fill in info with a kernel-side copy of the user image */
	err = copy_module_from_user(umod, len, &info);
	if (err)
		return err;

	return load_module(&info, uargs, 0);
}
copy_module_from_user()
函數首先給 load_info
結構體成員 info->len
賦值為模塊大小 len
,調用 __vmalloc()
函數在內核空間為模塊分配 info->len
大小的內存空間,并返回內核內存空間的起始地址 info->hdr
,最后調用 copy_chunked_from_user()
函數(其實就是 copy_from_user()
函數)將用戶空間內存模塊文件內容拷貝到 info->hdr
所指向的內核空間內存地址。
/*
 * Copy a module image from user memory into a freshly allocated
 * kernel buffer.
 *
 * umod: user-space address of the module image
 * len:  size of the image in bytes
 * info: receives the size in info->len and the kernel copy in info->hdr
 *
 * Returns 0 on success, or:
 *   -ENOEXEC  the image is smaller than the header that info->hdr points to
 *   -ENOMEM   the kernel buffer could not be allocated
 *   -EFAULT   the copy from user memory failed (the buffer is freed)
 *   any error returned by security_kernel_read_file()
 */
static int copy_module_from_user(const void __user *umod, unsigned long len,
				  struct load_info *info)
{
	int err;

	info->len = len;
	/* Too short to even hold the header: cannot be a valid module */
	if (info->len < sizeof(*(info->hdr)))
		return -ENOEXEC;

	/* Give the security layer a chance to reject the load */
	err = security_kernel_read_file(NULL, READING_MODULE);
	if (err)
		return err;

	/* Suck in entire file: we'll want most of it. */
	info->hdr = __vmalloc(info->len,
			GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
	if (!info->hdr)
		return -ENOMEM;

	if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
		vfree(info->hdr);
		return -EFAULT;
	}

	return 0;
}
至此,模塊文件已經從用戶空間拷貝到內核空間。
模塊加載
鑒于模塊加載函數 load_module()
比較復雜,限于篇幅,具體的加載過程會在《Linux內核模塊加載深度剖析(中篇)》一文中分析。
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
long err;
char *after_dashes;
err = module_sig_check(info, flags);
if (err)
goto free_copy;
err = elf_header_check(info);
if (err)
goto free_copy;
/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}
audit_log_kern_module(mod- >name);
/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
goto free_module;
#ifdef CONFIG_MODULE_SIG
mod- >sig_ok = info- >sig_ok;
if (!mod- >sig_ok) {
pr_notice_once("%s: module verification failed: signature "
"and/or required key missing - tainting "
"kernel\\n", mod- >name);
add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
#endif
/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
if (err)
goto unlink_mod;
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
goto unlink_mod;
init_param_lock(mod);
/* Now we've got everything in the final locations, we can
* find optional sections. */
err = find_module_sections(mod, info);
if (err)
goto free_unload;
err = check_module_license_and_versions(mod);
if (err)
goto free_unload;
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, info);
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;
err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;
err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;
flush_module_icache(mod);
/* Now copy in args */
mod- >args = strndup_user(uargs, ~0UL > > 1);
if (IS_ERR(mod- >args)) {
err = PTR_ERR(mod- >args);
goto free_arch_cleanup;
}
dynamic_debug_setup(mod, info- >debug, info- >num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
goto ddebug_cleanup;
err = prepare_coming_module(mod);
if (err)
goto bug_cleanup;
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod- >name, mod- >args, mod- >kp, mod- >num_kp,
-32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto coming_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\\n",
mod- >name, after_dashes);
}
/* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod- >kp, mod- >num_kp);
if (err < 0)
goto coming_cleanup;
if (is_livepatch_module(mod)) {
err = copy_module_elf(mod, info);
if (err < 0)
goto sysfs_cleanup;
}
/* Get rid of temporary copy. */
free_copy(info);
/* Done! */
trace_module_load(mod);
return do_init_module(mod);
sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
mod- >state = MODULE_STATE_GOING;
destroy_params(mod- >kp, mod- >num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
bug_cleanup:
mod- >state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
/* we can't deallocate the module until we clear memory protection */
module_disable_ro(mod);
module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(mod, info- >debug);
synchronize_sched();
kfree(mod- >args);
free_arch_cleanup:
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
unlink_mod:
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod- >list);
mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU-sched synchronizing before releasing mod- >list. */
synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
/*
* Ftrace needs to clean up what it initialized.
* This does nothing if ftrace_module_init() wasn't called,
* but it must be called outside of module_mutex.
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(