Accessing struct rq and reading the rq_clock_task time from a kernel module
1. Background
In an earlier post, CFS及RT调度整体介绍_cfs任务和rt任务-CSDN博客, we introduced one of the central concepts of the kernel scheduler: the run queue, struct rq. Every CPU has its own struct rq that manages that CPU's run queue. Getting hold of a given CPU's rq from a kernel module, however, is not straightforward. Section 2 below shows the full source code for doing so, and Section 3 explains how it works.
Once we have the rq, we can read its clock and clock_task fields. clock_task is what rq_clock_task() returns, and it is the time base the scheduler uses to account each task's execution time. Depending on kernel configuration options, rq_clock_task is not guaranteed to match local_clock() (for example, with interrupt time accounting enabled the time spent in interrupts is subtracted), so using local_clock() inside a kernel module as a stand-in for rq_clock_task is not accurate. To get the precise rq_clock_task value we use the approach described in this post: first obtain the rq, then read its clock_task field. Note that rq_clock_task() is an inline function, so it does not show up in /proc/kallsyms.
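The divergence from local_clock() comes from update_rq_clock_task() in kernel/sched/core.c, which advances clock_task by the elapsed delta minus interrupt (and steal) time when the corresponding options are enabled. Below is a simplified sketch of that logic, not the verbatim kernel code; details vary with kernel version and config options:

/*
 * Simplified sketch of update_rq_clock_task() (kernel/sched/core.c),
 * shown only to illustrate why clock_task can lag local_clock()/rq->clock.
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
    s64 irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
    /* time spent in hardirq/softirq since the last update ... */
    irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
    if (irq_delta > delta)
        irq_delta = delta;
    rq->prev_irq_time += irq_delta;
    /* ... is removed from the task clock */
    delta -= irq_delta;
#endif
    /* CONFIG_PARAVIRT_TIME_ACCOUNTING subtracts steal time in a similar way */

    rq->clock_task += delta;
}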
For details on the kernel's common time interfaces, in particular the clocks used by the scheduler and local_clock(), see the earlier post 与调度相关的内核时间接口的分析及实现介绍-CSDN博客.
Section 2 below gives the kernel-module source code that obtains rq_clock_task; Section 3 walks through the details.
2. Source code for obtaining rq_clock_task
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <asm/apic.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module use rq_clock_task.");
MODULE_VERSION("1.0");#define IODELAY_TRACEPOINT_ENABLEstruct uclamp_bucket {unsigned long value : bits_per(SCHED_CAPACITY_SCALE);unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};struct uclamp_rq {unsigned int value;struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};/* CFS-related fields in a runqueue */
struct cfs_rq {struct load_weight load;unsigned int nr_running;unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */unsigned int idle_nr_running; /* SCHED_IDLE */unsigned int idle_h_nr_running; /* SCHED_IDLE */u64 exec_clock;u64 min_vruntime;
#ifdef CONFIG_SCHED_COREunsigned int forceidle_seq;u64 min_vruntime_fi;
#endif#ifndef CONFIG_64BITu64 min_vruntime_copy;
#endifstruct rb_root_cached tasks_timeline;/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/struct sched_entity *curr;struct sched_entity *next;struct sched_entity *last;struct sched_entity *skip;#ifdef CONFIG_SCHED_DEBUGunsigned int nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg avg;
#ifndef CONFIG_64BITu64 last_update_time_copy;
#endifstruct {raw_spinlock_t lock ____cacheline_aligned;int nr;unsigned long load_avg;unsigned long util_avg;unsigned long runnable_avg;} removed;#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long tg_load_avg_contrib;long propagate;long prop_runnable_sum;/** h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long h_load;u64 last_h_load_update;struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHEDstruct rq *rq; /* CPU runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.* This list is used during load balance.*/int on_list;struct list_head leaf_cfs_rq_list;struct task_group *tg; /* group that "owns" this runqueue *//* Locally cached copy of our task_group's idle value */int idle;#ifdef CONFIG_CFS_BANDWIDTHint runtime_enabled;s64 runtime_remaining;u64 throttled_pelt_idle;
#ifndef CONFIG_64BITu64 throttled_pelt_idle_copy;
#endifu64 throttled_clock;u64 throttled_clock_pelt;u64 throttled_clock_pelt_time;int throttled;int throttle_count;struct list_head throttled_list;
#ifdef CONFIG_SMPstruct list_head throttled_csd_list;
#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};struct rt_prio_array {DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */struct list_head queue[MAX_RT_PRIO];
};/* Real-Time classes' related field in a runqueue: */
struct rt_rq {struct rt_prio_array active;unsigned int rt_nr_running;unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHEDstruct {int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMPint next; /* next highest */
#endif} highest_prio;
#endif
#ifdef CONFIG_SMPunsigned int rt_nr_migratory;unsigned int rt_nr_total;int overloaded;struct plist_head pushable_tasks;#endif /* CONFIG_SMP */int rt_queued;int rt_throttled;u64 rt_time;u64 rt_runtime;/* Nests inside the rq lock: */raw_spinlock_t rt_runtime_lock;#ifdef CONFIG_RT_GROUP_SCHEDunsigned int rt_nr_boosted;struct rq *rq;struct task_group *tg;
#endif
};/* Deadline class' related fields in a runqueue */
struct dl_rq {/* runqueue is an rbtree, ordered by deadline */struct rb_root_cached root;unsigned int dl_nr_running;#ifdef CONFIG_SMP/** Deadline values of the currently executing and the* earliest ready task on this rq. Caching these facilitates* the decision whether or not a ready but not running task* should migrate somewhere else.*/struct {u64 curr;u64 next;} earliest_dl;unsigned int dl_nr_migratory;int overloaded;/** Tasks on this rq that can be pushed away. They are kept in* an rb-tree, ordered by tasks' deadlines, with caching* of the leftmost (earliest deadline) element.*/struct rb_root_cached pushable_dl_tasks_root;
#elsestruct dl_bw dl_bw;
#endif/** "Active utilization" for this runqueue: increased when a* task wakes up (becomes TASK_RUNNING) and decreased when a* task blocks*/u64 running_bw;/** Utilization of the tasks "assigned" to this runqueue (including* the tasks that are in runqueue and the tasks that executed on this* CPU and blocked). Increased when a task moves to this runqueue, and* decreased when the task moves away (migrates, changes scheduling* policy, or terminates).* This is needed to compute the "inactive utilization" for the* runqueue (inactive utilization = this_bw - running_bw).*/u64 this_bw;u64 extra_bw;/** Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM* tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).*/u64 max_bw;/** Inverse of the fraction of CPU utilization that can be reclaimed* by the GRUB algorithm.*/u64 bw_ratio;
};struct rq {/* runqueue lock: */raw_spinlock_t __lock;/** nr_running and cpu_load should be in the same cacheline because* remote CPUs use both these fields when doing load calculation.*/unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCINGunsigned int nr_numa_running;unsigned int nr_preferred_running;unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMPunsigned long last_blocked_load_update_tick;unsigned int has_blocked_load;call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */unsigned int nohz_tick_stopped;atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */#ifdef CONFIG_SMPunsigned int ttwu_pending;
#endifu64 nr_switches;#ifdef CONFIG_UCLAMP_TASK/* Utilization clamp values based on CPU's RUNNABLE tasks */struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;unsigned int uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endifstruct cfs_rq cfs;struct rt_rq rt;struct dl_rq dl;#ifdef CONFIG_FAIR_GROUP_SCHED/* list of leaf cfs_rq on this CPU: */struct list_head leaf_cfs_rq_list;struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED *//** This is part of a global counter where only the total sum* over all CPUs matters. A task can increase this counter on* one CPU and if it got migrated afterwards it may decrease* it on another CPU. Always updated under the runqueue lock:*/unsigned int nr_uninterruptible;struct task_struct __rcu *curr;struct task_struct *idle;struct task_struct *stop;unsigned long next_balance;struct mm_struct *prev_mm;unsigned int clock_update_flags;u64 clock;/* Ensure that all clocks are in the same cache line */u64 clock_task ____cacheline_aligned;u64 clock_pelt;unsigned long lost_idle_time;atomic_t nr_iowait;#ifdef CONFIG_SCHED_DEBUGu64 last_seen_need_resched_ns;int ticks_without_resched;
#endif#ifdef CONFIG_MEMBARRIERint membarrier_state;
#endif#ifdef CONFIG_SMPstruct root_domain *rd;struct sched_domain __rcu *sd;unsigned long cpu_capacity;unsigned long cpu_capacity_orig;struct callback_head *balance_callback;unsigned char nohz_idle_balance;unsigned char idle_balance;unsigned long misfit_task_load;/* For active balancing */int active_balance;int push_cpu;struct cpu_stop_work active_balance_work;/* CPU of this runqueue: */int cpu;int online;struct list_head cfs_tasks;struct sched_avg avg_rt;struct sched_avg avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQstruct sched_avg avg_irq;
#endif
#ifdef CONFIG_SCHED_THERMAL_PRESSUREstruct sched_avg avg_thermal;
#endifu64 idle_stamp;u64 avg_idle;unsigned long wake_stamp;u64 wake_avg_idle;/* This is used to determine avg_idle's max value */u64 max_idle_balance_cost;#ifdef CONFIG_HOTPLUG_CPUstruct rcuwait hotplug_wait;
#endif
#endif /* CONFIG_SMP */#ifdef CONFIG_IRQ_TIME_ACCOUNTINGu64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRTu64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTINGu64 prev_steal_time_rq;
#endif/* calc_load related fields */unsigned long calc_load_update;long calc_load_active;#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMPcall_single_data_t hrtick_csd;
#endifstruct hrtimer hrtick_timer;ktime_t hrtick_time;
#endif#ifdef CONFIG_SCHEDSTATS/* latency stats */struct sched_info rq_sched_info;unsigned long long rq_cpu_time;/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? *//* sys_sched_yield() stats */unsigned int yld_count;/* schedule() stats */unsigned int sched_count;unsigned int sched_goidle;/* try_to_wake_up() stats */unsigned int ttwu_count;unsigned int ttwu_local;
#endif#ifdef CONFIG_CPU_IDLE/* Must be inspected within a rcu lock section */struct cpuidle_state *idle_state;
#endif#ifdef CONFIG_SMPunsigned int nr_pinned;
#endifunsigned int push_busy;struct cpu_stop_work push_work;#ifdef CONFIG_SCHED_CORE/* per rq */struct rq *core;struct task_struct *core_pick;unsigned int core_enabled;unsigned int core_sched_seq;struct rb_root core_tree;/* shared state -- careful with sched_core_cpu_deactivate() */unsigned int core_task_seq;unsigned int core_pick_seq;unsigned long core_cookie;unsigned int core_forceidle_count;unsigned int core_forceidle_seq;unsigned int core_forceidle_occupation;u64 core_forceidle_start;
#endif
};// runqueues (not export symbol)
// runqueues (per-cpu variable, not an exported symbol)
struct rq* _prq = NULL;

/* translate the per-cpu base address of "runqueues" into a given CPU's struct rq* */
struct rq* my_cpu_rq(int i_cpu)
{
    return per_cpu_ptr(_prq, i_cpu);
}

/* callback attached to the sched_stat_runtime tracepoint */
static void cb_sched_stat_runtime(void *i_data, struct task_struct *i_curr,
                                  u64 i_runtime, u64 i_vruntime)
{
    struct rq* prq = my_cpu_rq(smp_processor_id());
    printk("rq_clock_task[%llu],rq_clock[%llu],local_clock[%llu]\n",
           prq->clock_task, prq->clock, local_clock());
}

struct kern_tracepoint {
    void *callback;
    struct tracepoint *ptr;
    bool bregister;
};

static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{
    if (tp->bregister) {
        tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
    }
}

#define INIT_KERN_TRACEPOINT(tracepoint_name) \
    static struct kern_tracepoint mykern_##tracepoint_name = {.callback = NULL, .ptr = NULL, .bregister = false};

#define TRACEPOINT_CHECK_AND_SET(tracepoint_name) \
    static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \
    { \
        if (!strcmp(#tracepoint_name, tp->name)) \
        { \
            ((struct kern_tracepoint *)priv)->ptr = tp; \
            return; \
        } \
    }

INIT_KERN_TRACEPOINT(sched_stat_runtime)
TRACEPOINT_CHECK_AND_SET(sched_stat_runtime)

typedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;

/* use a dummy kprobe to find the address of kallsyms_lookup_name (itself no longer exported) */
void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
    int ret;
    void* pfunc = NULL;
    struct kprobe kp;

    memset(&kp, 0, sizeof(kp));
    kp.symbol_name = "kallsyms_lookup_name";
    kp.pre_handler = NULL;
    kp.addr = NULL; // deliberately NULL: the lookup is done via symbol_name
    ret = register_kprobe(&kp);
    if (ret < 0) {
        printk("register_kprobe fail!\n");
        return NULL;
    }
    printk("register_kprobe succeed!\n");
    pfunc = (void*)kp.addr;
    unregister_kprobe(&kp);
    return pfunc;
}

/* look up the address of any symbol (exported or not) by name */
void* get_func_by_symbol_name(const char* i_symbol)
{
    if (_kallsyms_lookup_name_func == NULL) {
        return NULL;
    }
    return (void *)_kallsyms_lookup_name_func(i_symbol);
}

static int __init testrqclocktask_init(void)
{
    _kallsyms_lookup_name_func = (kallsyms_lookup_name_func)get_func_by_symbol_name_kallsyms_lookup_name();
    _prq = get_func_by_symbol_name("runqueues");
    if (_prq == NULL) {
        printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");
        return -1;
    }
    mykern_sched_stat_runtime.callback = cb_sched_stat_runtime;
    for_each_kernel_tracepoint(sched_stat_runtime_tracepoint_check_and_set, &mykern_sched_stat_runtime);
    if (!mykern_sched_stat_runtime.ptr) {
        printk(KERN_ERR "mykern_sched_stat_runtime register failed!\n");
        return 0;
    }
    else {
        printk(KERN_INFO "mykern_sched_stat_runtime register succeeded!\n");
    }
    tracepoint_probe_register(mykern_sched_stat_runtime.ptr, mykern_sched_stat_runtime.callback, NULL);
    mykern_sched_stat_runtime.bregister = 1;
    return 0;
}

static void __exit testrqclocktask_exit(void)
{
    clear_kern_tracepoint(&mykern_sched_stat_runtime);
    tracepoint_synchronize_unregister();
}

module_init(testrqclocktask_init);
module_exit(testrqclocktask_exit);
After loading the module, dmesg shows the printk lines emitted by the tracepoint callback.
In that output, rq_clock_task and rq_clock are identical on this system, while both lag local_clock by a small amount. The reason is that rq_clock is updated on scheduling events rather than continuously. Our printk runs inside the sched_stat_runtime tracepoint: the scheduler updates rq->clock (and rq->clock_task) just before the tracepoint fires, but whatever time elapses after that update, including running our callback, is no longer reflected in rq->clock and rq->clock_task. Hence the printed values trail local_clock by roughly one hundred to a few hundred nanoseconds.
3. How it works
Section 3.1 describes how struct rq is stored in the kernel. Section 3.2 explains how a kernel module can obtain each CPU's struct rq pointer; with that pointer we can read the rq's clock and clock_task fields, and clock_task is exactly the value returned by the kernel function rq_clock_task().
3.1 How the run queue rq is stored in the kernel
Let's look at the implementation of the cpu_rq macro that the kernel code uses everywhere:
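From kernel/sched/sched.h (the surrounding helper macros vary slightly between kernel versions):

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)   (&per_cpu(runqueues, (cpu)))
#define this_rq()     this_cpu_ptr(&runqueues)
#define task_rq(p)    cpu_rq(task_cpu(p))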
As the definition above shows, the run queues are stored in the per-cpu variable runqueues.
3.2 How a kernel module can get hold of the run queue rq
Section 3.2.1 first shows how to obtain the kallsyms_lookup_name function pointer through a kprobe and then use it to get the struct rq* address. Section 3.2.2 then shows how to make the module aware of the struct rq layout so that it can read the rq's clock_task value.
3.2.1 Getting the kallsyms_lookup_name function pointer via kprobe, then the struct rq* address
The runqueues variable is not an exported symbol, so we cannot simply apply the per_cpu macros to it from a module. Instead, as described in Chapter 3 of the earlier post 获取任意一个进程的共享内存的fd对应的资源,增加引用,实现数据的接管——包含非export的内核函数的模块内使用_kprobe能追踪模块的未export函数吗？-CSDN博客, we first use a kprobe to find the address of kallsyms_lookup_name, and then call kallsyms_lookup_name to look up the address of the runqueues variable.
The part of the Section 2 code that obtains the kallsyms_lookup_name function pointer:
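/* Excerpt from the Section 2 code: a dummy kprobe on "kallsyms_lookup_name"
 * is registered only to learn the symbol's address (kp.addr), then removed. */
void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
    int ret;
    void* pfunc = NULL;
    struct kprobe kp;

    memset(&kp, 0, sizeof(kp));
    kp.symbol_name = "kallsyms_lookup_name";
    ret = register_kprobe(&kp);
    if (ret < 0) {
        printk("register_kprobe fail!\n");
        return NULL;
    }
    pfunc = (void*)kp.addr;
    unregister_kprobe(&kp);
    return pfunc;
}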
The part of the Section 2 code that uses the obtained kallsyms_lookup_name pointer to look up the address of a symbol or function:
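/* Excerpt from the Section 2 code: once _kallsyms_lookup_name_func is set,
 * any (non-exported) symbol address can be looked up by name. */
void* get_func_by_symbol_name(const char* i_symbol)
{
    if (_kallsyms_lookup_name_func == NULL) {
        return NULL;
    }
    return (void *)_kallsyms_lookup_name_func(i_symbol);
}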
Obtaining the struct rq* pointer at module init time:
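/* Excerpt from testrqclocktask_init() in Section 2 */
_kallsyms_lookup_name_func = (kallsyms_lookup_name_func)get_func_by_symbol_name_kallsyms_lookup_name();
_prq = get_func_by_symbol_name("runqueues");
if (_prq == NULL) {
    printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");
    return -1;
}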
Getting the struct rq* of a specific CPU from its CPU number:
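/* Excerpt from the Section 2 code: _prq holds the per-cpu base address of
 * "runqueues"; per_cpu_ptr() turns it into the given CPU's struct rq*. */
struct rq* my_cpu_rq(int i_cpu)
{
    return per_cpu_ptr(_prq, i_cpu);
}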
3.2.2 Making the kernel module aware of struct rq so it can read clock_task
Let's see where struct rq is defined: it lives in kernel/sched/sched.h:
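An abridged view of that definition (the full copy used by the module is in Section 2):

/* kernel/sched/sched.h -- scheduler-internal header, abridged */
struct rq {
    /* runqueue lock: */
    raw_spinlock_t __lock;

    unsigned int nr_running;
    ...
    u64 clock;
    /* Ensure that all clocks are in the same cache line */
    u64 clock_task ____cacheline_aligned;
    ...
};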
kernel/sched/sched.h is an internal header of the scheduler code and cannot be included from a kernel module, so we have to copy the struct rq definition out of it into our test module's source file, testrqclock.c.
struct rq in turn depends on several other structure definitions, which also have to be copied: struct uclamp_bucket, struct uclamp_rq, struct cfs_rq, struct rt_prio_array, struct rt_rq and struct dl_rq. That is why the Section 2 code carries copies of those definitions as well. Note that the copied definitions must match the running kernel exactly (same kernel version and the same CONFIG_* options), otherwise the field offsets will be wrong.
Once the module knows the struct rq layout, it can read the rq's fields directly. rq_clock_task(), the inline helper the scheduler uses as the time base for task execution-time accounting, simply returns the rq's clock_task field:
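For reference, the inline helper as it appears in recent kernels (older kernels assert on rq->lock directly, but the returned field is the same):

static inline u64 rq_clock_task(struct rq *rq)
{
    lockdep_assert_rq_held(rq);
    assert_clock_updated(rq);

    return rq->clock_task;
}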
So in our module, the registered tracepoint callback just reads that field directly:
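/* Excerpt from the Section 2 code: the sched_stat_runtime tracepoint callback
 * reads clock_task (and clock) straight from the current CPU's rq. */
static void cb_sched_stat_runtime(void *i_data, struct task_struct *i_curr,
                                  u64 i_runtime, u64 i_vruntime)
{
    struct rq* prq = my_cpu_rq(smp_processor_id());
    printk("rq_clock_task[%llu],rq_clock[%llu],local_clock[%llu]\n",
           prq->clock_task, prq->clock, local_clock());
}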
How to use tracepoints from a kernel module has been covered in several earlier posts, so it is not repeated here.