From 8126726847252a5b14e8e08553ac727c7cd37bfe Mon Sep 17 00:00:00 2001 From: Tk-Glitch Date: Tue, 15 Jun 2021 14:08:13 +0200 Subject: [PATCH] linux54/510-tkg: Add cacule scheduler patchsets - https://github.com/hamadmarri/cacule-cpu-scheduler Fixes https://github.com/Frogging-Family/linux-tkg/issues/263 --- PKGBUILD | 4 + linux-tkg-config/prepare | 8 +- linux-tkg-patches/5.10/0003-cacule-5.10.patch | 1268 ++++++++++++++++ linux-tkg-patches/5.4/0003-cacule-5.4.patch | 1269 +++++++++++++++++ 4 files changed, 2547 insertions(+), 2 deletions(-) create mode 100644 linux-tkg-patches/5.10/0003-cacule-5.10.patch create mode 100644 linux-tkg-patches/5.4/0003-cacule-5.4.patch diff --git a/PKGBUILD b/PKGBUILD index 6789a98..0af370a 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -87,6 +87,7 @@ case $_basever in 0002-clear-patches.patch 0003-glitched-base.patch 0003-glitched-cfs.patch + 0003-cacule-5.4.patch 0004-glitched-ondemand-muqss.patch 0004-glitched-muqss.patch 0004-5.4-ck1.patch @@ -111,6 +112,7 @@ case $_basever in 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' '3ef74dd80dc8c4323803cbf868178cf01d8ed69f09fe2f537ff91124cbd6e611' '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + '50f99a263482289cd6d5fe9415b4c287afb376f38af711756e808ccb10522885' 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' '815974c65f47301d2a5d1577bf95e8a4b54cad7d77f226e0065f83e763837c48' @@ -297,6 +299,7 @@ case $_basever in 0002-clear-patches.patch 0003-glitched-base.patch 0003-glitched-cfs.patch + 0003-cacule-5.10.patch 0004-glitched-ondemand-muqss.patch 0004-glitched-muqss.patch 0004-5.10-ck1.patch @@ -325,6 +328,7 @@ case $_basever in '35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' '1ac97da07e72ec7e2b0923d32daacacfaa632a44c714d6942d9f143fe239e1b5' '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + '5bff350e3463acdddb5d8b2ed1af0b54586500c4b4ee6be1f14744a65b383497' 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' '2bbbac963b6ca44ef3f8a71ec7c5cad7d66df860869a73059087ee236775970a' 'e00096244e5cddaa5500d08b5f692fd3f25be9401dfa3b0fc624625ff2f5e198' diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index 1a23ca2..2134a3f 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -116,12 +116,16 @@ _set_cpu_scheduler() { ) # CPU SCHED selector - if [ "$_basever" = "54" ] || [ "$_basever" = "57" ]; then + if [ "$_basever" = "54" ]; then + _avail_cpu_scheds=("pds" "bmq" "muqss" "cacule" "cfs") + elif [ "$_basever" = "57" ]; then _avail_cpu_scheds=("pds" "bmq" "muqss" "cfs") elif [ "$_basever" = "58" ]; then _avail_cpu_scheds=("upds" "pds" "bmq" "cfs") - elif [[ "$_basever" =~ ^(59|510|511)$ ]]; then + elif [[ "$_basever" =~ ^(59|511)$ ]]; then _avail_cpu_scheds=("upds" "pds" "bmq" "muqss" "cfs") + elif [ "$_basever" = "510" ]; then + _avail_cpu_scheds=("upds" "pds" "bmq" "muqss" "cacule" "cfs") elif [ "$_basever" = "512" ]; then _avail_cpu_scheds=("pds" "bmq" "muqss" "cacule" "cfs") elif [ "$_basever" = "513" ]; then diff --git a/linux-tkg-patches/5.10/0003-cacule-5.10.patch b/linux-tkg-patches/5.10/0003-cacule-5.10.patch new file mode 100644 index 0000000..1f92c97 --- /dev/null +++ b/linux-tkg-patches/5.10/0003-cacule-5.10.patch @@ -0,0 +1,1268 @@ +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index d4b32cc32bb7..2788c5bbd870 100644 +--- 
a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1087,6 +1087,10 @@ Model available). If your platform happens to meet the + requirements for EAS but you do not want to use it, change + this value to 0. + ++sched_interactivity_factor (CacULE scheduler only) ++================================================== ++Sets the value *m* for interactivity score calculations. See ++Figure 1 in https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf + + sched_schedstats + ================ +diff --git a/Documentation/scheduler/sched-CacULE.rst b/Documentation/scheduler/sched-CacULE.rst +new file mode 100644 +index 000000000000..82b0847c468a +--- /dev/null ++++ b/Documentation/scheduler/sched-CacULE.rst +@@ -0,0 +1,76 @@ ++====================================== ++The CacULE Scheduler by Hamad Al Marri. ++====================================== ++ ++1. Overview ++============= ++ ++The CacULE CPU scheduler is based on interactivity score mechanism. ++The interactivity score is inspired by the ULE scheduler (FreeBSD ++scheduler). ++ ++1.1 About CacULE Scheduler ++-------------------------- ++ ++ - Each CPU has its own runqueue. ++ ++ - NORMAL runqueue is a linked list of sched_entities (instead of RB-Tree). ++ ++ - RT and other runqueues are just the same as the CFS's. ++ ++ - Wake up tasks preempt currently running tasks if its interactivity score value ++ is higher. ++ ++ ++1.2. Complexity ++---------------- ++ ++The complexity of Enqueue and Dequeue a task is O(1). ++ ++The complexity of pick the next task is in O(n), where n is the number of tasks ++in a runqueue (each CPU has its own runqueue). ++ ++Note: O(n) sounds scary, but usually for a machine with 4 CPUS where it is used ++for desktop or mobile jobs, the maximum number of runnable tasks might not ++exceeds 10 (at the pick next run time) - the idle tasks are excluded since they ++are dequeued when sleeping and enqueued when they wake up. ++ ++ ++2. The CacULE Interactivity Score ++======================================================= ++ ++The interactivity score is inspired by the ULE scheduler (FreeBSD scheduler). ++For more information see: https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf ++CacULE doesn't replace CFS with ULE, it only changes the CFS' pick next task ++mechanism to ULE's interactivity score mechanism for picking next task to run. ++ ++ ++2.3 sched_interactivity_factor ++================= ++Sets the value *m* for interactivity score calculations. See Figure 1 in ++https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf ++The default value of in CacULE is 10 which means that the Maximum Interactive ++Score is 20 (since m = Maximum Interactive Score / 2). ++You can tune sched_interactivity_factor with sysctl command: ++ ++ sysctl kernel.sched_interactivity_factor=50 ++ ++This command changes the sched_interactivity_factor from 10 to 50. ++ ++ ++3. Scheduling policies ++======================= ++ ++CacULE some CFS, implements three scheduling policies: ++ ++ - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling ++ policy that is used for regular tasks. ++ ++ - SCHED_BATCH: Does not preempt nearly as often as regular tasks ++ would, thereby allowing tasks to run longer and make better use of ++ caches but at the cost of interactivity. This is well suited for ++ batch jobs. ++ ++ - SCHED_IDLE: This is even weaker than nice 19, but its not a true ++ idle timer scheduler in order to avoid to get into priority ++ inversion problems which would deadlock the machine. 
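The score the documentation above keeps referring to is computed from a task's total lifetime and its accumulated vruntime; the calc_interactivity() hunks for kernel/sched/fair.c further down implement it. As a minimal user-space sketch of that arithmetic — calc_score(), the sample nanosecond values, and the stand-alone main() are illustrative stand-ins, not kernel code, and the factor used is the patch's default interactivity_factor of 32768:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Mirrors the arithmetic of calc_interactivity() in the fair.c hunks below.
     * life_ns = now - cacule_start_time, vrun_ns = accumulated weighted runtime.
     * Lower return value == more interactive.
     */
    static unsigned int calc_score(uint64_t life_ns, uint64_t vrun_ns,
                                   uint64_t factor_m)
    {
        uint64_t vr = vrun_ns | 1;              /* keep the divisor non-zero */
        uint64_t sleep = 1;
        uint64_t two_m = factor_m << 1;

        if (life_ns > vr)
            sleep = (life_ns - vr) | 1;         /* lifetime not spent running */

        if (sleep >= vr)                        /* mostly sleeping: 0 .. m */
            return factor_m / (sleep / vr);

        return two_m - (factor_m / (vr / sleep));   /* mostly running: m .. 2m */
    }

    int main(void)
    {
        const uint64_t m = 32768;               /* default interactivity_factor */

        /* A task that slept ~90% of a 1s lifetime vs. one that ran ~90%. */
        printf("sleeper score: %u\n", calc_score(1000000000ULL, 100000000ULL, m));
        printf("cpu-hog score: %u\n", calc_score(1000000000ULL, 900000000ULL, m));
        return 0;
    }

A task that mostly sleeps lands in the 0..m half of the range and a CPU hog in the m..2m half, which is why the enqueue and wakeup-preemption paths in the hunks below simply prefer the lower score.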
+diff --git a/include/linux/sched.h b/include/linux/sched.h +index 76cd21fa5501..0abad9f1247a 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -448,10 +448,22 @@ struct sched_statistics { + #endif + }; + ++#ifdef CONFIG_CACULE_SCHED ++struct cacule_node { ++ struct cacule_node* next; ++ struct cacule_node* prev; ++ u64 cacule_start_time; ++ u64 vruntime; ++}; ++#endif ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++#ifdef CONFIG_CACULE_SCHED ++ struct cacule_node cacule_node; ++#endif + struct list_head group_node; + unsigned int on_rq; + +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index 3c31ba88aca5..4cf162341ab8 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -31,6 +31,12 @@ extern unsigned int sysctl_sched_min_granularity; + extern unsigned int sysctl_sched_wakeup_granularity; + extern unsigned int sysctl_sched_child_runs_first; + ++#ifdef CONFIG_CACULE_SCHED ++extern unsigned int interactivity_factor; ++extern unsigned int interactivity_threshold; ++extern unsigned int cacule_max_lifetime; ++#endif ++ + enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, +diff --git a/init/Kconfig b/init/Kconfig +index fc4c9f416fad..16676cfd11d7 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -825,6 +825,17 @@ config UCLAMP_BUCKETS_COUNT + + endmenu + ++config CACULE_SCHED ++ bool "CacULE CPU scheduler" ++ default y ++ help ++ The CacULE CPU scheduler is based on interactivity score mechanism. ++ The interactivity score is inspired by the ULE scheduler (FreeBSD ++ scheduler). ++ ++ If unsure, say Y here. ++ ++ + # + # For architectures that want to enable the support for NUMA-affine scheduler + # balancing logic: +@@ -1208,6 +1219,7 @@ config SCHED_AUTOGROUP + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED ++ default y + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..865f8dbddca8 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -46,6 +46,9 @@ choice + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. 
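The cacule_node that the include/linux/sched.h hunk above embeds in struct sched_entity is an intrusive doubly-linked list node: the per-CPU runqueue links the nodes themselves, and the owning entity is recovered with container_of(), which is what the se_of() helper in the fair.c hunks further down does. A rough stand-alone sketch of that embedding, using stand-in types (entity_sketch, the weight field) rather than the kernel's:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdint.h>

    struct cacule_node {
        struct cacule_node *next;
        struct cacule_node *prev;
        uint64_t cacule_start_time;
        uint64_t vruntime;
    };

    /* Stand-in for struct sched_entity; only the embedding matters here. */
    struct entity_sketch {
        int weight;
        struct cacule_node cacule_node;     /* embedded by value, not a pointer */
    };

    /* Same pointer arithmetic the kernel's container_of()/se_of() perform. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct entity_sketch *se_of(struct cacule_node *cn)
    {
        return container_of(cn, struct entity_sketch, cacule_node);
    }

    int main(void)
    {
        struct entity_sketch se = { .weight = 1024 };
        struct cacule_node *cn = &se.cacule_node;   /* what the runqueue links */

        printf("weight recovered from list node: %d\n", se_of(cn)->weight);
        return 0;
    }

Embedding the node means enqueue and dequeue are a handful of pointer updates with no separate allocation, which is what the __enqueue_entity()/__dequeue_entity() replacements below rely on.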
+ ++ config HZ_2000 ++ bool "2000 HZ" ++ + endchoice + + config HZ +@@ -54,6 +57,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 2000 if HZ_2000 + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 57b236251884..6ad7f6f33ed5 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3068,6 +3068,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++ ++#ifdef CONFIG_CACULE_SCHED ++ p->se.cacule_node.vruntime = 0; ++#endif ++ + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -3352,6 +3357,10 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + ++#ifdef CONFIG_CACULE_SCHED ++ p->se.cacule_node.cacule_start_time = sched_clock(); ++#endif ++ + activate_task(rq, p, ENQUEUE_NOCLOCK); + trace_sched_wakeup_new(p); + check_preempt_curr(rq, p, WF_FORK); +@@ -7066,6 +7075,10 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); + #endif + ++#ifdef CONFIG_CACULE_SCHED ++ printk(KERN_INFO "CacULE CPU scheduler v5.10-r2 by Hamad Al Marri."); ++#endif ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 6264584b51c2..bd3901a849c4 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -554,8 +554,11 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +- spread, rq0_min_vruntime, spread0; ++ s64 MIN_vruntime = -1, max_vruntime = -1, ++#if !defined(CONFIG_CACULE_SCHED) ++ min_vruntime, rq0_min_vruntime, spread0, ++#endif ++ spread; + struct rq *rq = cpu_rq(cpu); + struct sched_entity *last; + unsigned long flags; +@@ -576,21 +579,27 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; ++#if !defined(CONFIG_CACULE_SCHED) + min_vruntime = cfs_rq->min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; ++#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); ++#if !defined(CONFIG_CACULE_SCHED) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); ++#endif + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); ++#if !defined(CONFIG_CACULE_SCHED) + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); ++#endif + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1ad0e52487f6..bfd953e6b871 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,10 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * CacULE enhancements CPU cache and scheduler based on ++ * Interactivity Score. 
++ * (C) 2020 Hamad Al Marri + */ + #include "sched.h" + +@@ -113,6 +117,17 @@ int __weak arch_asym_cpu_priority(int cpu) + */ + #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + ++#endif ++#ifdef CONFIG_CACULE_SCHED ++unsigned int __read_mostly cacule_max_lifetime = 22000; // in ms ++unsigned int __read_mostly interactivity_factor = 32768; ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++unsigned int __read_mostly interactivity_threshold = 0; ++#else ++unsigned int __read_mostly interactivity_threshold = 1000; ++#endif ++ + #endif + + #ifdef CONFIG_CFS_BANDWIDTH +@@ -253,6 +268,14 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight + + const struct sched_class fair_sched_class; + ++ ++#ifdef CONFIG_CACULE_SCHED ++static inline struct sched_entity *se_of(struct cacule_node *cn) ++{ ++ return container_of(cn, struct sched_entity, cacule_node); ++} ++#endif ++ + /************************************************************** + * CFS operations on generic schedulable entities: + */ +@@ -512,7 +535,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); + /************************************************************** + * Scheduling class tree data structure manipulation methods: + */ +- ++#if !defined(CONFIG_CACULE_SCHED) + static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) + { + s64 delta = (s64)(vruntime - max_vruntime); +@@ -568,7 +591,170 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; + #endif + } ++#endif /* CONFIG_CACULE_SCHED */ ++ ++#ifdef CONFIG_CACULE_SCHED ++static unsigned int ++calc_interactivity(u64 now, struct cacule_node *se) ++{ ++ u64 l_se, vr_se, sleep_se = 1ULL, u64_factor_m, _2m; ++ unsigned int score_se; ++ ++ /* ++ * in case of vruntime==0, logical OR with 1 would ++ * make sure that the least sig. bit is 1 ++ */ ++ l_se = now - se->cacule_start_time; ++ vr_se = se->vruntime | 1; ++ u64_factor_m = interactivity_factor; ++ _2m = u64_factor_m << 1; ++ ++ /* safety check */ ++ if (likely(l_se > vr_se)) ++ sleep_se = (l_se - vr_se) | 1; ++ ++ if (sleep_se >= vr_se) ++ score_se = u64_factor_m / (sleep_se / vr_se); ++ else ++ score_se = _2m - (u64_factor_m / (vr_se / sleep_se)); ++ ++ return score_se; ++} ++ ++static inline int is_interactive(struct cacule_node *cn) ++{ ++ if (!interactivity_threshold || se_of(cn)->vruntime == 0) ++ return 0; ++ ++ return calc_interactivity(sched_clock(), cn) < interactivity_threshold; ++} ++ ++static inline int ++entity_before_cached(u64 now, unsigned int score_curr, struct cacule_node *se) ++{ ++ unsigned int score_se; ++ int diff; ++ ++ score_se = calc_interactivity(now, se); ++ diff = score_se - score_curr; ++ ++ if (diff <= 0) ++ return 1; ++ ++ return -1; ++} ++ ++/* ++ * Does se have lower interactivity score value (i.e. interactive) than curr? 
If yes, return 1, ++ * otherwise return -1 ++ * se is before curr if se has lower interactivity score value ++ * the lower score, the more interactive ++ */ ++static inline int ++entity_before(u64 now, struct cacule_node *curr, struct cacule_node *se) ++{ ++ unsigned int score_curr, score_se; ++ int diff; ++ ++ score_curr = calc_interactivity(now, curr); ++ score_se = calc_interactivity(now, se); ++ ++ diff = score_se - score_curr; ++ ++ if (diff < 0) ++ return 1; ++ ++ return -1; ++} ++ ++/* ++ * Enqueue an entity ++ */ ++static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) ++{ ++ struct cacule_node *se = &(_se->cacule_node); ++ struct cacule_node *iter, *next = NULL; ++ u64 now = sched_clock(); ++ unsigned int score_se = calc_interactivity(now, se); ++ ++ se->next = NULL; ++ se->prev = NULL; ++ ++ if (likely(cfs_rq->head)) { ++ ++ // start from tail ++ iter = cfs_rq->tail; ++ ++ // does se have higher IS than iter? ++ while (iter && entity_before_cached(now, score_se, iter) == -1) { ++ next = iter; ++ iter = iter->prev; ++ } ++ ++ // se in tail position ++ if (iter == cfs_rq->tail) { ++ cfs_rq->tail->next = se; ++ se->prev = cfs_rq->tail; ++ ++ cfs_rq->tail = se; ++ } ++ // else if not head no tail, insert se after iter ++ else if (iter) { ++ se->next = next; ++ se->prev = iter; ++ ++ iter->next = se; ++ next->prev = se; ++ } ++ // insert se at head ++ else { ++ se->next = cfs_rq->head; ++ cfs_rq->head->prev = se; ++ ++ // lastly reset the head ++ cfs_rq->head = se; ++ } ++ } else { ++ // if empty rq ++ cfs_rq->head = se; ++ cfs_rq->tail = se; ++ } ++} ++ ++static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) ++{ ++ struct cacule_node *se = &(_se->cacule_node); ++ ++ // if only one se in rq ++ if (cfs_rq->head == cfs_rq->tail) { ++ cfs_rq->head = NULL; ++ cfs_rq->tail = NULL; ++ ++ } else if (se == cfs_rq->head) { ++ // if it is the head ++ cfs_rq->head = cfs_rq->head->next; ++ cfs_rq->head->prev = NULL; ++ } else if (se == cfs_rq->tail) { ++ // if it is the tail ++ cfs_rq->tail = cfs_rq->tail->prev; ++ cfs_rq->tail->next = NULL; ++ } else { ++ // if in the middle ++ struct cacule_node *prev = se->prev; ++ struct cacule_node *next = se->next; ++ ++ prev->next = next; ++ ++ if (next) ++ next->prev = prev; ++ } ++} + ++struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) ++{ ++ return se_of(cfs_rq->head); ++} ++#else + /* + * Enqueue an entity into the rb-tree: + */ +@@ -626,16 +812,24 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) + + return rb_entry(next, struct sched_entity, run_node); + } ++#endif /* CONFIG_CACULE_SCHED */ + + #ifdef CONFIG_SCHED_DEBUG + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + { ++#ifdef CONFIG_CACULE_SCHED ++ if (!cfs_rq->tail) ++ return NULL; ++ ++ return se_of(cfs_rq->tail); ++#else + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_entity, run_node); ++#endif /* CONFIG_CACULE_SCHED */ + } + + /************************************************************** +@@ -730,6 +924,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + return slice; + } + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * We calculate the vruntime slice of a to-be-inserted task. 
+ * +@@ -739,6 +934,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + return calc_delta_fair(sched_slice(cfs_rq, se), se); + } ++#endif /* CONFIG_CACULE_SCHED */ + + #include "pelt.h" + #ifdef CONFIG_SMP +@@ -846,14 +1042,51 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) + } + #endif /* CONFIG_SMP */ + ++#ifdef CONFIG_CACULE_SCHED ++static void normalize_lifetime(u64 now, struct sched_entity *se) ++{ ++ struct cacule_node *cn = &se->cacule_node; ++ u64 max_life_ns, life_time; ++ s64 diff; ++ ++ /* ++ * left shift 20 bits is approximately = * 1000000 ++ * we don't need the precision of life time ++ * Ex. for 30s, with left shift (20bits) == 31.457s ++ */ ++ max_life_ns = ((u64) cacule_max_lifetime) << 20; ++ life_time = now - cn->cacule_start_time; ++ diff = life_time - max_life_ns; ++ ++ if (diff > 0) { ++ // multiply life_time by 1024 for more precision ++ u64 old_hrrn_x = (life_time << 7) / ((cn->vruntime >> 3) | 1); ++ ++ // reset life to half max_life (i.e ~15s) ++ cn->cacule_start_time = now - (max_life_ns >> 1); ++ ++ // avoid division by zero ++ if (old_hrrn_x == 0) old_hrrn_x = 1; ++ ++ // reset vruntime based on old hrrn ratio ++ cn->vruntime = (max_life_ns << 9) / old_hrrn_x; ++ } ++} ++#endif /* CONFIG_CACULE_SCHED */ ++ + /* + * Update the current task's runtime statistics. + */ + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++#ifdef CONFIG_CACULE_SCHED ++ u64 now = sched_clock(); ++ u64 delta_exec, delta_fair; ++#else + u64 now = rq_clock_task(rq_of(cfs_rq)); + u64 delta_exec; ++#endif + + if (unlikely(!curr)) + return; +@@ -870,8 +1103,15 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_CACULE_SCHED ++ delta_fair = calc_delta_fair(delta_exec, curr); ++ curr->vruntime += delta_fair; ++ curr->cacule_node.vruntime += delta_fair; ++ normalize_lifetime(now, curr); ++#else + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); ++#endif + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); +@@ -1030,7 +1270,6 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + static inline void + update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- + if (!schedstat_enabled()) + return; + +@@ -1062,7 +1301,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * We are starting a new run period: + */ ++#ifdef CONFIG_CACULE_SCHED ++ se->exec_start = sched_clock(); ++#else + se->exec_start = rq_clock_task(rq_of(cfs_rq)); ++#endif + } + + /************************************************** +@@ -4123,7 +4366,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + + static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_CACULE_SCHED) + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) +@@ -4134,6 +4377,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + #endif + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { +@@ -4165,6 +4409,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + /* ensure we never gain time by being placed backwards. 
*/ + se->vruntime = max_vruntime(se->vruntime, vruntime); + } ++#endif /* CONFIG_CACULE_SCHED */ + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + +@@ -4223,18 +4468,23 @@ static inline bool cfs_bandwidth_used(void); + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { ++#if !defined(CONFIG_CACULE_SCHED) + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); ++#endif + bool curr = cfs_rq->curr == se; + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * If we're the current task, we must renormalise before calling + * update_curr(). + */ + if (renorm && curr) + se->vruntime += cfs_rq->min_vruntime; ++#endif + + update_curr(cfs_rq); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. Being +@@ -4243,6 +4493,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; ++#endif + + /* + * When enqueuing a sched_entity, we must: +@@ -4257,8 +4508,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + update_cfs_group(se); + account_entity_enqueue(cfs_rq, se); + ++#if !defined(CONFIG_CACULE_SCHED) + if (flags & ENQUEUE_WAKEUP) + place_entity(cfs_rq, se, 0); ++#endif + + check_schedstat_required(); + update_stats_enqueue(cfs_rq, se, flags); +@@ -4279,6 +4532,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + check_enqueue_throttle(cfs_rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void __clear_buddies_last(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -4323,6 +4577,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + if (cfs_rq->skip == se) + __clear_buddies_skip(se); + } ++#endif /* !CONFIG_CACULE_SCHED */ + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +@@ -4347,13 +4602,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + update_stats_dequeue(cfs_rq, se, flags); + ++#if !defined(CONFIG_CACULE_SCHED) + clear_buddies(cfs_rq, se); ++#endif + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. 
But before doing +@@ -4362,12 +4620,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if (!(flags & DEQUEUE_SLEEP)) + se->vruntime -= cfs_rq->min_vruntime; ++#endif + + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + + update_cfs_group(se); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be +@@ -4376,8 +4636,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); ++#endif + } + ++#ifdef CONFIG_CACULE_SCHED ++/* ++ * Preempt the current task with a newly woken task if needed: ++ */ ++static void ++check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ // does head have higher IS than curr ++ if (entity_before(sched_clock(), &curr->cacule_node, cfs_rq->head) == 1) ++ resched_curr(rq_of(cfs_rq)); ++} ++#else + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -4417,6 +4690,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + if (delta > ideal_runtime) + resched_curr(rq_of(cfs_rq)); + } ++#endif /* CONFIG_CACULE_SCHED */ + + static void + set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -4451,6 +4725,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++#ifdef CONFIG_CACULE_SCHED ++static struct sched_entity * ++pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ struct cacule_node *se = cfs_rq->head; ++ ++ if (unlikely(!se)) ++ se = &curr->cacule_node; ++ else if (unlikely(curr ++ && entity_before(sched_clock(), se, &curr->cacule_node) == 1)) ++ se = &curr->cacule_node; ++ ++ return se_of(se); ++} ++#else + static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +@@ -4511,6 +4800,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + + return se; + } ++#endif /* CONFIG_CACULE_SCHED */ + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +@@ -5604,7 +5894,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + hrtick_update(rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void set_next_buddy(struct sched_entity *se); ++#endif + + /* + * The dequeue_task method is called before nr_running is +@@ -5636,12 +5928,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. 
+ */ + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); ++#endif + break; + } + flags |= DEQUEUE_SLEEP; +@@ -5757,6 +6051,7 @@ static unsigned long capacity_of(int cpu) + return cpu_rq(cpu)->cpu_capacity; + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void record_wakee(struct task_struct *p) + { + /* +@@ -5803,6 +6098,7 @@ static int wake_wide(struct task_struct *p) + return 0; + return 1; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run +@@ -6479,6 +6775,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) + * to @dst_cpu. +@@ -6712,6 +7009,57 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + + return -1; + } ++#endif /* CONFIG_CACULE_SCHED */ ++ ++#ifdef CONFIG_CACULE_SCHED ++static int ++find_least_IS_cpu(struct task_struct *p) ++{ ++ struct cfs_rq *cfs_rq; ++ unsigned int max_IS = 0; ++ unsigned int IS, IS_c, IS_h; ++ struct sched_entity *curr_se; ++ struct cacule_node *cn, *head; ++ int cpu_i; ++ int new_cpu = -1; ++ ++ for_each_online_cpu(cpu_i) { ++ if (!cpumask_test_cpu(cpu_i, p->cpus_ptr)) ++ continue; ++ ++ cn = NULL; ++ cfs_rq = &cpu_rq(cpu_i)->cfs; ++ ++ curr_se = cfs_rq->curr; ++ head = cfs_rq->head; ++ ++ if (!curr_se && head) ++ cn = head; ++ else if (curr_se && !head) ++ cn = &curr_se->cacule_node; ++ else if (curr_se && head) { ++ IS_c = calc_interactivity(sched_clock(), &curr_se->cacule_node); ++ IS_h = calc_interactivity(sched_clock(), head); ++ ++ IS = IS_c > IS_h? IS_c : IS_h; ++ goto compare; ++ } ++ ++ if (!cn) ++ return cpu_i; ++ ++ IS = calc_interactivity(sched_clock(), cn); ++ ++compare: ++ if (IS > max_IS) { ++ max_IS = IS; ++ new_cpu = cpu_i; ++ } ++ } ++ ++ return new_cpu; ++} ++#endif + + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains +@@ -6734,6 +7082,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + int want_affine = 0; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + ++#ifdef CONFIG_CACULE_SCHED ++ struct sched_entity *se = &p->se; ++ ++ if (!is_interactive(&se->cacule_node)) ++ goto cfs_way; ++ ++ // check first if the prev cpu ++ // has 0 tasks ++ if (cpumask_test_cpu(prev_cpu, p->cpus_ptr) && ++ cpu_rq(prev_cpu)->cfs.nr_running == 0) ++ return prev_cpu; ++ ++ new_cpu = find_least_IS_cpu(p); ++ ++ if (new_cpu != -1) ++ return new_cpu; ++ ++ new_cpu = prev_cpu; ++cfs_way: ++#else + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + +@@ -6746,6 +7114,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + } ++#endif /* CONFIG_CACULE_SCHED */ + + rcu_read_lock(); + for_each_domain(cpu, tmp) { +@@ -6793,6 +7162,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); + */ + static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { ++#if !defined(CONFIG_CACULE_SCHED) + /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new +@@ -6818,6 +7188,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + se->vruntime -= min_vruntime; + } ++#endif /* CONFIG_CACULE_SCHED */ + + if (p->on_rq == 
TASK_ON_RQ_MIGRATING) { + /* +@@ -6863,6 +7234,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + ++#if !defined(CONFIG_CACULE_SCHED) + static unsigned long wakeup_gran(struct sched_entity *se) + { + unsigned long gran = sysctl_sched_wakeup_granularity; +@@ -6941,6 +7313,7 @@ static void set_skip_buddy(struct sched_entity *se) + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * Preempt the current task with a newly woken task if needed: +@@ -6949,9 +7322,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + { + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; ++#endif /* CONFIG_CACULE_SCHED */ + + if (unlikely(se == pse)) + return; +@@ -6965,10 +7341,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + ++#if !defined(CONFIG_CACULE_SCHED) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * We can come here with TIF_NEED_RESCHED already set from new task +@@ -6998,6 +7376,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); + BUG_ON(!pse); ++ ++#ifdef CONFIG_CACULE_SCHED ++ if (entity_before(sched_clock(), &se->cacule_node, &pse->cacule_node) == 1) ++ goto preempt; ++#else + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is +@@ -7007,11 +7390,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + set_next_buddy(pse); + goto preempt; + } ++#endif /* CONFIG_CACULE_SCHED */ + + return; + + preempt: + resched_curr(rq); ++ ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -7026,6 +7412,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); ++#endif /* CONFIG_CACULE_SCHED */ + } + + struct task_struct * +@@ -7200,7 +7587,10 @@ static void yield_task_fair(struct rq *rq) + { + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct sched_entity *se = &curr->se; ++#endif + + /* + * Are we the only task in the tree? 
+@@ -7208,7 +7598,9 @@ static void yield_task_fair(struct rq *rq) + if (unlikely(rq->nr_running == 1)) + return; + ++#if !defined(CONFIG_CACULE_SCHED) + clear_buddies(cfs_rq, se); ++#endif + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); +@@ -7224,7 +7616,9 @@ static void yield_task_fair(struct rq *rq) + rq_clock_skip_update(rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + set_skip_buddy(se); ++#endif + } + + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) +@@ -7235,8 +7629,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + return false; + ++#if !defined(CONFIG_CACULE_SCHED) + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); ++#endif + + yield_task_fair(rq); + +@@ -7464,6 +7860,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Buddy candidates are cache hot: + */ +@@ -7471,6 +7868,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; ++#endif + + if (sysctl_sched_migration_cost == -1) + return 1; +@@ -10742,11 +11140,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + update_overutilized_status(task_rq(curr)); + } + ++#ifdef CONFIG_CACULE_SCHED + /* + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled + */ ++ static void task_fork_fair(struct task_struct *p) ++{ ++ struct cfs_rq *cfs_rq; ++ struct sched_entity *curr; ++ struct rq *rq = this_rq(); ++ struct rq_flags rf; ++ ++ rq_lock(rq, &rf); ++ update_rq_clock(rq); ++ ++ cfs_rq = task_cfs_rq(current); ++ curr = cfs_rq->curr; ++ if (curr) ++ update_curr(cfs_rq); ++ ++ rq_unlock(rq, &rf); ++} ++#else + static void task_fork_fair(struct task_struct *p) + { + struct cfs_rq *cfs_rq; +@@ -10777,6 +11194,7 @@ static void task_fork_fair(struct task_struct *p) + se->vruntime -= cfs_rq->min_vruntime; + rq_unlock(rq, &rf); + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * Priority of the task has changed. 
Check to see if we preempt +@@ -10895,6 +11313,8 @@ static void attach_entity_cfs_rq(struct sched_entity *se) + static void detach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { +@@ -10905,6 +11325,7 @@ static void detach_task_cfs_rq(struct task_struct *p) + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } ++#endif + + detach_entity_cfs_rq(se); + } +@@ -10912,12 +11333,17 @@ static void detach_task_cfs_rq(struct task_struct *p) + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = cfs_rq_of(se); ++#endif + + attach_entity_cfs_rq(se); + ++#if !defined(CONFIG_CACULE_SCHED) + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; ++#endif + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -10973,13 +11399,22 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; ++ ++#if !defined(CONFIG_CACULE_SCHED) + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; + #endif ++#endif /* CONFIG_CACULE_SCHED */ ++ + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif ++ ++#ifdef CONFIG_CACULE_SCHED ++ cfs_rq->head = NULL; ++ cfs_rq->tail = NULL; ++#endif + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index fdebfcbdfca9..e7fd62e89154 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -524,10 +524,13 @@ struct cfs_rq { + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + u64 exec_clock; ++ ++#if !defined(CONFIG_CACULE_SCHED) + u64 min_vruntime; + #ifndef CONFIG_64BIT + u64 min_vruntime_copy; + #endif ++#endif /* CONFIG_CACULE_SCHED */ + + struct rb_root_cached tasks_timeline; + +@@ -536,9 +539,15 @@ struct cfs_rq { + * It is set to NULL otherwise (i.e when none are currently running). 
+ */ + struct sched_entity *curr; ++#ifdef CONFIG_CACULE_SCHED ++ struct cacule_node *head; ++ struct cacule_node *tail; ++ ++#else + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; ++#endif // CONFIG_CACULE_SCHED + + #ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b9306d2bb426..2103ee43de9e 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -1659,6 +1659,29 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = proc_dointvec, + }, ++#ifdef CONFIG_CACULE_SCHED ++ { ++ .procname = "sched_interactivity_factor", ++ .data = &interactivity_factor, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "sched_interactivity_threshold", ++ .data = &interactivity_threshold, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "sched_max_lifetime_ms", ++ .data = &cacule_max_lifetime, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", diff --git a/linux-tkg-patches/5.4/0003-cacule-5.4.patch b/linux-tkg-patches/5.4/0003-cacule-5.4.patch new file mode 100644 index 0000000..6675d2c --- /dev/null +++ b/linux-tkg-patches/5.4/0003-cacule-5.4.patch @@ -0,0 +1,1269 @@ +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..8c3d751a0f16 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1060,6 +1060,10 @@ compilation sees a 1% slowdown, other systems and workloads may vary. + 1: kernel stack erasing is enabled (default), it is performed before + returning to the userspace at the end of syscalls. + ++sched_interactivity_factor (CacULE scheduler only) ++================================================== ++Sets the value *m* for interactivity score calculations. See ++Figure 1 in https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf + + tainted + ======= +diff --git a/Documentation/scheduler/sched-CacULE.rst b/Documentation/scheduler/sched-CacULE.rst +new file mode 100644 +index 000000000000..82b0847c468a +--- /dev/null ++++ b/Documentation/scheduler/sched-CacULE.rst +@@ -0,0 +1,76 @@ ++====================================== ++The CacULE Scheduler by Hamad Al Marri. ++====================================== ++ ++1. Overview ++============= ++ ++The CacULE CPU scheduler is based on interactivity score mechanism. ++The interactivity score is inspired by the ULE scheduler (FreeBSD ++scheduler). ++ ++1.1 About CacULE Scheduler ++-------------------------- ++ ++ - Each CPU has its own runqueue. ++ ++ - NORMAL runqueue is a linked list of sched_entities (instead of RB-Tree). ++ ++ - RT and other runqueues are just the same as the CFS's. ++ ++ - Wake up tasks preempt currently running tasks if its interactivity score value ++ is higher. ++ ++ ++1.2. Complexity ++---------------- ++ ++The complexity of Enqueue and Dequeue a task is O(1). ++ ++The complexity of pick the next task is in O(n), where n is the number of tasks ++in a runqueue (each CPU has its own runqueue). 
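As a rough user-space illustration of that linear pick-next walk — stand-in types (rq_node) and precomputed scores, not the kernel's structures — selection amounts to keeping the node with the lowest interactivity score:

    #include <stdio.h>

    /* Stand-in runqueue node holding a precomputed interactivity score. */
    struct rq_node {
        struct rq_node *next;
        unsigned int score;         /* lower == more interactive */
    };

    /* Walk the whole list, keep the lowest score: O(n) in runnable tasks. */
    static struct rq_node *pick_next(struct rq_node *head)
    {
        struct rq_node *best = head;
        struct rq_node *n;

        for (n = head; n; n = n->next)
            if (n->score < best->score)
                best = n;

        return best;
    }

    int main(void)
    {
        struct rq_node hog    = { NULL,   61896 };
        struct rq_node shell  = { &hog,    4096 };
        struct rq_node worker = { &shell, 32768 };

        printf("picked score: %u\n", pick_next(&worker)->score);  /* 4096 */
        return 0;
    }

The fair.c hunks below actually insert in score order at enqueue time and pick from the head; the walk above just makes the documented O(n) bound concrete.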
++ ++Note: O(n) sounds scary, but usually for a machine with 4 CPUS where it is used ++for desktop or mobile jobs, the maximum number of runnable tasks might not ++exceeds 10 (at the pick next run time) - the idle tasks are excluded since they ++are dequeued when sleeping and enqueued when they wake up. ++ ++ ++2. The CacULE Interactivity Score ++======================================================= ++ ++The interactivity score is inspired by the ULE scheduler (FreeBSD scheduler). ++For more information see: https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf ++CacULE doesn't replace CFS with ULE, it only changes the CFS' pick next task ++mechanism to ULE's interactivity score mechanism for picking next task to run. ++ ++ ++2.3 sched_interactivity_factor ++================= ++Sets the value *m* for interactivity score calculations. See Figure 1 in ++https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf ++The default value of in CacULE is 10 which means that the Maximum Interactive ++Score is 20 (since m = Maximum Interactive Score / 2). ++You can tune sched_interactivity_factor with sysctl command: ++ ++ sysctl kernel.sched_interactivity_factor=50 ++ ++This command changes the sched_interactivity_factor from 10 to 50. ++ ++ ++3. Scheduling policies ++======================= ++ ++CacULE some CFS, implements three scheduling policies: ++ ++ - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling ++ policy that is used for regular tasks. ++ ++ - SCHED_BATCH: Does not preempt nearly as often as regular tasks ++ would, thereby allowing tasks to run longer and make better use of ++ caches but at the cost of interactivity. This is well suited for ++ batch jobs. ++ ++ - SCHED_IDLE: This is even weaker than nice 19, but its not a true ++ idle timer scheduler in order to avoid to get into priority ++ inversion problems which would deadlock the machine. +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5710b80f8050..95531a635122 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -441,11 +441,23 @@ struct sched_statistics { + #endif + }; + ++#ifdef CONFIG_CACULE_SCHED ++struct cacule_node { ++ struct cacule_node* next; ++ struct cacule_node* prev; ++ u64 cacule_start_time; ++ u64 vruntime; ++}; ++#endif ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + unsigned long runnable_weight; + struct rb_node run_node; ++#ifdef CONFIG_CACULE_SCHED ++ struct cacule_node cacule_node; ++#endif + struct list_head group_node; + unsigned int on_rq; + +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index d4f6215ee03f..7f8f058f5e78 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -25,6 +25,12 @@ extern unsigned int sysctl_sched_min_granularity; + extern unsigned int sysctl_sched_wakeup_granularity; + extern unsigned int sysctl_sched_child_runs_first; + ++#ifdef CONFIG_CACULE_SCHED ++extern unsigned int interactivity_factor; ++extern unsigned int interactivity_threshold; ++extern unsigned int cacule_max_lifetime; ++#endif ++ + enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, +diff --git a/init/Kconfig b/init/Kconfig +index 4f9fd78e2200..3d558d9a9b20 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -733,6 +733,17 @@ config UCLAMP_BUCKETS_COUNT + + endmenu + ++config CACULE_SCHED ++ bool "CacULE CPU scheduler" ++ default y ++ help ++ The CacULE CPU scheduler is based on interactivity score mechanism. 
++ The interactivity score is inspired by the ULE scheduler (FreeBSD ++ scheduler). ++ ++ If unsure, say Y here. ++ ++ + # + # For architectures that want to enable the support for NUMA-affine scheduler + # balancing logic: +@@ -1118,6 +1129,7 @@ config SCHED_AUTOGROUP + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED ++ default y + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..865f8dbddca8 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -46,6 +46,9 @@ choice + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_2000 ++ bool "2000 HZ" ++ + endchoice + + config HZ +@@ -54,6 +57,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 2000 if HZ_2000 + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a3e95d7779e1..9e7d4d904da8 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2771,6 +2771,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++ ++#ifdef CONFIG_CACULE_SCHED ++ p->se.cacule_node.vruntime = 0; ++#endif ++ + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -3047,6 +3052,10 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + ++#ifdef CONFIG_CACULE_SCHED ++ p->se.cacule_node.cacule_start_time = sched_clock(); ++#endif ++ + activate_task(rq, p, ENQUEUE_NOCLOCK); + trace_sched_wakeup_new(p); + check_preempt_curr(rq, p, WF_FORK); +@@ -6645,6 +6654,10 @@ void __init sched_init(void) + unsigned long ptr = 0; + int i; + ++#ifdef CONFIG_CACULE_SCHED ++ printk(KERN_INFO "CacULE CPU scheduler v5.4-r2 by Hamad Al Marri."); ++#endif ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index faada713cfae..b06c87fda972 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -501,8 +501,11 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +- spread, rq0_min_vruntime, spread0; ++ s64 MIN_vruntime = -1, max_vruntime = -1, ++#if !defined(CONFIG_CACULE_SCHED) ++ min_vruntime, rq0_min_vruntime, spread0, ++#endif ++ spread; + struct rq *rq = cpu_rq(cpu); + struct sched_entity *last; + unsigned long flags; +@@ -523,21 +526,27 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; ++#if !defined(CONFIG_CACULE_SCHED) + min_vruntime = cfs_rq->min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; ++#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); ++#if !defined(CONFIG_CACULE_SCHED) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); ++#endif + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); ++#if !defined(CONFIG_CACULE_SCHED) + 
spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); ++#endif + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 092aa5e47251..d0018ec66700 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,10 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * CacULE enhancements CPU cache and scheduler based on ++ * Interactivity Score. ++ * (C) 2020 Hamad Al Marri + */ + #include "sched.h" + +@@ -102,6 +106,17 @@ int __weak arch_asym_cpu_priority(int cpu) + */ + #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + ++#endif ++#ifdef CONFIG_CACULE_SCHED ++unsigned int __read_mostly cacule_max_lifetime = 22000; // in ms ++unsigned int __read_mostly interactivity_factor = 32768; ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++unsigned int __read_mostly interactivity_threshold = 0; ++#else ++unsigned int __read_mostly interactivity_threshold = 1000; ++#endif ++ + #endif + + #ifdef CONFIG_CFS_BANDWIDTH +@@ -243,6 +258,14 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight + + const struct sched_class fair_sched_class; + ++ ++#ifdef CONFIG_CACULE_SCHED ++static inline struct sched_entity *se_of(struct cacule_node *cn) ++{ ++ return container_of(cn, struct sched_entity, cacule_node); ++} ++#endif ++ + /************************************************************** + * CFS operations on generic schedulable entities: + */ +@@ -502,7 +525,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); + /************************************************************** + * Scheduling class tree data structure manipulation methods: + */ +- ++#if !defined(CONFIG_CACULE_SCHED) + static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) + { + s64 delta = (s64)(vruntime - max_vruntime); +@@ -558,7 +581,170 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; + #endif + } ++#endif /* CONFIG_CACULE_SCHED */ ++ ++#ifdef CONFIG_CACULE_SCHED ++static unsigned int ++calc_interactivity(u64 now, struct cacule_node *se) ++{ ++ u64 l_se, vr_se, sleep_se = 1ULL, u64_factor_m, _2m; ++ unsigned int score_se; ++ ++ /* ++ * in case of vruntime==0, logical OR with 1 would ++ * make sure that the least sig. bit is 1 ++ */ ++ l_se = now - se->cacule_start_time; ++ vr_se = se->vruntime | 1; ++ u64_factor_m = interactivity_factor; ++ _2m = u64_factor_m << 1; ++ ++ /* safety check */ ++ if (likely(l_se > vr_se)) ++ sleep_se = (l_se - vr_se) | 1; ++ ++ if (sleep_se >= vr_se) ++ score_se = u64_factor_m / (sleep_se / vr_se); ++ else ++ score_se = _2m - (u64_factor_m / (vr_se / sleep_se)); ++ ++ return score_se; ++} ++ ++static inline int is_interactive(struct cacule_node *cn) ++{ ++ if (!interactivity_threshold || se_of(cn)->vruntime == 0) ++ return 0; ++ ++ return calc_interactivity(sched_clock(), cn) < interactivity_threshold; ++} ++ ++static inline int ++entity_before_cached(u64 now, unsigned int score_curr, struct cacule_node *se) ++{ ++ unsigned int score_se; ++ int diff; ++ ++ score_se = calc_interactivity(now, se); ++ diff = score_se - score_curr; ++ ++ if (diff <= 0) ++ return 1; ++ ++ return -1; ++} ++ ++/* ++ * Does se have lower interactivity score value (i.e. interactive) than curr? 
If yes, return 1, ++ * otherwise return -1 ++ * se is before curr if se has lower interactivity score value ++ * the lower score, the more interactive ++ */ ++static inline int ++entity_before(u64 now, struct cacule_node *curr, struct cacule_node *se) ++{ ++ unsigned int score_curr, score_se; ++ int diff; ++ ++ score_curr = calc_interactivity(now, curr); ++ score_se = calc_interactivity(now, se); ++ ++ diff = score_se - score_curr; ++ ++ if (diff < 0) ++ return 1; ++ ++ return -1; ++} ++ ++/* ++ * Enqueue an entity ++ */ ++static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) ++{ ++ struct cacule_node *se = &(_se->cacule_node); ++ struct cacule_node *iter, *next = NULL; ++ u64 now = sched_clock(); ++ unsigned int score_se = calc_interactivity(now, se); ++ ++ se->next = NULL; ++ se->prev = NULL; ++ ++ if (likely(cfs_rq->head)) { ++ ++ // start from tail ++ iter = cfs_rq->tail; ++ ++ // does se have higher IS than iter? ++ while (iter && entity_before_cached(now, score_se, iter) == -1) { ++ next = iter; ++ iter = iter->prev; ++ } ++ ++ // se in tail position ++ if (iter == cfs_rq->tail) { ++ cfs_rq->tail->next = se; ++ se->prev = cfs_rq->tail; ++ ++ cfs_rq->tail = se; ++ } ++ // else if not head no tail, insert se after iter ++ else if (iter) { ++ se->next = next; ++ se->prev = iter; ++ ++ iter->next = se; ++ next->prev = se; ++ } ++ // insert se at head ++ else { ++ se->next = cfs_rq->head; ++ cfs_rq->head->prev = se; ++ ++ // lastly reset the head ++ cfs_rq->head = se; ++ } ++ } else { ++ // if empty rq ++ cfs_rq->head = se; ++ cfs_rq->tail = se; ++ } ++} ++ ++static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) ++{ ++ struct cacule_node *se = &(_se->cacule_node); ++ ++ // if only one se in rq ++ if (cfs_rq->head == cfs_rq->tail) { ++ cfs_rq->head = NULL; ++ cfs_rq->tail = NULL; ++ ++ } else if (se == cfs_rq->head) { ++ // if it is the head ++ cfs_rq->head = cfs_rq->head->next; ++ cfs_rq->head->prev = NULL; ++ } else if (se == cfs_rq->tail) { ++ // if it is the tail ++ cfs_rq->tail = cfs_rq->tail->prev; ++ cfs_rq->tail->next = NULL; ++ } else { ++ // if in the middle ++ struct cacule_node *prev = se->prev; ++ struct cacule_node *next = se->next; ++ ++ prev->next = next; ++ ++ if (next) ++ next->prev = prev; ++ } ++} + ++struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) ++{ ++ return se_of(cfs_rq->head); ++} ++#else + /* + * Enqueue an entity into the rb-tree: + */ +@@ -616,16 +802,24 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) + + return rb_entry(next, struct sched_entity, run_node); + } ++#endif /* CONFIG_CACULE_SCHED */ + + #ifdef CONFIG_SCHED_DEBUG + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + { ++#ifdef CONFIG_CACULE_SCHED ++ if (!cfs_rq->tail) ++ return NULL; ++ ++ return se_of(cfs_rq->tail); ++#else + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_entity, run_node); ++#endif /* CONFIG_CACULE_SCHED */ + } + + /************************************************************** +@@ -711,6 +905,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + return slice; + } + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * We calculate the vruntime slice of a to-be-inserted task. 
+ * +@@ -720,6 +915,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + return calc_delta_fair(sched_slice(cfs_rq, se), se); + } ++#endif /* CONFIG_CACULE_SCHED */ + + #include "pelt.h" + #ifdef CONFIG_SMP +@@ -827,14 +1023,51 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) + } + #endif /* CONFIG_SMP */ + ++#ifdef CONFIG_CACULE_SCHED ++static void normalize_lifetime(u64 now, struct sched_entity *se) ++{ ++ struct cacule_node *cn = &se->cacule_node; ++ u64 max_life_ns, life_time; ++ s64 diff; ++ ++ /* ++ * left shift 20 bits is approximately = * 1000000 ++ * we don't need the precision of life time ++ * Ex. for 30s, with left shift (20bits) == 31.457s ++ */ ++ max_life_ns = ((u64) cacule_max_lifetime) << 20; ++ life_time = now - cn->cacule_start_time; ++ diff = life_time - max_life_ns; ++ ++ if (diff > 0) { ++ // multiply life_time by 1024 for more precision ++ u64 old_hrrn_x = (life_time << 7) / ((cn->vruntime >> 3) | 1); ++ ++ // reset life to half max_life (i.e ~15s) ++ cn->cacule_start_time = now - (max_life_ns >> 1); ++ ++ // avoid division by zero ++ if (old_hrrn_x == 0) old_hrrn_x = 1; ++ ++ // reset vruntime based on old hrrn ratio ++ cn->vruntime = (max_life_ns << 9) / old_hrrn_x; ++ } ++} ++#endif /* CONFIG_CACULE_SCHED */ ++ + /* + * Update the current task's runtime statistics. + */ + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++#ifdef CONFIG_CACULE_SCHED ++ u64 now = sched_clock(); ++ u64 delta_exec, delta_fair; ++#else + u64 now = rq_clock_task(rq_of(cfs_rq)); + u64 delta_exec; ++#endif + + if (unlikely(!curr)) + return; +@@ -851,8 +1084,15 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_CACULE_SCHED ++ delta_fair = calc_delta_fair(delta_exec, curr); ++ curr->vruntime += delta_fair; ++ curr->cacule_node.vruntime += delta_fair; ++ normalize_lifetime(now, curr); ++#else + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); ++#endif + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); +@@ -1011,7 +1251,6 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + static inline void + update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- + if (!schedstat_enabled()) + return; + +@@ -1043,7 +1282,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * We are starting a new run period: + */ ++#ifdef CONFIG_CACULE_SCHED ++ se->exec_start = sched_clock(); ++#else + se->exec_start = rq_clock_task(rq_of(cfs_rq)); ++#endif + } + + /************************************************** +@@ -3866,7 +4109,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + + static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_CACULE_SCHED) + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) +@@ -3877,6 +4120,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + #endif + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { +@@ -3908,6 +4152,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + /* ensure we never gain time by being placed backwards. 
*/ + se->vruntime = max_vruntime(se->vruntime, vruntime); + } ++#endif /* CONFIG_CACULE_SCHED */ + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + +@@ -3966,18 +4211,23 @@ static inline bool cfs_bandwidth_used(void); + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { ++#if !defined(CONFIG_CACULE_SCHED) + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); ++#endif + bool curr = cfs_rq->curr == se; + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * If we're the current task, we must renormalise before calling + * update_curr(). + */ + if (renorm && curr) + se->vruntime += cfs_rq->min_vruntime; ++#endif + + update_curr(cfs_rq); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. Being +@@ -3986,6 +4236,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; ++#endif + + /* + * When enqueuing a sched_entity, we must: +@@ -4000,8 +4251,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + enqueue_runnable_load_avg(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); + ++#if !defined(CONFIG_CACULE_SCHED) + if (flags & ENQUEUE_WAKEUP) + place_entity(cfs_rq, se, 0); ++#endif + + check_schedstat_required(); + update_stats_enqueue(cfs_rq, se, flags); +@@ -4022,6 +4275,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + check_enqueue_throttle(cfs_rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void __clear_buddies_last(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -4066,6 +4320,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + if (cfs_rq->skip == se) + __clear_buddies_skip(se); + } ++#endif /* !CONFIG_CACULE_SCHED */ + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +@@ -4090,13 +4345,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + update_stats_dequeue(cfs_rq, se, flags); + ++#if !defined(CONFIG_CACULE_SCHED) + clear_buddies(cfs_rq, se); ++#endif + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. 
But before doing +@@ -4105,12 +4363,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if (!(flags & DEQUEUE_SLEEP)) + se->vruntime -= cfs_rq->min_vruntime; ++#endif + + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + + update_cfs_group(se); + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be +@@ -4119,8 +4379,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); ++#endif + } + ++#ifdef CONFIG_CACULE_SCHED ++/* ++ * Preempt the current task with a newly woken task if needed: ++ */ ++static void ++check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ // does head have higher IS than curr ++ if (entity_before(sched_clock(), &curr->cacule_node, cfs_rq->head) == 1) ++ resched_curr(rq_of(cfs_rq)); ++} ++#else + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -4160,6 +4433,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + if (delta > ideal_runtime) + resched_curr(rq_of(cfs_rq)); + } ++#endif /* CONFIG_CACULE_SCHED */ + + static void + set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -4194,6 +4468,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++#ifdef CONFIG_CACULE_SCHED ++static struct sched_entity * ++pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ struct cacule_node *se = cfs_rq->head; ++ ++ if (unlikely(!se)) ++ se = &curr->cacule_node; ++ else if (unlikely(curr ++ && entity_before(sched_clock(), se, &curr->cacule_node) == 1)) ++ se = &curr->cacule_node; ++ ++ return se_of(se); ++} ++#else + static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +@@ -4254,6 +4543,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + + return se; + } ++#endif /* CONFIG_CACULE_SCHED */ + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +@@ -5345,7 +5635,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + hrtick_update(rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void set_next_buddy(struct sched_entity *se); ++#endif + + /* + * The dequeue_task method is called before nr_running is +@@ -5374,12 +5666,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. 
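/*
 * Illustrative sketch, not part of the patch above: the comparison that
 * drives both the CacULE check_preempt_tick() and pick_next_entity().
 * entity_before() returns 1 when the second entity has the strictly lower
 * interactivity score (lower == more interactive), otherwise -1; on a tie
 * the incumbent wins. The scores below are made-up numbers.
 */
#include <stdio.h>

/* mirrors entity_before(): is "se" before "curr"? 1 = yes, -1 = no */
static int demo_entity_before(unsigned int score_curr, unsigned int score_se)
{
	return ((int)(score_se - score_curr) < 0) ? 1 : -1;
}

int main(void)
{
	unsigned int score_curr = 14;    /* currently running task */
	unsigned int score_head = 9;     /* most interactive queued task */

	/* check_preempt_tick(): reschedule if the head beats the current task */
	if (demo_entity_before(score_curr, score_head) == 1)
		printf("tick: head (score %u) preempts curr (score %u)\n",
		       score_head, score_curr);

	/* pick_next_entity(): take the head unless curr is still "before" it */
	unsigned int next = score_head;
	if (demo_entity_before(score_head, score_curr) == 1)
		next = score_curr;
	printf("pick next: score %u\n", next);

	return 0;
}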
+ */ + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); ++#endif + break; + } + flags |= DEQUEUE_SLEEP; +@@ -5457,6 +5751,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) + return 0; + } + ++#if !defined(CONFIG_CACULE_SCHED) + static void record_wakee(struct task_struct *p) + { + /* +@@ -5503,6 +5798,7 @@ static int wake_wide(struct task_struct *p) + return 0; + return 1; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run +@@ -6269,6 +6565,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) + return !task_fits_capacity(p, min_cap); + } + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) + * to @dst_cpu. +@@ -6493,6 +6790,57 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + + return -1; + } ++#endif /* CONFIG_CACULE_SCHED */ ++ ++#ifdef CONFIG_CACULE_SCHED ++static int ++find_least_IS_cpu(struct task_struct *p) ++{ ++ struct cfs_rq *cfs_rq; ++ unsigned int max_IS = 0; ++ unsigned int IS, IS_c, IS_h; ++ struct sched_entity *curr_se; ++ struct cacule_node *cn, *head; ++ int cpu_i; ++ int new_cpu = -1; ++ ++ for_each_online_cpu(cpu_i) { ++ if (!cpumask_test_cpu(cpu_i, p->cpus_ptr)) ++ continue; ++ ++ cn = NULL; ++ cfs_rq = &cpu_rq(cpu_i)->cfs; ++ ++ curr_se = cfs_rq->curr; ++ head = cfs_rq->head; ++ ++ if (!curr_se && head) ++ cn = head; ++ else if (curr_se && !head) ++ cn = &curr_se->cacule_node; ++ else if (curr_se && head) { ++ IS_c = calc_interactivity(sched_clock(), &curr_se->cacule_node); ++ IS_h = calc_interactivity(sched_clock(), head); ++ ++ IS = IS_c > IS_h? IS_c : IS_h; ++ goto compare; ++ } ++ ++ if (!cn) ++ return cpu_i; ++ ++ IS = calc_interactivity(sched_clock(), cn); ++ ++compare: ++ if (IS > max_IS) { ++ max_IS = IS; ++ new_cpu = cpu_i; ++ } ++ } ++ ++ return new_cpu; ++} ++#endif + + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains +@@ -6515,6 +6863,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + int want_affine = 0; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + ++#ifdef CONFIG_CACULE_SCHED ++ struct sched_entity *se = &p->se; ++ ++ if (!is_interactive(&se->cacule_node)) ++ goto cfs_way; ++ ++ // check first if the prev cpu ++ // has 0 tasks ++ if (cpumask_test_cpu(prev_cpu, p->cpus_ptr) && ++ cpu_rq(prev_cpu)->cfs.nr_running == 0) ++ return prev_cpu; ++ ++ new_cpu = find_least_IS_cpu(p); ++ ++ if (new_cpu != -1) ++ return new_cpu; ++ ++ new_cpu = prev_cpu; ++cfs_way: ++#else + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + +@@ -6528,6 +6896,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, p->cpus_ptr); + } ++#endif /* CONFIG_CACULE_SCHED */ + + rcu_read_lock(); + for_each_domain(cpu, tmp) { +@@ -6578,6 +6947,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); + */ + static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { ++#if !defined(CONFIG_CACULE_SCHED) + /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new +@@ -6603,6 +6973,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + se->vruntime -= min_vruntime; + } ++#endif /* CONFIG_CACULE_SCHED */ + + if (p->on_rq == TASK_ON_RQ_MIGRATING) 
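/*
 * Illustrative sketch, not part of the patch above: a simplified userspace
 * form of the find_least_IS_cpu() heuristic. For each allowed CPU the worst
 * (highest, i.e. least interactive) score among the running task and the
 * runqueue head is taken, and the CPU with the highest such score wins; an
 * empty CPU is taken immediately. The per-CPU values are made up, and the
 * curr-only/head-only special cases of the kernel code are folded into one
 * "has_tasks" flag here.
 */
#include <stdio.h>

#define NCPUS 4

struct demo_cpu {
	int has_tasks;
	unsigned int curr_score;   /* score of the running task, if any */
	unsigned int head_score;   /* score of the queue head, if any */
};

static int demo_find_least_is_cpu(const struct demo_cpu *cpus, int ncpus)
{
	unsigned int max_is = 0;
	int new_cpu = -1;

	for (int i = 0; i < ncpus; i++) {
		if (!cpus[i].has_tasks)
			return i;          /* empty runqueue: use it right away */

		unsigned int is = cpus[i].curr_score > cpus[i].head_score ?
				  cpus[i].curr_score : cpus[i].head_score;

		if (is > max_is) {         /* prefer the least interactive load */
			max_is = is;
			new_cpu = i;
		}
	}
	return new_cpu;
}

int main(void)
{
	struct demo_cpu cpus[NCPUS] = {
		{ 1, 12, 15 }, { 1, 30, 8 }, { 1, 9, 9 }, { 1, 22, 25 },
	};

	/* prints CPU 1, whose worst score (30) is the highest of the four */
	printf("wake interactive task on CPU %d\n",
	       demo_find_least_is_cpu(cpus, NCPUS));
	return 0;
}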
{ + /* +@@ -6648,6 +7019,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + ++#if !defined(CONFIG_CACULE_SCHED) + static unsigned long wakeup_gran(struct sched_entity *se) + { + unsigned long gran = sysctl_sched_wakeup_granularity; +@@ -6726,6 +7098,7 @@ static void set_skip_buddy(struct sched_entity *se) + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * Preempt the current task with a newly woken task if needed: +@@ -6734,9 +7107,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + { + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; ++#endif /* CONFIG_CACULE_SCHED */ + + if (unlikely(se == pse)) + return; +@@ -6750,10 +7126,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + ++#if !defined(CONFIG_CACULE_SCHED) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * We can come here with TIF_NEED_RESCHED already set from new task +@@ -6783,6 +7161,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); + BUG_ON(!pse); ++ ++#ifdef CONFIG_CACULE_SCHED ++ if (entity_before(sched_clock(), &se->cacule_node, &pse->cacule_node) == 1) ++ goto preempt; ++#else + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is +@@ -6792,11 +7175,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + set_next_buddy(pse); + goto preempt; + } ++#endif /* CONFIG_CACULE_SCHED */ + + return; + + preempt: + resched_curr(rq); ++ ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -6811,6 +7197,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); ++#endif /* CONFIG_CACULE_SCHED */ + } + + static struct task_struct * +@@ -6980,7 +7367,10 @@ static void yield_task_fair(struct rq *rq) + { + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct sched_entity *se = &curr->se; ++#endif + + /* + * Are we the only task in the tree? 
+@@ -6988,7 +7378,9 @@ static void yield_task_fair(struct rq *rq) + if (unlikely(rq->nr_running == 1)) + return; + ++#if !defined(CONFIG_CACULE_SCHED) + clear_buddies(cfs_rq, se); ++#endif + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); +@@ -7004,7 +7396,9 @@ static void yield_task_fair(struct rq *rq) + rq_clock_skip_update(rq); + } + ++#if !defined(CONFIG_CACULE_SCHED) + set_skip_buddy(se); ++#endif + } + + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +@@ -7015,8 +7409,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + return false; + ++#if !defined(CONFIG_CACULE_SCHED) + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); ++#endif + + yield_task_fair(rq); + +@@ -7202,6 +7598,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (unlikely(task_has_idle_policy(p))) + return 0; + ++#if !defined(CONFIG_CACULE_SCHED) + /* + * Buddy candidates are cache hot: + */ +@@ -7209,6 +7606,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; ++#endif + + if (sysctl_sched_migration_cost == -1) + return 1; +@@ -10051,11 +10449,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + update_overutilized_status(task_rq(curr)); + } + ++#ifdef CONFIG_CACULE_SCHED + /* + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled + */ ++ static void task_fork_fair(struct task_struct *p) ++{ ++ struct cfs_rq *cfs_rq; ++ struct sched_entity *curr; ++ struct rq *rq = this_rq(); ++ struct rq_flags rf; ++ ++ rq_lock(rq, &rf); ++ update_rq_clock(rq); ++ ++ cfs_rq = task_cfs_rq(current); ++ curr = cfs_rq->curr; ++ if (curr) ++ update_curr(cfs_rq); ++ ++ rq_unlock(rq, &rf); ++} ++#else + static void task_fork_fair(struct task_struct *p) + { + struct cfs_rq *cfs_rq; +@@ -10086,6 +10503,7 @@ static void task_fork_fair(struct task_struct *p) + se->vruntime -= cfs_rq->min_vruntime; + rq_unlock(rq, &rf); + } ++#endif /* CONFIG_CACULE_SCHED */ + + /* + * Priority of the task has changed. 
Check to see if we preempt +@@ -10201,6 +10619,8 @@ static void attach_entity_cfs_rq(struct sched_entity *se) + static void detach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { +@@ -10211,6 +10631,7 @@ static void detach_task_cfs_rq(struct task_struct *p) + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } ++#endif + + detach_entity_cfs_rq(se); + } +@@ -10218,12 +10639,17 @@ static void detach_task_cfs_rq(struct task_struct *p) + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; ++ ++#if !defined(CONFIG_CACULE_SCHED) + struct cfs_rq *cfs_rq = cfs_rq_of(se); ++#endif + + attach_entity_cfs_rq(se); + ++#if !defined(CONFIG_CACULE_SCHED) + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; ++#endif + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -10279,13 +10705,22 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; ++ ++#if !defined(CONFIG_CACULE_SCHED) + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; + #endif ++#endif /* CONFIG_CACULE_SCHED */ ++ + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif ++ ++#ifdef CONFIG_CACULE_SCHED ++ cfs_rq->head = NULL; ++ cfs_rq->tail = NULL; ++#endif + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 4e490e3db2f8..ba73c73350ec 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -495,10 +495,13 @@ struct cfs_rq { + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + u64 exec_clock; ++ ++#if !defined(CONFIG_CACULE_SCHED) + u64 min_vruntime; + #ifndef CONFIG_64BIT + u64 min_vruntime_copy; + #endif ++#endif /* CONFIG_CACULE_SCHED */ + + struct rb_root_cached tasks_timeline; + +@@ -507,9 +510,15 @@ struct cfs_rq { + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; ++#ifdef CONFIG_CACULE_SCHED ++ struct cacule_node *head; ++ struct cacule_node *tail; ++ ++#else + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; ++#endif // CONFIG_CACULE_SCHED + + #ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index eae6a078619f..9201f91316a5 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -324,6 +324,29 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = proc_dointvec, + }, ++#ifdef CONFIG_CACULE_SCHED ++ { ++ .procname = "sched_interactivity_factor", ++ .data = &interactivity_factor, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "sched_interactivity_threshold", ++ .data = &interactivity_threshold, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "sched_max_lifetime_ms", ++ .data = &cacule_max_lifetime, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns",
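/*
 * Illustrative sketch, not part of the patch above: the three ctl_table
 * entries added to kern_table surface as files under /proc/sys/kernel/ on a
 * kernel built with CONFIG_CACULE_SCHED. This reads whichever of them exist
 * and prints their current values; on a non-CacULE kernel the files are
 * simply absent.
 */
#include <stdio.h>

int main(void)
{
	const char *knobs[] = {
		"/proc/sys/kernel/sched_interactivity_factor",
		"/proc/sys/kernel/sched_interactivity_threshold",
		"/proc/sys/kernel/sched_max_lifetime_ms",
	};

	for (int i = 0; i < 3; i++) {
		FILE *f = fopen(knobs[i], "r");
		unsigned int val;

		if (!f) {
			printf("%s: not available (CacULE not enabled?)\n", knobs[i]);
			continue;
		}
		if (fscanf(f, "%u", &val) == 1)
			printf("%s = %u\n", knobs[i], val);
		fclose(f);
	}
	return 0;
}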