sched: Add snapshot of Window Assisted Load Tracking (WALT)

This snapshot is taken from msm-4.14 as of
commit 871eac76e6be567 ("sched: Improve the scheduler").

Change-Id: Ib4e0b39526d3009cedebb626ece5a767d8247846
Signed-off-by: Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>
Author: Satya Durga Srinivasu Prabhala
Date:   2018-11-07 13:55:58 -08:00
Parent: a3d9530ab5
Commit: 7ebdf76d85
25 changed files with 6798 additions and 81 deletions


@@ -206,6 +206,21 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16
enum task_event {
PUT_PREV_TASK = 0,
PICK_NEXT_TASK = 1,
TASK_WAKE = 2,
TASK_MIGRATE = 3,
TASK_UPDATE = 4,
IRQ_UPDATE = 5,
};
/* Note: this needs to be in sync with the migrate_type_names array */
enum migrate_types {
GROUP_TO_RQ,
RQ_TO_GROUP,
};
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -480,6 +495,102 @@ struct sched_entity {
#endif
};
struct sched_load {
unsigned long prev_load;
unsigned long new_task_load;
unsigned long predicted_load;
};
struct cpu_cycle_counter_cb {
u64 (*get_cpu_cycle_counter)(int cpu);
};
#define MAX_NUM_CGROUP_COLOC_ID 20
extern DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
#ifdef CONFIG_SCHED_WALT
extern void sched_exit(struct task_struct *p);
extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
extern void sched_set_io_is_busy(int val);
extern int sched_set_group_id(struct task_struct *p, unsigned int group_id);
extern unsigned int sched_get_group_id(struct task_struct *p);
extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
extern u32 sched_get_init_task_load(struct task_struct *p);
extern void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin,
u32 fmax);
extern int sched_set_boost(int enable);
extern void free_task_load_ptrs(struct task_struct *p);
#define RAVG_HIST_SIZE_MAX 5
#define NUM_BUSY_BUCKETS 10
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
/*
* 'mark_start' marks the beginning of an event (task waking up, task
* starting to execute, task being preempted) within a window
*
* 'sum' represents how runnable a task has been within current
* window. It incorporates both running time and wait time and is
* frequency scaled.
*
* 'sum_history' keeps track of the history of 'sum' seen over the previous
* RAVG_HIST_SIZE_MAX windows. Windows where the task was entirely sleeping
* are ignored.
*
* 'demand' represents maximum sum seen over previous
* sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
* demand for tasks.
*
* 'curr_window_cpu' represents task's contribution to cpu busy time on
* various CPUs in the current window
*
* 'prev_window_cpu' represents task's contribution to cpu busy time on
* various CPUs in the previous window
*
* 'curr_window' represents the sum of all entries in curr_window_cpu
*
* 'prev_window' represents the sum of all entries in prev_window_cpu
*
* 'pred_demand' represents task's current predicted cpu busy time
*
* 'busy_buckets' groups historical busy time into different buckets
* used for prediction
*
* 'demand_scaled' represents task's demand scaled to 1024
*/
u64 mark_start;
u32 sum, demand;
u32 coloc_demand;
u32 sum_history[RAVG_HIST_SIZE_MAX];
u32 *curr_window_cpu, *prev_window_cpu;
u32 curr_window, prev_window;
u16 active_windows;
u32 pred_demand;
u8 busy_buckets[NUM_BUSY_BUCKETS];
u16 demand_scaled;
u16 pred_demand_scaled;
};
#else
static inline void sched_exit(struct task_struct *p) { }
static inline int
register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
{
return 0;
}
static inline void sched_set_io_is_busy(int val) {};
static inline int sched_set_boost(int enable)
{
return -EINVAL;
}
static inline void free_task_load_ptrs(struct task_struct *p) { }
static inline void sched_update_cpu_freq_min_max(const cpumask_t *cpus,
u32 fmin, u32 fmax) { }
#endif /* CONFIG_SCHED_WALT */
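
A minimal sketch of the relationship documented in struct ravg above: 'demand' follows the per-window busy-time history kept in sum_history[]. The helper below is illustrative only and assumes a plain maximum over the tracked windows; the snapshot's real update path lives in kernel/sched/walt.c (not shown in this excerpt) and supports additional policies.

static u32 ravg_demand_from_history(const struct ravg *ravg, int hist_size)
{
	u32 best = 0;
	int i;

	/* take the largest frequency-scaled busy time seen recently */
	for (i = 0; i < hist_size && i < RAVG_HIST_SIZE_MAX; i++)
		best = max(best, ravg->sum_history[i]);

	return best;
}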
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
@@ -644,6 +755,22 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
u64 last_sleep_ts;
#ifdef CONFIG_SCHED_WALT
struct ravg ravg;
/*
* 'init_load_pct' represents the initial task load assigned to children
* of this task
*/
u32 init_load_pct;
u64 last_wake_ts;
u64 last_enqueued_ts;
struct related_thread_group *grp;
struct list_head grp_list;
u64 cpu_cycles;
bool misfit;
#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -1394,6 +1521,7 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_WAKE_UP_IDLE 0x01000000 /* TTWU on an idle CPU */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
@@ -1904,4 +2032,32 @@ static inline void rseq_syscall(struct pt_regs *regs)
#endif
static inline u32 sched_get_wake_up_idle(struct task_struct *p)
{
u32 enabled = p->flags & PF_WAKE_UP_IDLE;
return !!enabled;
}
static inline int sched_set_wake_up_idle(struct task_struct *p,
int wake_up_idle)
{
int enable = !!wake_up_idle;
if (enable)
p->flags |= PF_WAKE_UP_IDLE;
else
p->flags &= ~PF_WAKE_UP_IDLE;
return 0;
}
static inline void set_wake_up_idle(bool enabled)
{
if (enabled)
current->flags |= PF_WAKE_UP_IDLE;
else
current->flags &= ~PF_WAKE_UP_IDLE;
}
#endif
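
register_cpu_cycle_counter_cb(), declared in the WALT block above, lets a platform driver supply per-CPU cycle counts that WALT can use for frequency estimation. A hedged registration sketch, with all driver-side names invented for illustration:

static u64 my_soc_get_cpu_cycles(int cpu)
{
	/* a real driver would read a per-CPU hardware cycle counter here */
	return 0;
}

static struct cpu_cycle_counter_cb my_soc_cycle_cb = {
	.get_cpu_cycle_counter = my_soc_get_cpu_cycles,
};

static int __init my_soc_cycle_counter_init(void)
{
	return register_cpu_cycle_counter_cb(&my_soc_cycle_cb);
}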


@@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2016, 2018, The Linux Foundation. All rights reserved.
*/
#ifndef __CORE_CTL_H
#define __CORE_CTL_H
struct core_ctl_notif_data {
unsigned int nr_big;
unsigned int coloc_load_pct;
};
#ifdef CONFIG_SCHED_CORE_CTL
void core_ctl_check(u64 wallclock);
int core_ctl_set_boost(bool boost);
void core_ctl_notifier_register(struct notifier_block *n);
void core_ctl_notifier_unregister(struct notifier_block *n);
#else
static inline void core_ctl_check(u64 wallclock) {}
static inline int core_ctl_set_boost(bool boost)
{
return 0;
}
static inline void core_ctl_notifier_register(struct notifier_block *n) {}
static inline void core_ctl_notifier_unregister(struct notifier_block *n) {}
#endif
#endif
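
A hedged usage sketch for the notifier interface above. That the notifier payload is a struct core_ctl_notif_data is inferred from this header; the client code and names are hypothetical.

#include <linux/notifier.h>
#include <linux/sched/core_ctl.h>

static int my_core_ctl_cb(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	struct core_ctl_notif_data *nd = data;

	pr_debug("core_ctl: nr_big=%u coloc_load_pct=%u\n",
		 nd->nr_big, nd->coloc_load_pct);
	return NOTIFY_OK;
}

static struct notifier_block my_core_ctl_nb = {
	.notifier_call = my_core_ctl_cb,
};

static int __init my_client_init(void)
{
	core_ctl_notifier_register(&my_core_ctl_nb);
	return 0;
}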


@@ -11,6 +11,12 @@
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_MIGRATION (1U << 1)
#define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3)
#define SCHED_CPUFREQ_WALT (1U << 4)
#define SCHED_CPUFREQ_PL (1U << 5)
#define SCHED_CPUFREQ_EARLY_DET (1U << 6)
#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7)
#define SCHED_CPUFREQ_CONTINUE (1U << 8)
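
The new flag bits are meant to be OR-ed together when asking the governor for an update. A small sketch mirroring the wakeup-path call added later in this commit; the helper name here is hypothetical.

static inline void walt_wakeup_freq_update(struct rq *rq)
{
	/* WALT-driven update that also honours the predicted-load signal */
	cpufreq_update_util(rq, SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_PL);
}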
#ifdef CONFIG_CPU_FREQ
struct update_util_data {


@@ -22,6 +22,20 @@ extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
#ifdef CONFIG_SMP
extern unsigned int sched_get_cpu_util(int cpu);
extern u64 sched_get_cpu_last_busy_time(int cpu);
#else
static inline unsigned int sched_get_cpu_util(int cpu)
{
return 0;
}
static inline u64 sched_get_cpu_last_busy_time(int cpu)
{
return 0;
}
#endif
static inline int sched_info_on(void)
{
#ifdef CONFIG_SCHEDSTATS


@@ -26,6 +26,25 @@ extern unsigned int sysctl_sched_sync_hint_enable;
extern unsigned int sysctl_sched_cstate_aware;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
#ifdef CONFIG_SCHED_WALT
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int sysctl_sched_use_walt_task_util;
extern unsigned int sysctl_sched_walt_init_task_load_pct;
extern unsigned int sysctl_sched_cpu_high_irqload;
extern unsigned int sysctl_sched_boost;
extern unsigned int sysctl_sched_group_upmigrate_pct;
extern unsigned int sysctl_sched_group_downmigrate_pct;
extern unsigned int sysctl_sched_walt_rotate_big_tasks;
extern unsigned int sysctl_sched_min_task_util_for_boost;
extern unsigned int sysctl_sched_min_task_util_for_colocation;
extern unsigned int sysctl_sched_little_cluster_coloc_fmin_khz;
extern int
walt_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
#endif
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -49,6 +68,8 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos);
#endif
extern int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
/*
* control realtime throttling:
*
@@ -85,6 +106,12 @@ extern int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
#ifdef CONFIG_SCHED_WALT
extern int sched_little_cluster_coloc_fmin_khz_handler(struct ctl_table *table,
int write, void __user *buffer,
size_t *lenp, loff_t *ppos);
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern unsigned int sysctl_sched_energy_aware;
extern int sched_energy_aware_handler(struct ctl_table *table, int write,


@@ -66,6 +66,8 @@ struct sched_domain_attr {
extern int sched_domain_level_max;
unsigned long capacity_curr_of(int cpu);
struct sched_group;
struct sched_domain_shared {
@@ -173,6 +175,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
extern bool sched_is_energy_aware(void);
#define SDTL_OVERLAP 0x01


@@ -10,6 +10,9 @@
#define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
#define DECLARE_BITMAP_ARRAY(name, nr, bits) \
unsigned long name[nr][BITS_TO_LONGS(bits)]
typedef u32 __kernel_dev_t;
typedef __kernel_fd_set fd_set;
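
For illustration, the new DECLARE_BITMAP_ARRAY() macro above declares a two-dimensional bitmap; the name below is hypothetical:

/* DECLARE_BITMAP_ARRAY(cluster_cpus, 4, NR_CPUS); expands to: */
unsigned long cluster_cpus[4][BITS_TO_LONGS(NR_CPUS)];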


@@ -705,6 +705,51 @@ TRACE_EVENT(sched_load_rt_rq,
__entry->util)
);
#ifdef CONFIG_SCHED_WALT
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int sysctl_sched_use_walt_task_util;
extern unsigned int sched_ravg_window;
extern unsigned int walt_disabled;
#endif
/*
* Tracepoint for accounting cpu root cfs_rq
*/
TRACE_EVENT(sched_load_avg_cpu,
TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
TP_ARGS(cpu, cfs_rq),
TP_STRUCT__entry(
__field(int, cpu)
__field(unsigned long, load_avg)
__field(unsigned long, util_avg)
__field(unsigned long, util_avg_pelt)
__field(unsigned long, util_avg_walt)
),
TP_fast_assign(
__entry->cpu = cpu;
__entry->load_avg = cfs_rq->avg.load_avg;
__entry->util_avg = cfs_rq->avg.util_avg;
__entry->util_avg_pelt = cfs_rq->avg.util_avg;
__entry->util_avg_walt = 0;
#ifdef CONFIG_SCHED_WALT
__entry->util_avg_walt =
cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
do_div(__entry->util_avg_walt, sched_ravg_window);
if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
__entry->util_avg = __entry->util_avg_walt;
#endif
),
TP_printk("cpu=%d load_avg=%lu util_avg=%lu util_avg_pelt=%lu util_avg_walt=%lu",
__entry->cpu, __entry->load_avg, __entry->util_avg,
__entry->util_avg_pelt, __entry->util_avg_walt)
);
/*
* Tracepoint for sched_entity load tracking:
*/
@@ -1022,6 +1067,8 @@ TRACE_EVENT(sched_overutilized,
__entry->overutilized)
);
#include "walt.h"
#endif /* CONFIG_SMP */
#endif /* _TRACE_SCHED_H */
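
The util_avg_walt field in sched_load_avg_cpu above normalises the previous window's busy time to the 0..1024 capacity range. A minimal sketch of that scaling; the helper name and example numbers are illustrative only.

static inline u64 walt_window_to_util(u64 prev_runnable_sum,
				      unsigned int ravg_window)
{
	u64 util = prev_runnable_sum << SCHED_CAPACITY_SHIFT;	/* * 1024 */

	do_div(util, ravg_window);
	return util;	/* e.g. 10ms busy in a 20ms window -> 512 */
}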

include/trace/events/walt.h (new file, 604 lines)

@@ -0,0 +1,604 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
*/
#ifdef CONFIG_SCHED_WALT
struct rq;
struct group_cpu_time;
extern const char *task_event_names[];
#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT)
static inline void __window_data(u32 *dst, u32 *src)
{
if (src)
memcpy(dst, src, nr_cpu_ids * sizeof(u32));
else
memset(dst, 0, nr_cpu_ids * sizeof(u32));
}
struct trace_seq;
const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len)
{
int i;
const char *ret = p->buffer + seq_buf_used(&p->seq);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%u ", buf[i]);
trace_seq_putc(p, 0);
return ret;
}
static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new)
{
if (curr)
if (new)
return rq->nt_curr_runnable_sum;
else
return rq->curr_runnable_sum;
else
if (new)
return rq->nt_prev_runnable_sum;
else
return rq->prev_runnable_sum;
}
static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new)
{
if (curr)
if (new)
return rq->grp_time.nt_curr_runnable_sum;
else
return rq->grp_time.curr_runnable_sum;
else
if (new)
return rq->grp_time.nt_prev_runnable_sum;
else
return rq->grp_time.prev_runnable_sum;
}
static inline s64
__get_update_sum(struct rq *rq, enum migrate_types migrate_type,
bool src, bool new, bool curr)
{
switch (migrate_type) {
case RQ_TO_GROUP:
if (src)
return __rq_update_sum(rq, curr, new);
else
return __grp_update_sum(rq, curr, new);
case GROUP_TO_RQ:
if (src)
return __grp_update_sum(rq, curr, new);
else
return __rq_update_sum(rq, curr, new);
default:
WARN_ON_ONCE(1);
return -1;
}
}
#endif
TRACE_EVENT(sched_update_pred_demand,
TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int pct,
unsigned int pred_demand),
TP_ARGS(rq, p, runtime, pct, pred_demand),
TP_STRUCT__entry(
__array(char, comm, TASK_COMM_LEN)
__field(pid_t, pid)
__field(unsigned int, runtime)
__field(int, pct)
__field(unsigned int, pred_demand)
__array(u8, bucket, NUM_BUSY_BUCKETS)
__field(int, cpu)
),
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->runtime = runtime;
__entry->pct = pct;
__entry->pred_demand = pred_demand;
memcpy(__entry->bucket, p->ravg.busy_buckets,
NUM_BUSY_BUCKETS * sizeof(u8));
__entry->cpu = rq->cpu;
),
TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)",
__entry->pid, __entry->comm,
__entry->runtime, __entry->pct, __entry->cpu,
__entry->pred_demand, __entry->bucket[0], __entry->bucket[1],
__entry->bucket[2], __entry->bucket[3], __entry->bucket[4],
__entry->bucket[5], __entry->bucket[6], __entry->bucket[7],
__entry->bucket[8], __entry->bucket[9])
);
TRACE_EVENT(sched_update_history,
TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
enum task_event evt),
TP_ARGS(rq, p, runtime, samples, evt),
TP_STRUCT__entry(
__array(char, comm, TASK_COMM_LEN)
__field(pid_t, pid)
__field(unsigned int, runtime)
__field(int, samples)
__field(enum task_event, evt)
__field(unsigned int, demand)
__field(unsigned int, coloc_demand)
__field(unsigned int, pred_demand)
__array(u32, hist, RAVG_HIST_SIZE_MAX)
__field(unsigned int, nr_big_tasks)
__field(int, cpu)
),
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->runtime = runtime;
__entry->samples = samples;
__entry->evt = evt;
__entry->demand = p->ravg.demand;
__entry->coloc_demand = p->ravg.coloc_demand;
__entry->pred_demand = p->ravg.pred_demand;
memcpy(__entry->hist, p->ravg.sum_history,
RAVG_HIST_SIZE_MAX * sizeof(u32));
__entry->nr_big_tasks = rq->walt_stats.nr_big_tasks;
__entry->cpu = rq->cpu;
),
TP_printk("%d (%s): runtime %u samples %d event %s demand %u coloc_demand %u pred_demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u",
__entry->pid, __entry->comm,
__entry->runtime, __entry->samples,
task_event_names[__entry->evt],
__entry->demand, __entry->coloc_demand, __entry->pred_demand,
__entry->hist[0], __entry->hist[1],
__entry->hist[2], __entry->hist[3],
__entry->hist[4], __entry->cpu, __entry->nr_big_tasks)
);
TRACE_EVENT(sched_get_task_cpu_cycles,
TP_PROTO(int cpu, int event, u64 cycles,
u64 exec_time, struct task_struct *p),
TP_ARGS(cpu, event, cycles, exec_time, p),
TP_STRUCT__entry(
__field(int, cpu)
__field(int, event)
__field(u64, cycles)
__field(u64, exec_time)
__field(u32, freq)
__field(u32, legacy_freq)
__field(u32, max_freq)
__field(pid_t, pid)
__array(char, comm, TASK_COMM_LEN)
),
TP_fast_assign(
__entry->cpu = cpu;
__entry->event = event;
__entry->cycles = cycles;
__entry->exec_time = exec_time;
__entry->freq = cpu_cycles_to_freq(cycles, exec_time);
__entry->legacy_freq = sched_cpu_legacy_freq(cpu);
__entry->max_freq = cpu_max_freq(cpu);
__entry->pid = p->pid;
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
),
TP_printk("cpu=%d event=%d cycles=%llu exec_time=%llu freq=%u legacy_freq=%u max_freq=%u task=%d (%s)",
__entry->cpu, __entry->event, __entry->cycles,
__entry->exec_time, __entry->freq, __entry->legacy_freq,
__entry->max_freq, __entry->pid, __entry->comm)
);
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
u64 wallclock, u64 irqtime, u64 cycles, u64 exec_time,
struct group_cpu_time *cpu_time),
TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time, cpu_time),
TP_STRUCT__entry(
__array(char, comm, TASK_COMM_LEN)
__field(pid_t, pid)
__field(pid_t, cur_pid)
__field(unsigned int, cur_freq)
__field(u64, wallclock)
__field(u64, mark_start)
__field(u64, delta_m)
__field(u64, win_start)
__field(u64, delta)
__field(u64, irqtime)
__field(enum task_event, evt)
__field(unsigned int, demand)
__field(unsigned int, coloc_demand)
__field(unsigned int, sum)
__field(int, cpu)
__field(unsigned int, pred_demand)
__field(u64, rq_cs)
__field(u64, rq_ps)
__field(u64, grp_cs)
__field(u64, grp_ps)
__field(u64, grp_nt_cs)
__field(u64, grp_nt_ps)
__field(u32, curr_window)
__field(u32, prev_window)
__dynamic_array(u32, curr_sum, nr_cpu_ids)
__dynamic_array(u32, prev_sum, nr_cpu_ids)
__field(u64, nt_cs)
__field(u64, nt_ps)
__field(u32, active_windows)
__field(u8, curr_top)
__field(u8, prev_top)
),
TP_fast_assign(
__entry->wallclock = wallclock;
__entry->win_start = rq->window_start;
__entry->delta = (wallclock - rq->window_start);
__entry->evt = evt;
__entry->cpu = rq->cpu;
__entry->cur_pid = rq->curr->pid;
__entry->cur_freq = cpu_cycles_to_freq(cycles, exec_time);
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->mark_start = p->ravg.mark_start;
__entry->delta_m = (wallclock - p->ravg.mark_start);
__entry->demand = p->ravg.demand;
__entry->coloc_demand = p->ravg.coloc_demand;
__entry->sum = p->ravg.sum;
__entry->irqtime = irqtime;
__entry->pred_demand = p->ravg.pred_demand;
__entry->rq_cs = rq->curr_runnable_sum;
__entry->rq_ps = rq->prev_runnable_sum;
__entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0;
__entry->grp_ps = cpu_time ? cpu_time->prev_runnable_sum : 0;
__entry->grp_nt_cs = cpu_time ?
cpu_time->nt_curr_runnable_sum : 0;
__entry->grp_nt_ps = cpu_time ?
cpu_time->nt_prev_runnable_sum : 0;
__entry->curr_window = p->ravg.curr_window;
__entry->prev_window = p->ravg.prev_window;
__window_data(__get_dynamic_array(curr_sum),
p->ravg.curr_window_cpu);
__window_data(__get_dynamic_array(prev_sum),
p->ravg.prev_window_cpu);
__entry->nt_cs = rq->nt_curr_runnable_sum;
__entry->nt_ps = rq->nt_prev_runnable_sum;
__entry->active_windows = p->ravg.active_windows;
__entry->curr_top = rq->curr_top;
__entry->prev_top = rq->prev_top;
),
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u coloc_demand: %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u",
__entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
__entry->cur_freq, __entry->cur_pid,
__entry->pid, __entry->comm, __entry->mark_start,
__entry->delta_m, __entry->demand, __entry->coloc_demand,
__entry->sum, __entry->irqtime, __entry->pred_demand,
__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
__window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids),
__entry->prev_window,
__window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids),
__entry->nt_cs, __entry->nt_ps,
__entry->active_windows, __entry->grp_cs,
__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps,
__entry->curr_top, __entry->prev_top)
);
TRACE_EVENT(sched_update_task_ravg_mini,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
u64 wallclock, u64 irqtime, u64 cycles, u64 exec_time,
struct group_cpu_time *cpu_time),
TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time, cpu_time),
TP_STRUCT__entry(
__array(char, comm, TASK_COMM_LEN)
__field(pid_t, pid)
__field(u64, wallclock)
__field(u64, mark_start)
__field(u64, delta_m)
__field(u64, win_start)
__field(u64, delta)
__field(enum task_event, evt)
__field(unsigned int, demand)
__field(int, cpu)
__field(u64, rq_cs)
__field(u64, rq_ps)
__field(u64, grp_cs)
__field(u64, grp_ps)
__field(u32, curr_window)
__field(u32, prev_window)
),
TP_fast_assign(
__entry->wallclock = wallclock;
__entry->win_start = rq->window_start;
__entry->delta = (wallclock - rq->window_start);
__entry->evt = evt;
__entry->cpu = rq->cpu;
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->mark_start = p->ravg.mark_start;
__entry->delta_m = (wallclock - p->ravg.mark_start);
__entry->demand = p->ravg.demand;
__entry->rq_cs = rq->curr_runnable_sum;
__entry->rq_ps = rq->prev_runnable_sum;
__entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0;
__entry->grp_ps = cpu_time ? cpu_time->prev_runnable_sum : 0;
__entry->curr_window = p->ravg.curr_window;
__entry->prev_window = p->ravg.prev_window;
),
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d task %d (%s) ms %llu delta %llu demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u grp_cs %lld grp_ps %lld",
__entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
__entry->pid, __entry->comm, __entry->mark_start,
__entry->delta_m, __entry->demand,
__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
__entry->prev_window, __entry->grp_cs, __entry->grp_ps)
);
struct migration_sum_data;
extern const char *migrate_type_names[];
TRACE_EVENT(sched_set_preferred_cluster,
TP_PROTO(struct related_thread_group *grp, u64 total_demand),
TP_ARGS(grp, total_demand),
TP_STRUCT__entry(
__field(int, id)
__field(u64, demand)
__field(int, cluster_first_cpu)
__array(char, comm, TASK_COMM_LEN)
__field(pid_t, pid)
__field(unsigned int, task_demand)
),
TP_fast_assign(
__entry->id = grp->id;
__entry->demand = total_demand;
__entry->cluster_first_cpu = grp->preferred_cluster ?
cluster_first_cpu(grp->preferred_cluster) : -1;
),
TP_printk("group_id %d total_demand %llu preferred_cluster_first_cpu %d",
__entry->id, __entry->demand,
__entry->cluster_first_cpu)
);
TRACE_EVENT(sched_migration_update_sum,
TP_PROTO(struct task_struct *p, enum migrate_types migrate_type,
struct rq *rq),
TP_ARGS(p, migrate_type, rq),
TP_STRUCT__entry(
__field(int, tcpu)
__field(int, pid)
__field(enum migrate_types, migrate_type)
__field(s64, src_cs)
__field(s64, src_ps)
__field(s64, dst_cs)
__field(s64, dst_ps)
__field(s64, src_nt_cs)
__field(s64, src_nt_ps)
__field(s64, dst_nt_cs)
__field(s64, dst_nt_ps)
),
TP_fast_assign(
__entry->tcpu = task_cpu(p);
__entry->pid = p->pid;
__entry->migrate_type = migrate_type;
__entry->src_cs = __get_update_sum(rq, migrate_type,
true, false, true);
__entry->src_ps = __get_update_sum(rq, migrate_type,
true, false, false);
__entry->dst_cs = __get_update_sum(rq, migrate_type,
false, false, true);
__entry->dst_ps = __get_update_sum(rq, migrate_type,
false, false, false);
__entry->src_nt_cs = __get_update_sum(rq, migrate_type,
true, true, true);
__entry->src_nt_ps = __get_update_sum(rq, migrate_type,
true, true, false);
__entry->dst_nt_cs = __get_update_sum(rq, migrate_type,
false, true, true);
__entry->dst_nt_ps = __get_update_sum(rq, migrate_type,
false, true, false);
),
TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld",
__entry->pid, __entry->tcpu,
migrate_type_names[__entry->migrate_type],
__entry->src_cs, __entry->src_ps, __entry->dst_cs,
__entry->dst_ps, __entry->src_nt_cs, __entry->src_nt_ps,
__entry->dst_nt_cs, __entry->dst_nt_ps)
);
TRACE_EVENT(sched_set_boost,
TP_PROTO(int type),
TP_ARGS(type),
TP_STRUCT__entry(
__field(int, type)
),
TP_fast_assign(
__entry->type = type;
),
TP_printk("type %d", __entry->type)
);
TRACE_EVENT(sched_load_balance_skip_tasks,
TP_PROTO(int scpu, int dcpu, int grp_type, int pid,
unsigned long h_load, unsigned long task_util,
unsigned long affinity),
TP_ARGS(scpu, dcpu, grp_type, pid, h_load, task_util, affinity),
TP_STRUCT__entry(
__field(int, scpu)
__field(unsigned long, src_util_cum)
__field(int, grp_type)
__field(int, dcpu)
__field(unsigned long, dst_util_cum)
__field(int, pid)
__field(unsigned long, affinity)
__field(unsigned long, task_util)
__field(unsigned long, h_load)
),
TP_fast_assign(
__entry->scpu = scpu;
__entry->src_util_cum =
cpu_rq(scpu)->cum_window_demand_scaled;
__entry->grp_type = grp_type;
__entry->dcpu = dcpu;
__entry->dst_util_cum =
cpu_rq(dcpu)->cum_window_demand_scaled;
__entry->pid = pid;
__entry->affinity = affinity;
__entry->task_util = task_util;
__entry->h_load = h_load;
),
TP_printk("source_cpu=%d util_cum=%lu group_type=%d dest_cpu=%d util_cum=%lu pid=%d affinity=%#lx task_util=%lu task_h_load=%lu",
__entry->scpu, __entry->src_util_cum, __entry->grp_type,
__entry->dcpu, __entry->dst_util_cum, __entry->pid,
__entry->affinity, __entry->task_util, __entry->h_load)
);
DECLARE_EVENT_CLASS(sched_cpu_load,
TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost),
TP_ARGS(rq, idle, irqload, power_cost),
TP_STRUCT__entry(
__field(unsigned int, cpu)
__field(unsigned int, idle)
__field(unsigned int, nr_running)
__field(unsigned int, nr_big_tasks)
__field(unsigned int, load_scale_factor)
__field(unsigned int, capacity)
__field(u64, cumulative_runnable_avg)
__field(u64, irqload)
__field(unsigned int, max_freq)
__field(unsigned int, power_cost)
__field(int, cstate)
__field(int, dstate)
),
TP_fast_assign(
__entry->cpu = rq->cpu;
__entry->idle = idle;
__entry->nr_running = rq->nr_running;
__entry->nr_big_tasks = rq->walt_stats.nr_big_tasks;
__entry->load_scale_factor =
cpu_load_scale_factor(rq->cpu);
__entry->capacity = cpu_capacity(rq->cpu);
__entry->cumulative_runnable_avg =
rq->walt_stats.cumulative_runnable_avg_scaled;
__entry->irqload = irqload;
__entry->max_freq = cpu_max_freq(rq->cpu);
__entry->power_cost = power_cost;
__entry->cstate = rq->cstate;
__entry->dstate = rq->cluster->dstate;
),
TP_printk("cpu %u idle %d nr_run %u nr_big %u lsf %u capacity %u cr_avg %llu irqload %llu fmax %u power_cost %u cstate %d dstate %d",
__entry->cpu, __entry->idle, __entry->nr_running,
__entry->nr_big_tasks, __entry->load_scale_factor,
__entry->capacity, __entry->cumulative_runnable_avg,
__entry->irqload, __entry->max_freq, __entry->power_cost,
__entry->cstate, __entry->dstate)
);
DEFINE_EVENT(sched_cpu_load, sched_cpu_load_lb,
TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost),
TP_ARGS(rq, idle, irqload, power_cost)
);
TRACE_EVENT(sched_load_to_gov,
TP_PROTO(struct rq *rq, u64 aggr_grp_load, u32 tt_load,
u64 freq_aggr_thresh, u64 load, int policy,
int big_task_rotation,
unsigned int sysctl_sched_little_cluster_coloc_fmin_khz,
u64 coloc_boost_load),
TP_ARGS(rq, aggr_grp_load, tt_load, freq_aggr_thresh, load, policy,
big_task_rotation, sysctl_sched_little_cluster_coloc_fmin_khz,
coloc_boost_load),
TP_STRUCT__entry(
__field(int, cpu)
__field(int, policy)
__field(int, ed_task_pid)
__field(u64, aggr_grp_load)
__field(u64, freq_aggr_thresh)
__field(u64, tt_load)
__field(u64, rq_ps)
__field(u64, grp_rq_ps)
__field(u64, nt_ps)
__field(u64, grp_nt_ps)
__field(u64, pl)
__field(u64, load)
__field(int, big_task_rotation)
__field(unsigned int,
sysctl_sched_little_cluster_coloc_fmin_khz)
__field(u64, coloc_boost_load)
),
TP_fast_assign(
__entry->cpu = cpu_of(rq);
__entry->policy = policy;
__entry->ed_task_pid = rq->ed_task ? rq->ed_task->pid : -1;
__entry->aggr_grp_load = aggr_grp_load;
__entry->freq_aggr_thresh = freq_aggr_thresh;
__entry->tt_load = tt_load;
__entry->rq_ps = rq->prev_runnable_sum;
__entry->grp_rq_ps = rq->grp_time.prev_runnable_sum;
__entry->nt_ps = rq->nt_prev_runnable_sum;
__entry->grp_nt_ps = rq->grp_time.nt_prev_runnable_sum;
__entry->pl =
rq->walt_stats.pred_demands_sum_scaled;
__entry->load = load;
__entry->big_task_rotation = big_task_rotation;
__entry->sysctl_sched_little_cluster_coloc_fmin_khz =
sysctl_sched_little_cluster_coloc_fmin_khz;
__entry->coloc_boost_load = coloc_boost_load;
),
TP_printk("cpu=%d policy=%d ed_task_pid=%d aggr_grp_load=%llu freq_aggr_thresh=%llu tt_load=%llu rq_ps=%llu grp_rq_ps=%llu nt_ps=%llu grp_nt_ps=%llu pl=%llu load=%llu big_task_rotation=%d sysctl_sched_little_cluster_coloc_fmin_khz=%u coloc_boost_load=%llu",
__entry->cpu, __entry->policy, __entry->ed_task_pid,
__entry->aggr_grp_load, __entry->freq_aggr_thresh,
__entry->tt_load, __entry->rq_ps, __entry->grp_rq_ps,
__entry->nt_ps, __entry->grp_nt_ps, __entry->pl, __entry->load,
__entry->big_task_rotation,
__entry->sysctl_sched_little_cluster_coloc_fmin_khz,
__entry->coloc_boost_load)
);
#endif


@@ -420,6 +420,15 @@ config IRQ_TIME_ACCOUNTING
If in doubt, say N here.
config SCHED_WALT
bool "Support window based load tracking"
depends on SMP
help
This feature will allow the scheduler to maintain a tunable window
based set of metrics for tasks and runqueues. These metrics can be
used to guide task placement as well as task frequency requirements
for cpufreq governors.
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER


@@ -811,6 +811,7 @@ void __noreturn do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
sched_exit(tsk);
/*
* Ensure that all new tsk->pi_lock acquisitions must observe
* PF_EXITING. Serializes against futex.c:attach_to_pi_owner().


@@ -2089,6 +2089,7 @@ static __latent_entropy struct task_struct *copy_process(
perf_event_free_task(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
free_task_load_ptrs(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:


@@ -21,6 +21,7 @@ obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_WALT) += walt.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o

kernel/sched/boost.c (new file, 255 lines)

@@ -0,0 +1,255 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
*/
#include "sched.h"
#include <linux/of.h>
#include <linux/sched/core_ctl.h>
#include <trace/events/sched.h>
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs with a
* higher capacity than the ones where they would normally have been placed
* based on their load characteristics. Any entity enabling boost is
* responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
static enum sched_boost_policy boost_policy;
static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
static DEFINE_MUTEX(boost_mutex);
static unsigned int freq_aggr_threshold_backup;
static int boost_refcount[MAX_NUM_BOOST_TYPE];
static inline void boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!test_and_set_bit(BOOST_KICK, &rq->walt_flags))
smp_send_reschedule(cpu);
}
static void boost_kick_cpus(void)
{
int i;
struct cpumask kick_mask;
if (boost_policy != SCHED_BOOST_ON_BIG)
return;
cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask);
for_each_cpu(i, &kick_mask) {
if (cpu_capacity(i) != max_capacity)
boost_kick(i);
}
}
int got_boost_kick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
return test_bit(BOOST_KICK, &rq->walt_flags);
}
void clear_boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
clear_bit(BOOST_KICK, &rq->walt_flags);
}
/*
* Scheduler boost type and boost policy might at first seem unrelated;
* however, there is a connection between them that allows us to use them
* interchangeably during placement decisions. We explain the connection
* here in one possible way so that the implications are clear when
* looking at placement policies.
*
* When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
* When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
* neither be none nor RESTRAINED.
*/
static void set_boost_policy(int type)
{
if (type == NO_BOOST || type == RESTRAINED_BOOST) {
boost_policy = SCHED_BOOST_NONE;
return;
}
if (boost_policy_dt) {
boost_policy = boost_policy_dt;
return;
}
if (min_possible_efficiency != max_possible_efficiency) {
boost_policy = SCHED_BOOST_ON_BIG;
return;
}
boost_policy = SCHED_BOOST_ON_ALL;
}
enum sched_boost_policy sched_boost_policy(void)
{
return boost_policy;
}
static bool verify_boost_params(int type)
{
return type >= RESTRAINED_BOOST_DISABLE && type <= RESTRAINED_BOOST;
}
static void _sched_set_boost(int type)
{
switch (type) {
case NO_BOOST: /* All boost clear */
if (boost_refcount[FULL_THROTTLE_BOOST] > 0) {
core_ctl_set_boost(false);
boost_refcount[FULL_THROTTLE_BOOST] = 0;
}
if (boost_refcount[CONSERVATIVE_BOOST] > 0) {
restore_cgroup_boost_settings();
boost_refcount[CONSERVATIVE_BOOST] = 0;
}
if (boost_refcount[RESTRAINED_BOOST] > 0) {
update_freq_aggregate_threshold(
freq_aggr_threshold_backup);
boost_refcount[RESTRAINED_BOOST] = 0;
}
break;
case FULL_THROTTLE_BOOST:
boost_refcount[FULL_THROTTLE_BOOST]++;
if (boost_refcount[FULL_THROTTLE_BOOST] == 1) {
core_ctl_set_boost(true);
restore_cgroup_boost_settings();
boost_kick_cpus();
}
break;
case CONSERVATIVE_BOOST:
boost_refcount[CONSERVATIVE_BOOST]++;
if ((boost_refcount[CONSERVATIVE_BOOST] == 1) &&
!boost_refcount[FULL_THROTTLE_BOOST]) {
update_cgroup_boost_settings();
boost_kick_cpus();
}
break;
case RESTRAINED_BOOST:
boost_refcount[RESTRAINED_BOOST]++;
if (boost_refcount[RESTRAINED_BOOST] == 1) {
freq_aggr_threshold_backup =
update_freq_aggregate_threshold(1);
}
break;
case FULL_THROTTLE_BOOST_DISABLE:
if (boost_refcount[FULL_THROTTLE_BOOST] >= 1) {
boost_refcount[FULL_THROTTLE_BOOST]--;
if (!boost_refcount[FULL_THROTTLE_BOOST]) {
core_ctl_set_boost(false);
if (boost_refcount[CONSERVATIVE_BOOST] >= 1)
update_cgroup_boost_settings();
}
}
break;
case CONSERVATIVE_BOOST_DISABLE:
if (boost_refcount[CONSERVATIVE_BOOST] >= 1) {
boost_refcount[CONSERVATIVE_BOOST]--;
if (!boost_refcount[CONSERVATIVE_BOOST])
restore_cgroup_boost_settings();
}
break;
case RESTRAINED_BOOST_DISABLE:
if (boost_refcount[RESTRAINED_BOOST] >= 1) {
boost_refcount[RESTRAINED_BOOST]--;
if (!boost_refcount[RESTRAINED_BOOST])
update_freq_aggregate_threshold(
freq_aggr_threshold_backup);
}
break;
default:
WARN_ON(1);
return;
}
/* Aggregate final boost type */
if (boost_refcount[FULL_THROTTLE_BOOST] >= 1)
type = FULL_THROTTLE_BOOST;
else if (boost_refcount[CONSERVATIVE_BOOST] >= 1)
type = CONSERVATIVE_BOOST;
else if (boost_refcount[RESTRAINED_BOOST] >= 1)
type = RESTRAINED_BOOST;
else
type = NO_BOOST;
set_boost_policy(type);
sysctl_sched_boost = type;
trace_sched_set_boost(type);
}
void sched_boost_parse_dt(void)
{
struct device_node *sn;
const char *boost_policy;
sn = of_find_node_by_path("/sched-hmp");
if (!sn)
return;
if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
if (!strcmp(boost_policy, "boost-on-big"))
boost_policy_dt = SCHED_BOOST_ON_BIG;
else if (!strcmp(boost_policy, "boost-on-all"))
boost_policy_dt = SCHED_BOOST_ON_ALL;
}
}
int sched_set_boost(int type)
{
int ret = 0;
mutex_lock(&boost_mutex);
if (verify_boost_params(type))
_sched_set_boost(type);
else
ret = -EINVAL;
mutex_unlock(&boost_mutex);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
mutex_lock(&boost_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (verify_boost_params(*data))
_sched_set_boost(*data);
else
ret = -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
int sched_boost(void)
{
return sysctl_sched_boost;
}
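
A hedged usage sketch for the refcounted interface above: a client that enables a boost type is expected to drop it again with the matching *_DISABLE value. The boost-type constants are the ones referenced by boost.c; the calling functions here are hypothetical.

static void my_client_enter_perf_mode(void)
{
	if (sched_set_boost(FULL_THROTTLE_BOOST))
		pr_warn("sched boost request rejected\n");
}

static void my_client_exit_perf_mode(void)
{
	sched_set_boost(FULL_THROTTLE_BOOST_DISABLE);
}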


@@ -18,6 +18,7 @@
#include "../smpboot.h"
#include "pelt.h"
#include "walt.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -757,6 +758,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
if (flags & DEQUEUE_SLEEP)
clear_ed_task(p, rq);
dequeue_task(rq, p, flags);
}
@@ -918,8 +922,9 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
double_rq_unlock(cpu_rq(new_cpu), rq);
rq = cpu_rq(new_cpu);
@@ -1177,12 +1182,13 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.nr_migrations++;
rseq_migrate(p);
perf_event_task_migrate(p);
fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
@@ -1299,7 +1305,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
@@ -1752,6 +1757,7 @@ void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
int cpu = smp_processor_id();
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
@@ -1759,9 +1765,18 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
&& !got_boost_kick())
return;
if (got_boost_kick()) {
struct rq *rq = cpu_rq(cpu);
if (rq->curr->sched_class == &fair_sched_class)
check_for_migration(rq, rq->curr);
clear_boost_kick(cpu);
}
/*
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
* traditionally all their work was done from the interrupt return
@@ -1934,6 +1949,36 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* accesses to the task state; see try_to_wake_up() and set_current_state().
*/
#ifdef CONFIG_SMP
#ifdef CONFIG_SCHED_WALT
/* utility function to update walt signals at wakeup */
static inline void walt_try_to_wake_up(struct task_struct *p)
{
struct rq *rq = cpu_rq(task_cpu(p));
struct rq_flags rf;
u64 wallclock;
unsigned int old_load;
struct related_thread_group *grp = NULL;
rq_lock_irqsave(rq, &rf);
old_load = task_load(p);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
note_task_waking(p, wallclock);
rq_unlock_irqrestore(rq, &rf);
rcu_read_lock();
grp = task_related_thread_group(p);
if (update_preferred_cluster(grp, p, old_load))
set_preferred_cluster(grp);
rcu_read_unlock();
}
#else
#define walt_try_to_wake_up(a) {}
#endif
#endif
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -2036,6 +2081,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
walt_try_to_wake_up(p);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2066,6 +2113,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
if (success && sched_predl) {
raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
if (do_pl_notif(cpu_rq(cpu)))
cpufreq_update_util(cpu_rq(cpu),
SCHED_CPUFREQ_WALT |
SCHED_CPUFREQ_PL);
raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
}
return success;
}
@@ -2106,11 +2161,17 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
trace_sched_waking(p);
if (!task_on_rq_queued(p)) {
u64 wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
note_task_waking(p, wallclock);
}
ttwu_do_wakeup(rq, p, 0, rf);
@@ -2157,6 +2218,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->last_sleep_ts = 0;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2305,6 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
init_new_task_load(p);
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
@@ -2408,7 +2471,9 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
@@ -2426,7 +2491,9 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
mark_task_starting(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -3053,16 +3120,30 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
u64 wallclock;
bool early_notif;
u32 old_load;
struct related_thread_group *grp;
unsigned int flag = 0;
sched_clock_tick();
rq_lock(rq, &rf);
old_load = task_load(curr);
set_window_start(rq);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
early_notif = early_detection_notify(rq, wallclock);
if (early_notif)
flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET;
cpufreq_update_util(rq, flag);
rq_unlock(rq, &rf);
perf_event_task_tick();
@@ -3071,6 +3152,15 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
rcu_read_lock();
grp = task_related_thread_group(curr);
if (update_preferred_cluster(grp, curr, old_load))
set_preferred_cluster(grp);
rcu_read_unlock();
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -3399,6 +3489,7 @@ static void __sched notrace __schedule(bool preempt)
struct rq_flags rf;
struct rq *rq;
int cpu;
u64 wallclock;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3460,7 +3551,13 @@ static void __sched notrace __schedule(bool preempt)
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
wallclock = sched_ktime_clock();
if (likely(prev != next)) {
if (!prev->on_rq)
prev->last_sleep_ts = wallclock;
update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
rq->nr_switches++;
rq->curr = next;
/*
@@ -3484,6 +3581,7 @@ static void __sched notrace __schedule(bool preempt)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
@@ -5383,10 +5481,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
@@ -5823,6 +5922,11 @@ int sched_cpu_deactivate(unsigned int cpu)
static void sched_rq_cpu_starting(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
set_window_start(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
update_max_interval();
@@ -5846,6 +5950,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -5854,6 +5959,8 @@ int sched_cpu_dying(unsigned int cpu)
BUG_ON(rq->nr_running != 1);
rq_unlock_irqrestore(rq, &rf);
clear_walt_request(cpu);
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(rq);
@@ -5878,6 +5985,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
cpus_read_unlock();
update_cluster_topology();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
@@ -5932,6 +6041,8 @@ void __init sched_init(void)
wait_bit_init();
init_clusters();
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -6047,6 +6158,8 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
rq->push_task = NULL;
walt_sched_init_rq(rq);
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -6061,6 +6174,8 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
BUG_ON(alloc_related_thread_groups());
set_load_weight(&init_task, false);
/*
@@ -6076,6 +6191,7 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
init_new_task_load(current);
calc_load_update = jiffies + LOAD_FREQ;
@@ -7071,3 +7187,49 @@ const u32 sched_prio_to_wmult[40] = {
};
#undef CREATE_TRACE_POINTS
#ifdef CONFIG_SCHED_WALT
/*
* sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field
*
* Stop accounting (exiting) task's future cpu usage
*
* We need this so that reset_all_window_stats() can function correctly.
* reset_all_window_stats() depends on the do_each_thread/for_each_thread
* task iterators to reset *all* tasks' statistics. Exiting tasks, however,
* become invisible to those iterators. sched_exit() is called on an exiting
* task prior to its removal from the task list, which lets
* reset_all_window_stats() function correctly.
*/
void sched_exit(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
u64 wallclock;
sched_set_group_id(p, 0);
rq = task_rq_lock(p, &rf);
/* rq->curr == p */
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
dequeue_task(rq, p, 0);
/*
* task's contribution is already removed from the
* cumulative window demand in dequeue. As the
* task's stats are reset, the next enqueue does
* not change the cumulative window demand.
*/
reset_task_stats(p);
p->ravg.mark_start = wallclock;
p->ravg.sum_history[0] = EXITING_TASK_MARKER;
enqueue_task(rq, p, 0);
clear_ed_task(p, rq);
task_rq_unlock(rq, p, &rf);
free_task_load_ptrs(p);
}
#endif /* CONFIG_SCHED_WALT */
__read_mostly bool sched_predl = 1;


@@ -3,6 +3,7 @@
*/
#include <linux/cpufreq_times.h>
#include "sched.h"
#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -52,11 +53,18 @@ void irqtime_account_irq(struct task_struct *curr)
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta;
int cpu;
#ifdef CONFIG_SCHED_WALT
u64 wallclock;
bool account = true;
#endif
if (!sched_clock_irqtime)
return;
cpu = smp_processor_id();
#ifdef CONFIG_SCHED_WALT
wallclock = sched_clock_cpu(cpu);
#endif
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
@@ -70,6 +78,15 @@ void irqtime_account_irq(struct task_struct *curr)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
#ifdef CONFIG_SCHED_WALT
else
account = false;
if (account)
sched_account_irqtime(cpu, curr, delta, wallclock);
else if (curr != this_cpu_ksoftirqd())
sched_account_irqstart(cpu, curr, wallclock);
#endif
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);


@@ -17,6 +17,7 @@
*/
#include "sched.h"
#include "pelt.h"
#include "walt.h"
struct dl_bandwidth def_dl_bandwidth;
@@ -1348,6 +1349,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -1362,6 +1364,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -2093,7 +2096,9 @@ static int push_dl_task(struct rq *rq)
deactivate_task(rq, next_task, 0);
sub_running_bw(&next_task->dl, &rq->dl);
sub_rq_bw(&next_task->dl, &rq->dl);
next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, later_rq->cpu);
next_task->on_rq = TASK_ON_RQ_QUEUED;
add_rq_bw(&next_task->dl, &later_rq->dl);
/*
@@ -2191,7 +2196,9 @@ static void pull_dl_task(struct rq *this_rq)
deactivate_task(src_rq, p, 0);
sub_running_bw(&p->dl, &src_rq->dl);
sub_rq_bw(&p->dl, &src_rq->dl);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
add_rq_bw(&p->dl, &this_rq->dl);
add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
@@ -2425,6 +2432,9 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
#ifdef CONFIG_SCHED_WALT
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
#endif
};
int sched_dl_global_validate(void)


@@ -24,6 +24,41 @@
#include <trace/events/sched.h>
#include "walt.h"
#ifdef CONFIG_SMP
static inline bool task_fits_max(struct task_struct *p, int cpu);
#endif /* CONFIG_SMP */
#ifdef CONFIG_SCHED_WALT
static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled);
#endif /* CONFIG_SCHED_WALT */
#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH)
static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq);
static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq,
struct task_struct *p);
static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq,
struct task_struct *p);
static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
struct cfs_rq *cfs_rq);
static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
struct cfs_rq *cfs_rq);
#else
static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {}
static inline void
walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
#define walt_inc_throttled_cfs_rq_stats(...)
#define walt_dec_throttled_cfs_rq_stats(...)
#endif
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -95,6 +130,14 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
#ifdef CONFIG_SCHED_WALT
unsigned int sysctl_sched_use_walt_cpu_util = 1;
unsigned int sysctl_sched_use_walt_task_util = 1;
__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
(10 * NSEC_PER_MSEC);
#endif
#ifdef CONFIG_SMP
/*
@@ -128,6 +171,13 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
*/
unsigned int capacity_margin = 1280;
#ifdef CONFIG_SCHED_WALT
/* 1ms default for 20ms window size scaled to 1024 */
unsigned int sysctl_sched_min_task_util_for_boost = 51;
/* 0.68ms default for 20ms window size scaled to 1024 */
unsigned int sysctl_sched_min_task_util_for_colocation = 35;
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -3637,11 +3687,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
@@ -3651,6 +3696,10 @@ static inline unsigned long _task_util_est(struct task_struct *p)
static inline unsigned long task_util_est(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
return p->ravg.demand_scaled;
#endif
return max(task_util(p), _task_util_est(p));
}
@@ -4515,13 +4564,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq);
if (qcfs_rq->load.weight)
dequeue = 0;
}
if (!se)
if (!se) {
sub_nr_running(rq, task_delta);
walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq);
}
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4555,6 +4607,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4582,13 +4635,16 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq);
if (cfs_rq_throttled(cfs_rq))
break;
}
if (!se)
if (!se) {
add_nr_running(rq, task_delta);
walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq);
}
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -4943,6 +4999,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
walt_init_cfs_rq_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4984,8 +5041,6 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
struct task_group *tg;
lockdep_assert_held(&rq->lock);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -5125,7 +5180,6 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
static unsigned long capacity_of(int cpu);
static inline bool cpu_overutilized(int cpu)
@@ -5204,6 +5258,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
walt_inc_cfs_rq_stats(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
@@ -5211,6 +5266,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
walt_inc_cfs_rq_stats(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5221,6 +5277,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
add_nr_running(rq, 1);
inc_rq_walt_stats(rq, p);
/*
* Since new tasks are assigned an initial util_avg equal to
* half of the spare capacity of their CPU, tiny tasks have the
@@ -5237,7 +5294,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
if (flags & ENQUEUE_WAKEUP)
update_overutilized_status(rq);
}
hrtick_update(rq);
@@ -5277,6 +5333,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
walt_dec_cfs_rq_stats(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5296,6 +5353,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
walt_dec_cfs_rq_stats(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5304,8 +5362,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
if (!se)
if (!se) {
sub_nr_running(rq, 1);
dec_rq_walt_stats(rq, p);
}
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
@@ -5622,16 +5682,6 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long capacity_orig_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -5661,6 +5711,15 @@ static void record_wakee(struct task_struct *p)
}
}
/*
* Externally visible function. Let's keep the one above
* so that the check is inlined/optimized in the sched paths.
*/
bool sched_is_energy_aware(void)
{
return energy_aware();
}
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
@@ -6376,58 +6435,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
/**
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
capacity_orig) as it is useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -6443,13 +6450,30 @@ static inline unsigned long cpu_util(int cpu)
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
#ifndef CONFIG_SCHED_WALT
struct cfs_rq *cfs_rq;
#endif
unsigned int util;
#ifdef CONFIG_SCHED_WALT
/*
* WALT does not decay idle tasks in the same manner
* as PELT, so it makes little sense to subtract task
* utilization from cpu utilization. Instead just use
* cpu_util for this case.
*/
if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util) &&
p->state == TASK_WAKING)
return cpu_util(cpu);
#endif
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
#ifdef CONFIG_SCHED_WALT
util = max_t(long, cpu_util(cpu) - task_util(p), 0);
#else
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
@@ -6509,6 +6533,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
util = max(util, estimated);
}
#endif
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
@@ -6574,6 +6599,9 @@ static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
if (!cpu_online(i))
continue;
if (sched_cpu_high_irqload(i))
continue;
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
@@ -7974,7 +8002,11 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
double_lock_balance(env->src_rq, env->dst_rq);
if (!(env->src_rq->clock_update_flags & RQCF_UPDATED))
update_rq_clock(env->src_rq);
set_task_cpu(p, env->dst_cpu);
double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -11164,6 +11196,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_SCHED_WALT
.fixup_walt_sched_stats = walt_fixup_sched_stats_fair,
#endif
};
#ifdef CONFIG_SCHED_DEBUG
@@ -11211,3 +11247,336 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
/* WALT sched implementation begins here */
#ifdef CONFIG_SCHED_WALT
#ifdef CONFIG_CFS_BANDWIDTH
static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq)
{
cfs_rq->walt_stats.nr_big_tasks = 0;
cfs_rq->walt_stats.cumulative_runnable_avg_scaled = 0;
cfs_rq->walt_stats.pred_demands_sum_scaled = 0;
}
static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p)
{
inc_nr_big_task(&cfs_rq->walt_stats, p);
fixup_cumulative_runnable_avg(&cfs_rq->walt_stats,
p->ravg.demand_scaled,
p->ravg.pred_demand_scaled);
}
static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p)
{
dec_nr_big_task(&cfs_rq->walt_stats, p);
fixup_cumulative_runnable_avg(&cfs_rq->walt_stats,
-(s64)p->ravg.demand_scaled,
-(s64)p->ravg.pred_demand_scaled);
}
static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
struct cfs_rq *tcfs_rq)
{
struct rq *rq = rq_of(tcfs_rq);
stats->nr_big_tasks += tcfs_rq->walt_stats.nr_big_tasks;
fixup_cumulative_runnable_avg(stats,
tcfs_rq->walt_stats.cumulative_runnable_avg_scaled,
tcfs_rq->walt_stats.pred_demands_sum_scaled);
if (stats == &rq->walt_stats)
walt_fixup_cum_window_demand(rq,
tcfs_rq->walt_stats.cumulative_runnable_avg_scaled);
}
static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
struct cfs_rq *tcfs_rq)
{
struct rq *rq = rq_of(tcfs_rq);
stats->nr_big_tasks -= tcfs_rq->walt_stats.nr_big_tasks;
fixup_cumulative_runnable_avg(stats,
-tcfs_rq->walt_stats.cumulative_runnable_avg_scaled,
-tcfs_rq->walt_stats.pred_demands_sum_scaled);
/*
* We remove the throttled cfs_rq's tasks's contribution from the
* cumulative window demand so that the same can be added
* unconditionally when the cfs_rq is unthrottled.
*/
if (stats == &rq->walt_stats)
walt_fixup_cum_window_demand(rq,
-tcfs_rq->walt_stats.cumulative_runnable_avg_scaled);
}
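For illustration only, the throttle/unthrottle bookkeeping above can be sketched standalone with plain integers; the struct and function names below are invented and not part of this patch.
/* Standalone sketch: a group's totals leave the rq on throttle and return on unthrottle. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct stats { int64_t nr_big_tasks; int64_t cumulative_runnable_avg; };

static void on_throttle(struct stats *rq, const struct stats *grp)
{
    rq->nr_big_tasks -= grp->nr_big_tasks;
    rq->cumulative_runnable_avg -= grp->cumulative_runnable_avg;
}

static void on_unthrottle(struct stats *rq, const struct stats *grp)
{
    /* added back unconditionally, as the comment above describes */
    rq->nr_big_tasks += grp->nr_big_tasks;
    rq->cumulative_runnable_avg += grp->cumulative_runnable_avg;
}

int main(void)
{
    struct stats rq = { 3, 900 }, grp = { 1, 250 };

    on_throttle(&rq, &grp);
    on_unthrottle(&rq, &grp);
    assert(rq.nr_big_tasks == 3 && rq.cumulative_runnable_avg == 900);
    printf("rq totals restored: %lld big tasks, %lld demand\n",
           (long long)rq.nr_big_tasks, (long long)rq.cumulative_runnable_avg);
    return 0;
}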
static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
s64 task_load_delta = (s64)updated_demand_scaled -
p->ravg.demand_scaled;
s64 pred_demand_delta = (s64)updated_pred_demand_scaled -
p->ravg.pred_demand_scaled;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
fixup_cumulative_runnable_avg(&cfs_rq->walt_stats,
task_load_delta,
pred_demand_delta);
if (cfs_rq_throttled(cfs_rq))
break;
}
/* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */
if (!se) {
fixup_cumulative_runnable_avg(&rq->walt_stats,
task_load_delta,
pred_demand_delta);
walt_fixup_cum_window_demand(rq, task_load_delta);
}
}
/*
* Check if task is part of a hierarchy where some cfs_rq does not have any
* runtime left.
*
* We can't rely on throttled_hierarchy() to do this test, as
* cfs_rq->throttle_count will not be updated yet when this function is called
* from scheduler_tick().
*/
static int task_will_be_throttled(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
if (!cfs_bandwidth_used())
return 0;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
if (!cfs_rq->runtime_enabled)
continue;
if (cfs_rq->runtime_remaining <= 0)
return 1;
}
return 0;
}
#else /* CONFIG_CFS_BANDWIDTH */
static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled)
{
fixup_walt_sched_stats_common(rq, p, updated_demand_scaled,
updated_pred_demand_scaled);
}
static int task_will_be_throttled(struct task_struct *p)
{
return false;
}
#endif /* CONFIG_CFS_BANDWIDTH */
static inline int
kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
{
unsigned long flags;
int rc = 0;
/* Invoke active balance to force migrate currently running task */
raw_spin_lock_irqsave(&rq->lock, flags);
if (!rq->active_balance) {
rq->active_balance = 1;
rq->push_cpu = new_cpu;
get_task_struct(p);
rq->push_task = p;
rc = 1;
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
return rc;
}
#ifdef CONFIG_SCHED_WALT
struct walt_rotate_work {
struct work_struct w;
struct task_struct *src_task;
struct task_struct *dst_task;
int src_cpu;
int dst_cpu;
};
static DEFINE_PER_CPU(struct walt_rotate_work, walt_rotate_works);
static void walt_rotate_work_func(struct work_struct *work)
{
struct walt_rotate_work *wr = container_of(work,
struct walt_rotate_work, w);
migrate_swap(wr->src_task, wr->dst_task, wr->dst_cpu, wr->src_cpu);
put_task_struct(wr->src_task);
put_task_struct(wr->dst_task);
clear_reserved(wr->src_cpu);
clear_reserved(wr->dst_cpu);
}
void walt_rotate_work_init(void)
{
int i;
for_each_possible_cpu(i) {
struct walt_rotate_work *wr = &per_cpu(walt_rotate_works, i);
INIT_WORK(&wr->w, walt_rotate_work_func);
}
}
#define WALT_ROTATION_THRESHOLD_NS 16000000
static void walt_check_for_rotation(struct rq *src_rq)
{
u64 wc, wait, max_wait = 0, run, max_run = 0;
int deserved_cpu = nr_cpu_ids, dst_cpu = nr_cpu_ids;
int i, src_cpu = cpu_of(src_rq);
struct rq *dst_rq;
struct walt_rotate_work *wr = NULL;
if (!walt_rotation_enabled)
return;
if (got_boost_kick())
return;
if (!is_min_capacity_cpu(src_cpu))
return;
wc = sched_ktime_clock();
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
if (!is_min_capacity_cpu(i))
break;
if (is_reserved(i))
continue;
if (!rq->misfit_task_load || rq->curr->sched_class !=
&fair_sched_class)
continue;
wait = wc - rq->curr->last_enqueued_ts;
if (wait > max_wait) {
max_wait = wait;
deserved_cpu = i;
}
}
if (deserved_cpu != src_cpu)
return;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
if (is_min_capacity_cpu(i))
continue;
if (is_reserved(i))
continue;
if (rq->curr->sched_class != &fair_sched_class)
continue;
if (rq->nr_running > 1)
continue;
run = wc - rq->curr->last_enqueued_ts;
if (run < WALT_ROTATION_THRESHOLD_NS)
continue;
if (run > max_run) {
max_run = run;
dst_cpu = i;
}
}
if (dst_cpu == nr_cpu_ids)
return;
dst_rq = cpu_rq(dst_cpu);
double_rq_lock(src_rq, dst_rq);
if (dst_rq->curr->sched_class == &fair_sched_class) {
get_task_struct(src_rq->curr);
get_task_struct(dst_rq->curr);
mark_reserved(src_cpu);
mark_reserved(dst_cpu);
wr = &per_cpu(walt_rotate_works, src_cpu);
wr->src_task = src_rq->curr;
wr->dst_task = dst_rq->curr;
wr->src_cpu = src_cpu;
wr->dst_cpu = dst_cpu;
}
double_rq_unlock(src_rq, dst_rq);
if (wr)
queue_work_on(src_cpu, system_highpri_wq, &wr->w);
}
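For illustration, the two selection passes above (the little CPU whose misfit task has waited longest, and a lightly loaded big CPU whose fair task has run past the 16 ms threshold) can be sketched standalone; the per-CPU data and helper names below are invented.
/* Standalone sketch of the rotation candidate selection; data is made up. */
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS                 8
#define ROTATION_THRESHOLD_NS   16000000ULL    /* same 16 ms notion as above */

struct cpu_state {
    int is_little;
    int has_misfit_fair_task;   /* currently running a misfit CFS task */
    int nr_running;
    uint64_t last_enqueued_ts;
};

int main(void)
{
    struct cpu_state cpu[NR_CPUS] = {
        /* little CPUs 0-3, big CPUs 4-7 */
        [0] = { 1, 1, 2, 100 }, [1] = { 1, 1, 2,  40 },
        [2] = { 1, 0, 1, 500 }, [3] = { 1, 1, 3,  10 },
        [4] = { 0, 0, 1,  20 }, [5] = { 0, 0, 1,   5 },
        [6] = { 0, 0, 2,  90 }, [7] = { 0, 0, 1, 300 },
    };
    uint64_t now = 20000000ULL + 1000;  /* pretend wallclock, ns */
    uint64_t max_wait = 0, max_run = 0;
    int src = -1, dst = -1, i;

    /* pass 1: little CPU whose misfit task has waited the longest */
    for (i = 0; i < NR_CPUS; i++) {
        if (!cpu[i].is_little || !cpu[i].has_misfit_fair_task)
            continue;
        if (now - cpu[i].last_enqueued_ts > max_wait) {
            max_wait = now - cpu[i].last_enqueued_ts;
            src = i;
        }
    }

    /* pass 2: lightly loaded big CPU whose task has run past the threshold */
    for (i = 0; i < NR_CPUS; i++) {
        uint64_t run = now - cpu[i].last_enqueued_ts;

        if (cpu[i].is_little || cpu[i].nr_running > 1)
            continue;
        if (run < ROTATION_THRESHOLD_NS || run <= max_run)
            continue;
        max_run = run;
        dst = i;
    }

    printf("would swap tasks between CPU%d and CPU%d\n", src, dst);
    return 0;
}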
#else
static inline void walt_check_for_rotation(struct rq *rq)
{
}
#endif
static DEFINE_RAW_SPINLOCK(migration_lock);
void check_for_migration(struct rq *rq, struct task_struct *p)
{
int active_balance;
int new_cpu = -1;
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
if (rq->misfit_task_load) {
if (rq->curr->state != TASK_RUNNING ||
rq->curr->nr_cpus_allowed == 1)
return;
if (task_will_be_throttled(p))
return;
raw_spin_lock(&migration_lock);
rcu_read_lock();
new_cpu = find_energy_efficient_cpu(p, prev_cpu, 0);
rcu_read_unlock();
if ((new_cpu != -1) &&
(capacity_orig_of(new_cpu) > capacity_orig_of(cpu))) {
active_balance = kick_active_balance(rq, p, new_cpu);
if (active_balance) {
mark_reserved(new_cpu);
raw_spin_unlock(&migration_lock);
stop_one_cpu_nowait(cpu,
active_load_balance_cpu_stop, rq,
&rq->active_balance_work);
return;
}
} else {
walt_check_for_rotation(rq);
}
raw_spin_unlock(&migration_lock);
}
}
#endif /* CONFIG_SCHED_WALT */


@@ -6,6 +6,7 @@
#include "sched.h"
#include "pelt.h"
#include "walt.h"
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@@ -2409,6 +2410,9 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
#ifdef CONFIG_SCHED_WALT
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
#endif
};
#ifdef CONFIG_RT_GROUP_SCHED


@@ -86,6 +86,72 @@
struct rq;
struct cpuidle_state;
extern __read_mostly bool sched_predl;
#ifdef CONFIG_SCHED_WALT
extern unsigned int sched_ravg_window;
extern unsigned int walt_cpu_util_freq_divisor;
struct walt_sched_stats {
int nr_big_tasks;
u64 cumulative_runnable_avg_scaled;
u64 pred_demands_sum_scaled;
};
struct cpu_cycle {
u64 cycles;
u64 time;
};
struct group_cpu_time {
u64 curr_runnable_sum;
u64 prev_runnable_sum;
u64 nt_curr_runnable_sum;
u64 nt_prev_runnable_sum;
};
struct load_subtractions {
u64 window_start;
u64 subs;
u64 new_subs;
};
#define NUM_TRACKED_WINDOWS 2
#define NUM_LOAD_INDICES 1000
struct sched_cluster {
raw_spinlock_t load_lock;
struct list_head list;
struct cpumask cpus;
int id;
int max_power_cost;
int min_power_cost;
int max_possible_capacity;
int capacity;
int efficiency; /* Differentiate cpus with different IPC capability */
int load_scale_factor;
unsigned int exec_scale_factor;
/*
* max_freq = user maximum
* max_mitigated_freq = thermal defined maximum
* max_possible_freq = maximum supported by hardware
*/
unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq;
unsigned int max_possible_freq;
bool freq_init_done;
int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
unsigned int static_cluster_pwr_cost;
int notifier_sent;
bool wake_up_idle;
u64 aggr_grp_load;
u64 coloc_boost_load;
};
extern unsigned int sched_disable_window_stats;
extern struct timer_list sched_grp_timer;
#endif /* CONFIG_SCHED_WALT */
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -556,6 +622,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SCHED_WALT
struct walt_sched_stats walt_stats;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
int expires_seq;
@@ -885,6 +955,7 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
struct task_struct *push_task;
struct cpu_stop_work active_balance_work;
/* CPU of this runqueue: */
@@ -906,6 +977,42 @@ struct rq {
u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_SCHED_WALT
struct sched_cluster *cluster;
struct cpumask freq_domain_cpumask;
struct walt_sched_stats walt_stats;
int cstate, wakeup_latency, wakeup_energy;
u64 window_start;
s64 cum_window_start;
unsigned long walt_flags;
u64 cur_irqload;
u64 avg_irqload;
u64 irqload_ts;
unsigned int static_cpu_pwr_cost;
struct task_struct *ed_task;
struct cpu_cycle cc;
u64 old_busy_time, old_busy_time_group;
u64 old_estimated_time;
u64 curr_runnable_sum;
u64 prev_runnable_sum;
u64 nt_curr_runnable_sum;
u64 nt_prev_runnable_sum;
u64 cum_window_demand_scaled;
struct group_cpu_time grp_time;
struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
u8 *top_tasks[NUM_TRACKED_WINDOWS];
u8 curr_table;
int prev_top;
int curr_top;
bool notif_pending;
u64 last_cc_update;
u64 cycles;
#endif /* CONFIG_SCHED_WALT */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -1137,8 +1244,6 @@ enum numa_faults_stats {
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
@@ -1147,6 +1252,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
}
#endif /* CONFIG_NUMA_BALANCING */
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
#ifdef CONFIG_SMP
static inline void
@@ -1602,8 +1710,15 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
#ifdef CONFIG_SCHED_WALT
void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled);
#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
prev->sched_class->put_prev_task(rq, prev);
@@ -1661,6 +1776,10 @@ static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
static inline int idle_get_state_idx(struct rq *rq)
{
WARN_ON(!rcu_read_lock_held());
if (rq->nr_running || cpu_of(rq) == raw_smp_processor_id())
return -1;
return rq->idle_state_idx;
}
#else
@@ -1807,6 +1926,15 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
#ifdef CONFIG_SCHED_WALT
u64 sched_ktime_clock(void);
#else
static inline u64 sched_ktime_clock(void)
{
return 0;
}
#endif
#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
@@ -1824,6 +1952,193 @@ unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
}
#endif
#ifdef CONFIG_SMP
static inline unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
static inline unsigned long capacity_orig_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int walt_disabled;
static inline unsigned long task_util(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
return p->ravg.demand_scaled;
#endif
return READ_ONCE(p->se.avg.util_avg);
}
/**
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
capacity_orig) as it is useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;
#ifdef CONFIG_SCHED_WALT
if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) {
u64 walt_cpu_util =
cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled;
return min_t(unsigned long, walt_cpu_util,
capacity_orig_of(cpu));
}
#endif
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
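A standalone arithmetic sketch of the selection above, with WALT's clamped window demand on one side and PELT's util_avg/util_est maximum on the other; the inputs are invented.
/* Illustrative only; the inputs below are invented numbers. */
#include <stdio.h>

#define CAPACITY_ORIG   1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

static unsigned long max_ul(unsigned long a, unsigned long b)
{
    return a > b ? a : b;
}

static unsigned long cpu_util_sketch(int use_walt, unsigned long walt_demand,
                                     unsigned long util_avg,
                                     unsigned long util_est)
{
    if (use_walt)
        return min_ul(walt_demand, CAPACITY_ORIG);
    return min_ul(max_ul(util_avg, util_est), CAPACITY_ORIG);
}

int main(void)
{
    printf("WALT: %lu\n", cpu_util_sketch(1, 1300, 0, 0));  /* clamped to 1024 */
    printf("PELT: %lu\n", cpu_util_sketch(0, 0, 340, 410)); /* util_est wins: 410 */
    return 0;
}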
struct sched_walt_cpu_load {
unsigned long prev_window_util;
unsigned long nl;
unsigned long pl;
u64 ws;
};
static inline unsigned long cpu_util_cum(int cpu, int delta)
{
u64 util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
#ifdef CONFIG_SCHED_WALT
if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
util = cpu_rq(cpu)->cum_window_demand_scaled;
#endif
delta += util;
if (delta < 0)
return 0;
return (delta >= capacity) ? capacity : delta;
}
#ifdef CONFIG_SCHED_WALT
u64 freq_policy_load(struct rq *rq);
extern u64 walt_load_reported_window;
static inline unsigned long
cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load)
{
u64 util, util_unboosted;
struct rq *rq = cpu_rq(cpu);
unsigned long capacity = capacity_orig_of(cpu);
int boost;
if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
return cpu_util(cpu);
boost = per_cpu(sched_load_boost, cpu);
util_unboosted = util = freq_policy_load(rq);
util = div64_u64(util * (100 + boost),
walt_cpu_util_freq_divisor);
if (walt_load) {
u64 nl = cpu_rq(cpu)->nt_prev_runnable_sum +
rq->grp_time.nt_prev_runnable_sum;
u64 pl = rq->walt_stats.pred_demands_sum_scaled;
/* do_pl_notif() needs unboosted signals */
rq->old_busy_time = div64_u64(util_unboosted,
sched_ravg_window >>
SCHED_CAPACITY_SHIFT);
rq->old_estimated_time = pl;
nl = div64_u64(nl * (100 + boost),
walt_cpu_util_freq_divisor);
pl = div64_u64(pl * (100 + boost), 100);
walt_load->prev_window_util = util;
walt_load->nl = nl;
walt_load->pl = pl;
walt_load->ws = walt_load_reported_window;
}
return (util >= capacity) ? capacity : util;
}
static inline unsigned long
cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load)
{
return cpu_util_freq_walt(cpu, walt_load);
}
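A worked example of the boost scaling in cpu_util_freq_walt(), assuming a 20 ms window and assuming walt_cpu_util_freq_divisor equals (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100 so that a fully busy, unboosted window maps to full capacity; the divisor is set up in walt.c, so that relation is an assumption here.
/* Worked example with assumed values; not taken from this patch. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10

int main(void)
{
    uint64_t sched_ravg_window = 20000000ULL;   /* assumed 20 ms window */
    uint64_t divisor = (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100;
    uint64_t freq_policy_load = 10000000ULL;    /* half a window busy */
    int boost = 10;                             /* assumed sched_load_boost = 10% */

    uint64_t util = freq_policy_load * (100 + boost) / divisor;

    /* 10 ms of a 20 ms window -> ~512, plus 10% boost -> ~563 */
    printf("util = %" PRIu64 "\n", util);
    return 0;
}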
#else
static inline unsigned long
cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load)
{
return cpu_util(cpu);
}
#define sched_ravg_window TICK_NSEC
#define sysctl_sched_use_walt_cpu_util 0
#endif /* CONFIG_SCHED_WALT */
extern unsigned int capacity_margin_freq;
static inline unsigned long
add_capacity_margin(unsigned long cpu_capacity, int cpu)
{
cpu_capacity = cpu_capacity * capacity_margin_freq *
(100 + per_cpu(sched_load_boost, cpu));
cpu_capacity /= 100;
cpu_capacity /= SCHED_CAPACITY_SCALE;
return cpu_capacity;
}
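A worked example of add_capacity_margin(), assuming capacity_margin_freq = 1280 (roughly a 1.25x margin); that value is an assumption, the real default is set elsewhere.
/* Worked example; capacity_margin_freq = 1280 is an assumed value. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024UL

int main(void)
{
    unsigned long capacity_margin_freq = 1280;  /* assumed ~1.25x margin */
    unsigned long cpu_capacity = 600;
    int boost = 0;                              /* per-CPU sched_load_boost */

    unsigned long margined = cpu_capacity * capacity_margin_freq *
                             (100 + boost);
    margined /= 100;
    margined /= SCHED_CAPACITY_SCALE;

    /* 600 * 1280 / 1024 = 750 with no boost */
    printf("600 -> %lu\n", margined);
    return 0;
}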
#endif
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
@@ -2218,6 +2533,11 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
#ifdef CONFIG_SCHED_WALT
if (!(flags & SCHED_CPUFREQ_WALT))
return;
#endif
data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
cpu_of(rq)));
if (data)
@@ -2333,3 +2653,601 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#ifdef CONFIG_SMP
extern struct static_key_false sched_energy_present;
#endif
enum sched_boost_policy {
SCHED_BOOST_NONE,
SCHED_BOOST_ON_BIG,
SCHED_BOOST_ON_ALL,
};
#define NO_BOOST 0
#define FULL_THROTTLE_BOOST 1
#define CONSERVATIVE_BOOST 2
#define RESTRAINED_BOOST 3
/*
* Returns the rq capacity of any rq in a group. This does not play
* well with groups where rq capacity can change independently.
*/
#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
#ifdef CONFIG_SCHED_WALT
static inline int cluster_first_cpu(struct sched_cluster *cluster)
{
return cpumask_first(&cluster->cpus);
}
struct related_thread_group {
int id;
raw_spinlock_t lock;
struct list_head tasks;
struct list_head list;
struct sched_cluster *preferred_cluster;
struct rcu_head rcu;
u64 last_update;
};
extern struct list_head cluster_head;
extern struct sched_cluster *sched_cluster[NR_CPUS];
#define for_each_sched_cluster(cluster) \
list_for_each_entry_rcu(cluster, &cluster_head, list)
#define WINDOW_STATS_RECENT 0
#define WINDOW_STATS_MAX 1
#define WINDOW_STATS_MAX_RECENT_AVG 2
#define WINDOW_STATS_AVG 3
#define WINDOW_STATS_INVALID_POLICY 4
#define SCHED_UPMIGRATE_MIN_NICE 15
#define EXITING_TASK_MARKER 0xdeaddead
#define UP_MIGRATION 1
#define DOWN_MIGRATION 2
#define IRQLOAD_MIGRATION 3
extern struct mutex policy_mutex;
extern unsigned int sched_disable_window_stats;
extern unsigned int max_possible_freq;
extern unsigned int min_max_freq;
extern unsigned int max_possible_efficiency;
extern unsigned int min_possible_efficiency;
extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
extern unsigned int min_max_possible_capacity;
extern unsigned int max_power_cost;
extern unsigned int __read_mostly sched_init_task_load_windows;
extern unsigned int up_down_migrate_scale_factor;
extern unsigned int sysctl_sched_restrict_cluster_spill;
extern unsigned int sched_pred_alert_load;
extern struct sched_cluster init_cluster;
extern unsigned int __read_mostly sched_short_sleep_task_threshold;
extern unsigned int __read_mostly sched_long_cpu_selection_threshold;
extern unsigned int __read_mostly sched_big_waker_task_load;
extern unsigned int __read_mostly sched_small_wakee_task_load;
extern unsigned int __read_mostly sched_spill_load;
extern unsigned int __read_mostly sched_upmigrate;
extern unsigned int __read_mostly sched_downmigrate;
extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
extern unsigned int __read_mostly sched_load_granule;
extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
extern int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load);
extern void set_preferred_cluster(struct related_thread_group *grp);
extern void add_new_task_to_grp(struct task_struct *new);
extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
#define NO_BOOST 0
#define FULL_THROTTLE_BOOST 1
#define CONSERVATIVE_BOOST 2
#define RESTRAINED_BOOST 3
#define FULL_THROTTLE_BOOST_DISABLE -1
#define CONSERVATIVE_BOOST_DISABLE -2
#define RESTRAINED_BOOST_DISABLE -3
#define MAX_NUM_BOOST_TYPE (RESTRAINED_BOOST+1)
static inline int cpu_capacity(int cpu)
{
return cpu_rq(cpu)->cluster->capacity;
}
static inline int cpu_max_possible_capacity(int cpu)
{
return cpu_rq(cpu)->cluster->max_possible_capacity;
}
static inline int cpu_load_scale_factor(int cpu)
{
return cpu_rq(cpu)->cluster->load_scale_factor;
}
static inline int cpu_efficiency(int cpu)
{
return cpu_rq(cpu)->cluster->efficiency;
}
static inline unsigned int cpu_min_freq(int cpu)
{
return cpu_rq(cpu)->cluster->min_freq;
}
static inline unsigned int cluster_max_freq(struct sched_cluster *cluster)
{
/*
* Governor and thermal driver don't know the other party's mitigation
* voting. So struct cluster saves both and returns min() for the current
* cluster fmax.
*/
return min(cluster->max_mitigated_freq, cluster->max_freq);
}
static inline unsigned int cpu_max_freq(int cpu)
{
return cluster_max_freq(cpu_rq(cpu)->cluster);
}
static inline unsigned int cpu_max_possible_freq(int cpu)
{
return cpu_rq(cpu)->cluster->max_possible_freq;
}
/* Keep track of max/min capacity possible across CPUs "currently" */
static inline void __update_min_max_capacity(void)
{
int i;
int max_cap = 0, min_cap = INT_MAX;
for_each_online_cpu(i) {
max_cap = max(max_cap, cpu_capacity(i));
min_cap = min(min_cap, cpu_capacity(i));
}
max_capacity = max_cap;
min_capacity = min_cap;
}
/*
* Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
* that "most" efficient cpu gets a load_scale_factor of 1
*/
static inline unsigned long
load_scale_cpu_efficiency(struct sched_cluster *cluster)
{
return DIV_ROUND_UP(1024 * max_possible_efficiency,
cluster->efficiency);
}
/*
* Return load_scale_factor of a cpu in reference to cpu with best max_freq
* (max_possible_freq), so that one with best max_freq gets a load_scale_factor
* of 1.
*/
static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
{
return DIV_ROUND_UP(1024 * max_possible_freq,
cluster_max_freq(cluster));
}
static inline int compute_load_scale_factor(struct sched_cluster *cluster)
{
int load_scale = 1024;
/*
* load_scale_factor accounts for the fact that task load
* is in reference to "best" performing cpu. Task's load will need to be
* scaled (up) by a factor to determine suitability to be placed on a
* (little) cpu.
*/
load_scale *= load_scale_cpu_efficiency(cluster);
load_scale >>= 10;
load_scale *= load_scale_cpu_freq(cluster);
load_scale >>= 10;
return load_scale;
}
static inline int cpu_max_power_cost(int cpu)
{
return cpu_rq(cpu)->cluster->max_power_cost;
}
static inline int cpu_min_power_cost(int cpu)
{
return cpu_rq(cpu)->cluster->min_power_cost;
}
static inline bool hmp_capable(void)
{
return max_possible_capacity != min_max_possible_capacity;
}
static inline bool is_max_capacity_cpu(int cpu)
{
return cpu_max_possible_capacity(cpu) == max_possible_capacity;
}
static inline bool is_min_capacity_cpu(int cpu)
{
return cpu_max_possible_capacity(cpu) == min_max_possible_capacity;
}
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
* in reference to "best cpu".
*/
static inline u64 scale_load_to_cpu(u64 task_load, int cpu)
{
u64 lsf = cpu_load_scale_factor(cpu);
if (lsf != 1024) {
task_load *= lsf;
task_load /= 1024;
}
return task_load;
}
/*
* Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
* least efficient cpu gets capacity of 1024
*/
static unsigned long
capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
{
return (1024 * cluster->efficiency) / min_possible_efficiency;
}
/*
* Return 'capacity' of a cpu in reference to cpu with lowest max_freq
* (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
*/
static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
{
return (1024 * cluster_max_freq(cluster)) / min_max_freq;
}
static inline int compute_capacity(struct sched_cluster *cluster)
{
int capacity = 1024;
capacity *= capacity_scale_cpu_efficiency(cluster);
capacity >>= 10;
capacity *= capacity_scale_cpu_freq(cluster);
capacity >>= 10;
return capacity;
}
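A worked example tying together compute_load_scale_factor(), scale_load_to_cpu() and compute_capacity() for an invented two-cluster system; all efficiencies and frequencies below are made up.
/* Worked example with invented efficiencies/frequencies. */
#include <stdio.h>

/* DIV_ROUND_UP as used by the helpers above */
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
    /* invented system: a little cluster and a big cluster */
    unsigned int little_eff = 1024, big_eff = 2048;
    unsigned int little_fmax = 1800000, big_fmax = 2400000;
    unsigned int max_possible_efficiency = big_eff;
    unsigned int min_possible_efficiency = little_eff;
    unsigned int max_possible_freq = big_fmax;
    unsigned int min_max_freq = little_fmax;

    /* load_scale_factor of the little cluster (reference: best cluster) */
    unsigned long lsf = 1024;
    lsf *= DIV_ROUND_UP(1024 * max_possible_efficiency, little_eff);
    lsf >>= 10;
    lsf *= DIV_ROUND_UP(1024 * max_possible_freq, little_fmax);
    lsf >>= 10;

    /* capacity of the big cluster (reference: weakest cluster) */
    unsigned long cap = 1024;
    cap *= (1024 * big_eff) / min_possible_efficiency;
    cap >>= 10;
    cap *= (1024 * big_fmax) / min_max_freq;
    cap >>= 10;

    /* a 10 ms demand on the best CPU scales up on a little CPU */
    unsigned long long task_load = 10000000ULL;
    printf("little lsf=%lu big cap=%lu scaled load=%llu\n",
           lsf, cap, task_load * lsf / 1024);
    return 0;
}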
static inline unsigned int task_load(struct task_struct *p)
{
return p->ravg.demand;
}
static inline unsigned int task_pl(struct task_struct *p)
{
return p->ravg.pred_demand;
}
#define pct_to_real(tunable) \
(div64_u64((u64)tunable * (u64)max_task_load(), 100))
#define real_to_pct(tunable) \
(div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
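A quick worked example of pct_to_real()/real_to_pct(), assuming a 20 ms window for max_task_load().
/* Worked example; the 20 ms window length is an assumption. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t max_task_load = 20000000ULL;   /* assumed sched_ravg_window */
    uint64_t pct = 90;                      /* e.g. an upmigrate tunable */

    uint64_t real = pct * max_task_load / 100;      /* pct_to_real */
    uint64_t back = real * 100 / max_task_load;     /* real_to_pct */

    printf("90%% of the window = %" PRIu64 " ns (%" PRIu64 "%%)\n", real, back);
    return 0;
}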
static inline bool task_in_related_thread_group(struct task_struct *p)
{
return !!(rcu_access_pointer(p->grp) != NULL);
}
static inline
struct related_thread_group *task_related_thread_group(struct task_struct *p)
{
return rcu_dereference(p->grp);
}
/* Is frequency of two cpus synchronized with each other? */
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
struct rq *rq = cpu_rq(src_cpu);
if (src_cpu == dst_cpu)
return 1;
return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
}
#define BOOST_KICK 0
#define CPU_RESERVED 1
extern int sched_boost(void);
extern int preferred_cluster(struct sched_cluster *cluster,
struct task_struct *p);
extern struct sched_cluster *rq_cluster(struct rq *rq);
extern void reset_task_stats(struct task_struct *p);
extern void clear_top_tasks_bitmap(unsigned long *bitmap);
#if defined(CONFIG_SCHED_TUNE)
extern bool task_sched_boost(struct task_struct *p);
extern int sync_cgroup_colocation(struct task_struct *p, bool insert);
extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2);
extern void update_cgroup_boost_settings(void);
extern void restore_cgroup_boost_settings(void);
#else
static inline bool
same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
{
return true;
}
static inline bool task_sched_boost(struct task_struct *p)
{
return true;
}
static inline void update_cgroup_boost_settings(void) { }
static inline void restore_cgroup_boost_settings(void) { }
#endif
extern int alloc_related_thread_groups(void);
extern unsigned long all_cluster_ids[];
extern void check_for_migration(struct rq *rq, struct task_struct *p);
static inline int is_reserved(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return test_bit(CPU_RESERVED, &rq->walt_flags);
}
static inline int mark_reserved(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return test_and_set_bit(CPU_RESERVED, &rq->walt_flags);
}
static inline void clear_reserved(int cpu)
{
struct rq *rq = cpu_rq(cpu);
clear_bit(CPU_RESERVED, &rq->walt_flags);
}
static inline bool
task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
{
return cpu_of(rq) == task_cpu(p) && (p->on_rq || p->last_sleep_ts >=
rq->window_start);
}
static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta)
{
rq->cum_window_demand_scaled += scaled_delta;
if (unlikely((s64)rq->cum_window_demand_scaled < 0))
rq->cum_window_demand_scaled = 0;
}
extern void update_cpu_cluster_capacity(const cpumask_t *cpus);
extern unsigned long thermal_cap(int cpu);
extern void clear_walt_request(int cpu);
extern int got_boost_kick(void);
extern void clear_boost_kick(int cpu);
extern enum sched_boost_policy sched_boost_policy(void);
extern void sched_boost_parse_dt(void);
extern void clear_ed_task(struct task_struct *p, struct rq *rq);
extern bool early_detection_notify(struct rq *rq, u64 wallclock);
static inline unsigned int power_cost(int cpu, u64 demand)
{
return cpu_max_possible_capacity(cpu);
}
void note_task_waking(struct task_struct *p, u64 wallclock);
static inline bool task_placement_boost_enabled(struct task_struct *p)
{
if (task_sched_boost(p))
return sched_boost_policy() != SCHED_BOOST_NONE;
return false;
}
static inline enum sched_boost_policy task_boost_policy(struct task_struct *p)
{
enum sched_boost_policy policy = task_sched_boost(p) ?
sched_boost_policy() :
SCHED_BOOST_NONE;
if (policy == SCHED_BOOST_ON_BIG) {
/*
* Filter out tasks less than min task util threshold
* under conservative boost.
*/
if (sysctl_sched_boost == CONSERVATIVE_BOOST &&
task_util(p) <=
sysctl_sched_min_task_util_for_boost)
policy = SCHED_BOOST_NONE;
}
return policy;
}
extern void walt_map_freq_to_load(void);
static inline bool is_min_capacity_cluster(struct sched_cluster *cluster)
{
return is_min_capacity_cpu(cluster_first_cpu(cluster));
}
#else /* CONFIG_SCHED_WALT */
struct walt_sched_stats;
struct related_thread_group;
struct sched_cluster;
static inline bool task_sched_boost(struct task_struct *p)
{
return false;
}
static inline bool task_placement_boost_enabled(struct task_struct *p)
{
return false;
}
static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
static inline int sched_boost(void)
{
return 0;
}
static inline enum sched_boost_policy task_boost_policy(struct task_struct *p)
{
return SCHED_BOOST_NONE;
}
static inline bool
task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
{
return false;
}
static inline bool hmp_capable(void) { return false; }
static inline bool is_max_capacity_cpu(int cpu) { return true; }
static inline bool is_min_capacity_cpu(int cpu) { return true; }
static inline int
preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
{
return 1;
}
static inline struct sched_cluster *rq_cluster(struct rq *rq)
{
return NULL;
}
static inline u64 scale_load_to_cpu(u64 load, int cpu)
{
return load;
}
#ifdef CONFIG_SMP
static inline int cpu_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void set_preferred_cluster(struct related_thread_group *grp) { }
static inline bool task_in_related_thread_group(struct task_struct *p)
{
return false;
}
static inline
struct related_thread_group *task_related_thread_group(struct task_struct *p)
{
return NULL;
}
static inline u32 task_load(struct task_struct *p) { return 0; }
static inline u32 task_pl(struct task_struct *p) { return 0; }
static inline int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load)
{
return 0;
}
static inline void add_new_task_to_grp(struct task_struct *new) {}
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
return 1;
}
static inline void clear_reserved(int cpu) { }
static inline int alloc_related_thread_groups(void) { return 0; }
#define trace_sched_cpu_load(...)
#define trace_sched_cpu_load_lb(...)
#define trace_sched_cpu_load_cgroup(...)
#define trace_sched_cpu_load_wakeup(...)
static inline void walt_fixup_cum_window_demand(struct rq *rq,
s64 scaled_delta) { }
static inline void update_cpu_cluster_capacity(const cpumask_t *cpus) { }
#ifdef CONFIG_SMP
static inline unsigned long thermal_cap(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
#endif
static inline void clear_walt_request(int cpu) { }
static inline int is_reserved(int cpu)
{
return 0;
}
static inline int got_boost_kick(void)
{
return 0;
}
static inline void clear_boost_kick(int cpu) { }
static inline enum sched_boost_policy sched_boost_policy(void)
{
return SCHED_BOOST_NONE;
}
static inline void sched_boost_parse_dt(void) { }
static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
{
return 0;
}
#ifdef CONFIG_SMP
static inline unsigned int power_cost(int cpu, u64 demand)
{
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
static inline void walt_map_freq_to_load(void) { }
#endif /* CONFIG_SCHED_WALT */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
static inline bool energy_aware(void)
{
return sysctl_sched_energy_aware;
}
#else
static inline bool energy_aware(void)
{
return 0;
}
#endif


@@ -8,6 +8,7 @@
* See kernel/stop_machine.c
*/
#include "sched.h"
#include "walt.h"
#ifdef CONFIG_SMP
static int
@@ -43,12 +44,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
@@ -143,4 +146,7 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
#ifdef CONFIG_SCHED_WALT
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
#endif
};


@@ -89,6 +89,33 @@ struct schedtune {
/* Boost value for tasks on that SchedTune CGroup */
int boost;
#ifdef CONFIG_SCHED_WALT
/* Toggle ability to override sched boost enabled */
bool sched_boost_no_override;
/*
* Controls whether a cgroup is eligible for sched boost or not. This
* can temporarily be disabled by the kernel based on the no_override
* flag above.
*/
bool sched_boost_enabled;
/*
* This tracks the default value of sched_boost_enabled and is used
* to restore the value following any temporary changes to that flag.
*/
bool sched_boost_enabled_backup;
/*
* Controls whether tasks of this cgroup should be colocated with each
* other and tasks of other cgroups that have the same flag turned on.
*/
bool colocate;
/* Controls whether further updates are allowed to the colocate flag */
bool colocate_update_disabled;
#endif /* CONFIG_SCHED_WALT */
/* Hint to bias scheduling of tasks on that SchedTune CGroup
* towards idle CPUs */
int prefer_idle;
@@ -121,6 +148,13 @@ static inline struct schedtune *parent_st(struct schedtune *st)
static struct schedtune
root_schedtune = {
.boost = 0,
#ifdef CONFIG_SCHED_WALT
.sched_boost_no_override = false,
.sched_boost_enabled = true,
.sched_boost_enabled_backup = true,
.colocate = false,
.colocate_update_disabled = false,
#endif
.prefer_idle = 0,
};
@@ -172,6 +206,77 @@ struct boost_groups {
/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
#ifdef CONFIG_SCHED_WALT
static inline void init_sched_boost(struct schedtune *st)
{
st->sched_boost_no_override = false;
st->sched_boost_enabled = true;
st->sched_boost_enabled_backup = st->sched_boost_enabled;
st->colocate = false;
st->colocate_update_disabled = false;
}
bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
{
return task_schedtune(tsk1) == task_schedtune(tsk2);
}
void update_cgroup_boost_settings(void)
{
int i;
for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
if (!allocated_group[i])
break;
if (allocated_group[i]->sched_boost_no_override)
continue;
allocated_group[i]->sched_boost_enabled = false;
}
}
void restore_cgroup_boost_settings(void)
{
int i;
for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
if (!allocated_group[i])
break;
allocated_group[i]->sched_boost_enabled =
allocated_group[i]->sched_boost_enabled_backup;
}
}
bool task_sched_boost(struct task_struct *p)
{
struct schedtune *st = task_schedtune(p);
return st->sched_boost_enabled;
}
static u64
sched_boost_override_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->sched_boost_no_override;
}
static int sched_boost_override_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 override)
{
struct schedtune *st = css_st(css);
st->sched_boost_no_override = !!override;
return 0;
}
#endif /* CONFIG_SCHED_WALT */
static inline bool schedtune_boost_timeout(u64 now, u64 ts)
{
return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
@@ -422,6 +527,53 @@ int schedtune_can_attach(struct cgroup_taskset *tset)
return 0;
}
#ifdef CONFIG_SCHED_WALT
static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->sched_boost_enabled;
}
static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 enable)
{
struct schedtune *st = css_st(css);
st->sched_boost_enabled = !!enable;
st->sched_boost_enabled_backup = st->sched_boost_enabled;
return 0;
}
static u64 sched_colocate_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->colocate;
}
static int sched_colocate_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 colocate)
{
struct schedtune *st = css_st(css);
if (st->colocate_update_disabled)
return -EPERM;
st->colocate = !!colocate;
st->colocate_update_disabled = true;
return 0;
}
#else /* CONFIG_SCHED_WALT */
static inline void init_sched_boost(struct schedtune *st) { }
#endif /* CONFIG_SCHED_WALT */
void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
/* This can happen only if SchedTune controller is mounted with
@@ -535,6 +687,28 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
return st->boost;
}
#ifdef CONFIG_SCHED_WALT
static void schedtune_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
struct schedtune *st;
bool colocate;
cgroup_taskset_first(tset, &css);
st = css_st(css);
colocate = st->colocate;
cgroup_taskset_for_each(task, css, tset)
sync_cgroup_colocation(task, colocate);
}
#else
static void schedtune_attach(struct cgroup_taskset *tset)
{
}
#endif
static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
s64 boost)
@@ -553,6 +727,23 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
}
static struct cftype files[] = {
#ifdef CONFIG_SCHED_WALT
{
.name = "sched_boost_no_override",
.read_u64 = sched_boost_override_read,
.write_u64 = sched_boost_override_write,
},
{
.name = "sched_boost_enabled",
.read_u64 = sched_boost_enabled_read,
.write_u64 = sched_boost_enabled_write,
},
{
.name = "colocate",
.read_u64 = sched_colocate_read,
.write_u64 = sched_colocate_write,
},
#endif
{
.name = "boost",
.read_s64 = boost_read,
@@ -615,6 +806,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
goto out;
/* Initialize per CPUs boost group support */
init_sched_boost(st);
schedtune_boostgroup_init(st, idx);
return &st->css;
@@ -653,6 +845,7 @@ schedtune_css_free(struct cgroup_subsys_state *css)
struct cgroup_subsys schedtune_cgrp_subsys = {
.css_alloc = schedtune_css_alloc,
.css_free = schedtune_css_free,
.attach = schedtune_attach,
.can_attach = schedtune_can_attach,
.cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,

kernel/sched/walt.c (new file, 3420 lines; diff not shown because it is too large)

kernel/sched/walt.h (new file, 387 lines)

@@ -0,0 +1,387 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
*/
#ifndef __WALT_H
#define __WALT_H
#ifdef CONFIG_SCHED_WALT
#include <linux/sched/sysctl.h>
#define WINDOW_STATS_RECENT 0
#define WINDOW_STATS_MAX 1
#define WINDOW_STATS_MAX_RECENT_AVG 2
#define WINDOW_STATS_AVG 3
#define WINDOW_STATS_INVALID_POLICY 4
#define EXITING_TASK_MARKER 0xdeaddead
#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK 0
#define FREQ_REPORT_CPU_LOAD 1
#define FREQ_REPORT_TOP_TASK 2
#define for_each_related_thread_group(grp) \
list_for_each_entry(grp, &active_related_thread_groups, list)
#define SCHED_NEW_TASK_WINDOWS 5
extern unsigned int sched_ravg_window;
extern unsigned int max_possible_efficiency;
extern unsigned int min_possible_efficiency;
extern unsigned int max_possible_freq;
extern unsigned int sched_major_task_runtime;
extern unsigned int __read_mostly sched_init_task_load_windows;
extern unsigned int __read_mostly sched_load_granule;
extern struct mutex cluster_lock;
extern rwlock_t related_thread_group_lock;
extern __read_mostly unsigned int sched_ravg_hist_size;
extern __read_mostly unsigned int sched_freq_aggregate;
extern __read_mostly int sched_freq_aggregate_threshold;
extern __read_mostly unsigned int sched_window_stats_policy;
extern __read_mostly unsigned int sched_group_upmigrate;
extern __read_mostly unsigned int sched_group_downmigrate;
extern struct sched_cluster init_cluster;
extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
u64 wallclock, u64 irqtime);
extern unsigned int walt_big_tasks(int cpu);
static inline void
inc_nr_big_task(struct walt_sched_stats *stats, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
if (p->misfit)
stats->nr_big_tasks++;
}
static inline void
dec_nr_big_task(struct walt_sched_stats *stats, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
if (p->misfit)
stats->nr_big_tasks--;
BUG_ON(stats->nr_big_tasks < 0);
}
static inline void
walt_adjust_nr_big_tasks(struct rq *rq, int delta, bool inc)
{
if (sched_disable_window_stats)
return;
//sched_update_nr_prod(cpu_of(rq), 0, true); SATYA: Cross check
rq->walt_stats.nr_big_tasks += inc ? delta : -delta;
BUG_ON(rq->walt_stats.nr_big_tasks < 0);
}
static inline void
fixup_cumulative_runnable_avg(struct walt_sched_stats *stats,
s64 demand_scaled_delta,
s64 pred_demand_scaled_delta)
{
if (sched_disable_window_stats)
return;
stats->cumulative_runnable_avg_scaled += demand_scaled_delta;
BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0);
stats->pred_demands_sum_scaled += pred_demand_scaled_delta;
BUG_ON((s64)stats->pred_demands_sum_scaled < 0);
}
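For illustration, the signed-delta convention above lets one helper cover enqueue (+demand), in-place demand updates (new minus old) and dequeue (-demand); a standalone sketch with plain integers and invented names follows.
/* Illustrative only; plain integers stand in for the scaled demands. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct stats { int64_t cumulative; int64_t pred_sum; };

static void fixup(struct stats *s, int64_t d_demand, int64_t d_pred)
{
    s->cumulative += d_demand;
    s->pred_sum += d_pred;
    assert(s->cumulative >= 0 && s->pred_sum >= 0);
}

int main(void)
{
    struct stats s = { 0, 0 };

    fixup(&s, +300, +280);              /* enqueue: add the task's demand */
    fixup(&s, 350 - 300, 320 - 280);    /* demand update: new minus old */
    fixup(&s, -350, -320);              /* dequeue: remove the updated demand */
    printf("cumulative=%lld pred=%lld\n",
           (long long)s.cumulative, (long long)s.pred_sum);
    return 0;
}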
static inline void
walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled,
p->ravg.pred_demand_scaled);
/*
* Add a task's contribution to the cumulative window demand when
*
* (1) task is enqueued with on_rq = 1, i.e. migration,
* prio/cgroup/class change.
* (2) task is waking for the first time in this window.
*/
if (p->on_rq || (p->last_sleep_ts < rq->window_start))
walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled);
}
static inline void
walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
fixup_cumulative_runnable_avg(&rq->walt_stats,
-(s64)p->ravg.demand_scaled,
-(s64)p->ravg.pred_demand_scaled);
/*
* on_rq will be 1 for sleeping tasks. So check if the task
* is migrating or dequeuing in RUNNING state to change the
* prio/cgroup/class.
*/
if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled);
}
extern void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled);
extern void inc_rq_walt_stats(struct rq *rq, struct task_struct *p);
extern void dec_rq_walt_stats(struct rq *rq, struct task_struct *p);
extern void fixup_busy_time(struct task_struct *p, int new_cpu);
extern void init_new_task_load(struct task_struct *p);
extern void mark_task_starting(struct task_struct *p);
extern void set_window_start(struct rq *rq);
void account_irqtime(int cpu, struct task_struct *curr, u64 delta,
u64 wallclock);
extern bool do_pl_notif(struct rq *rq);
#define SCHED_HIGH_IRQ_TIMEOUT 3
static inline u64 sched_irqload(int cpu)
{
struct rq *rq = cpu_rq(cpu);
s64 delta;
delta = get_jiffies_64() - rq->irqload_ts;
/*
* Current context can be preempted by irq and rq->irqload_ts can be
* updated by irq context so that delta can be negative.
* But this is okay and we can safely return as this means there
* was a recent irq occurrence.
*/
if (delta < SCHED_HIGH_IRQ_TIMEOUT)
return rq->avg_irqload;
else
return 0;
}
static inline int sched_cpu_high_irqload(int cpu)
{
return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
}
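For illustration, the staleness check above can be sketched standalone; the jiffy values and the sysctl_sched_cpu_high_irqload threshold below are assumed, not taken from this patch.
/* Illustrative only; the threshold and timestamps below are assumed values. */
#include <stdint.h>
#include <stdio.h>

#define SCHED_HIGH_IRQ_TIMEOUT  3   /* jiffies, as above */

static uint64_t irqload(uint64_t now_jiffies, uint64_t irqload_ts,
                        uint64_t avg_irqload)
{
    int64_t delta = (int64_t)(now_jiffies - irqload_ts);

    /* a negative delta means an irq just updated the stamp: still recent */
    return delta < SCHED_HIGH_IRQ_TIMEOUT ? avg_irqload : 0;
}

int main(void)
{
    uint64_t threshold = 10000000;  /* assumed sysctl_sched_cpu_high_irqload, ns */

    printf("recent: high=%d\n", irqload(1002, 1000, 12000000) >= threshold);
    printf("stale:  high=%d\n", irqload(1010, 1000, 12000000) >= threshold);
    return 0;
}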
static inline int exiting_task(struct task_struct *p)
{
return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
}
static inline struct sched_cluster *cpu_cluster(int cpu)
{
return cpu_rq(cpu)->cluster;
}
static inline u64
scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
{
return div64_u64(load * (u64)src_freq, (u64)dst_freq);
}
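A quick worked example of the frequency rescaling above, with invented frequencies.
/* Worked example with invented frequencies; mirrors the inline above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_load_to_freq(uint64_t load, uint64_t src, uint64_t dst)
{
    return load * src / dst;
}

int main(void)
{
    /* 5 ms of runtime observed at 600 MHz, expressed at 1.2 GHz */
    uint64_t scaled = scale_load_to_freq(5000000ULL, 600000, 1200000);

    printf("%" PRIu64 " ns\n", scaled); /* 2.5 ms: half the time at twice the freq */
    return 0;
}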
static inline bool is_new_task(struct task_struct *p)
{
return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
}
static inline void clear_top_tasks_table(u8 *table)
{
memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
}
extern void update_cluster_load_subtractions(struct task_struct *p,
int cpu, u64 ws, bool new_task);
extern void sched_account_irqstart(int cpu, struct task_struct *curr,
u64 wallclock);
static inline unsigned int max_task_load(void)
{
return sched_ravg_window;
}
static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period)
{
return div64_u64(cycles, period);
}
static inline unsigned int cpu_cur_freq(int cpu)
{
return cpu_rq(cpu)->cluster->cur_freq;
}
static inline unsigned int sched_cpu_legacy_freq(int cpu)
{
unsigned long curr_cap = arch_scale_freq_capacity(cpu);
return (curr_cap * (u64) cpu_rq(cpu)->cluster->max_possible_freq) >>
SCHED_CAPACITY_SHIFT;
}
static inline void
move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
{
struct list_head *first, *last;
first = src->next;
last = src->prev;
if (sync_rcu) {
INIT_LIST_HEAD_RCU(src);
synchronize_rcu();
}
first->prev = dst;
dst->prev = last;
last->next = dst;
/* Ensure list sanity before making the head visible to all CPUs. */
smp_mb();
dst->next = first;
}
extern void reset_task_stats(struct task_struct *p);
extern void update_cluster_topology(void);
extern struct list_head cluster_head;
#define for_each_sched_cluster(cluster) \
list_for_each_entry_rcu(cluster, &cluster_head, list)
extern void init_clusters(void);
extern void clear_top_tasks_bitmap(unsigned long *bitmap);
extern void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock);
static inline void assign_cluster_ids(struct list_head *head)
{
struct sched_cluster *cluster;
int pos = 0;
list_for_each_entry(cluster, head, list) {
cluster->id = pos;
sched_cluster[pos++] = cluster;
}
}
static inline int same_cluster(int src_cpu, int dst_cpu)
{
return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
}
void sort_clusters(void);
void walt_irq_work(struct irq_work *irq_work);
void walt_sched_init_rq(struct rq *rq);
static inline void walt_update_last_enqueue(struct task_struct *p)
{
p->last_enqueued_ts = sched_ktime_clock();
}
extern void walt_rotate_work_init(void);
extern void walt_rotation_checkpoint(int nr_big);
extern unsigned int walt_rotation_enabled;
extern unsigned int walt_get_default_coloc_group_load(void);
#else /* CONFIG_SCHED_WALT */
static inline void walt_sched_init_rq(struct rq *rq) { }
static inline void walt_rotate_work_init(void) { }
static inline void walt_rotation_checkpoint(int nr_big) { }
static inline void walt_update_last_enqueue(struct task_struct *p) { }
static inline unsigned int walt_get_default_coloc_group_load(void)
{
return 0;
}
static inline void update_task_ravg(struct task_struct *p, struct rq *rq,
int event, u64 wallclock, u64 irqtime) { }
static inline void walt_inc_cumulative_runnable_avg(struct rq *rq,
struct task_struct *p)
{
}
static inline unsigned int walt_big_tasks(int cpu)
{
return 0;
}
static inline void walt_adjust_nr_big_tasks(struct rq *rq,
int delta, bool inc)
{
}
static inline void inc_nr_big_task(struct walt_sched_stats *stats,
struct task_struct *p)
{
}
static inline void dec_nr_big_task(struct walt_sched_stats *stats,
struct task_struct *p)
{
}
static inline void walt_dec_cumulative_runnable_avg(struct rq *rq,
struct task_struct *p)
{
}
static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
static inline void init_new_task_load(struct task_struct *p)
{
}
static inline void mark_task_starting(struct task_struct *p) { }
static inline void set_window_start(struct rq *rq) { }
static inline int sched_cpu_high_irqload(int cpu) { return 0; }
static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
u64 wallclock)
{
}
static inline void update_cluster_topology(void) { }
static inline void init_clusters(void) {}
static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock)
{
}
static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
static inline bool do_pl_notif(struct rq *rq) { return false; }
static inline void
inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { }
static inline void
dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { }
static inline void
fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
u16 updated_demand_scaled,
u16 updated_pred_demand_scaled)
{
}
static inline u64 sched_irqload(int cpu)
{
return 0;
}
#endif /* CONFIG_SCHED_WALT */
#endif


@@ -121,14 +121,19 @@ static int sixty = 60;
#endif
static int __maybe_unused neg_one = -1;
static int __maybe_unused neg_three = -3;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_SCHED_WALT
static int two_million = 2000000;
#endif
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -320,6 +325,77 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHED_WALT
{
.procname = "sched_cpu_high_irqload",
.data = &sysctl_sched_cpu_high_irqload,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_group_upmigrate",
.data = &sysctl_sched_group_upmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_proc_update_handler,
.extra1 = &sysctl_sched_group_downmigrate_pct,
},
{
.procname = "sched_group_downmigrate",
.data = &sysctl_sched_group_downmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_proc_update_handler,
.extra1 = &zero,
.extra2 = &sysctl_sched_group_upmigrate_pct,
},
{
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_boost_handler,
.extra1 = &neg_three,
.extra2 = &three,
},
{
.procname = "sched_walt_rotate_big_tasks",
.data = &sysctl_sched_walt_rotate_big_tasks,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
{
.procname = "sched_min_task_util_for_boost",
.data = &sysctl_sched_min_task_util_for_boost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one_thousand,
},
{
.procname = "sched_min_task_util_for_colocation",
.data = &sysctl_sched_min_task_util_for_colocation,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one_thousand,
},
{
.procname = "sched_little_cluster_coloc_fmin_khz",
.data = &sysctl_sched_little_cluster_coloc_fmin_khz,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_little_cluster_coloc_fmin_khz_handler,
.extra1 = &zero,
.extra2 = &two_million,
},
#endif
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_cstate_aware",