sched: Add support to spread tasks

If sysctl_sched_prefer_spread is enabled, tasks are freely migrated to
idle CPUs within the same cluster to reduce the number of runnable
tasks queued per CPU.

By default, the feature is disabled.
It can be enabled with:
   echo 1 > /proc/sys/kernel/sched_prefer_spread
	Aggressively spread tasks within the little cluster.
   echo 2 > /proc/sys/kernel/sched_prefer_spread
	Aggressively spread tasks within the little cluster as well as
	within the big cluster, but never between the big and little clusters.

Change-Id: I0a4d87bd17de3525548765472e6f388a9970f13c
Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
Author: Lingutla Chandrasekhar <clingutla@codeaurora.org>
Date:   2020-01-08 10:30:04 +05:30
Parent: 0cf4f5c7da
Commit: a80cf2007d

4 changed files with 93 additions and 24 deletions


@@ -56,6 +56,7 @@ extern unsigned int sysctl_sched_coloc_busy_hyst_max_ms;
 extern unsigned int sysctl_sched_window_stats_policy;
 extern unsigned int sysctl_sched_ravg_window_nr_ticks;
 extern unsigned int sysctl_sched_dynamic_ravg_window_enable;
+extern unsigned int sysctl_sched_prefer_spread;
 extern int
 walt_proc_group_thresholds_handler(struct ctl_table *table, int write,


@@ -278,11 +278,11 @@ TRACE_EVENT(sched_load_balance,
 		unsigned long group_mask, int busiest_nr_running,
 		unsigned long imbalance, unsigned int env_flags, int ld_moved,
 		unsigned int balance_interval, int active_balance,
-		int overutilized),
+		int overutilized, int prefer_spread),
 
 	TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running,
 		imbalance, env_flags, ld_moved, balance_interval,
-		active_balance, overutilized),
+		active_balance, overutilized, prefer_spread),
 
 	TP_STRUCT__entry(
 		__field(int, cpu)
@@ -296,6 +296,7 @@ TRACE_EVENT(sched_load_balance,
 		__field(unsigned int, balance_interval)
 		__field(int, active_balance)
 		__field(int, overutilized)
+		__field(int, prefer_spread)
 	),
TP_fast_assign(
@@ -310,9 +311,10 @@ TRACE_EVENT(sched_load_balance,
 		__entry->balance_interval = balance_interval;
 		__entry->active_balance = active_balance;
 		__entry->overutilized = overutilized;
+		__entry->prefer_spread = prefer_spread;
 	),
 
-	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d",
+	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d prefer_spread=%d",
 		__entry->cpu,
 		__entry->idle == CPU_IDLE ? "idle" :
 		(__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"),
@@ -320,7 +322,7 @@ TRACE_EVENT(sched_load_balance,
 		__entry->group_mask, __entry->busiest_nr_running,
 		__entry->imbalance, __entry->env_flags, __entry->ld_moved,
 		__entry->balance_interval, __entry->active_balance,
-		__entry->overutilized)
+		__entry->overutilized, __entry->prefer_spread)
 );
 
 TRACE_EVENT(sched_load_balance_nohz_kick,
@@ -1213,7 +1215,7 @@ TRACE_EVENT(sched_compute_energy,
 TRACE_EVENT(sched_task_util,
 
 	TP_PROTO(struct task_struct *p, unsigned long candidates,
-		int best_energy_cpu, bool sync, bool need_idle, int fastpath,
+		int best_energy_cpu, bool sync, int need_idle, int fastpath,
 		bool placement_boost, u64 start_t,
 		bool stune_boosted, bool is_rtg, bool rtg_skip_min,
 		int start_cpu),
@@ -1230,7 +1232,7 @@ TRACE_EVENT(sched_task_util,
 		__field(int, prev_cpu)
 		__field(int, best_energy_cpu)
 		__field(bool, sync)
-		__field(bool, need_idle)
+		__field(int, need_idle)
 		__field(int, fastpath)
 		__field(int, placement_boost)
 		__field(int, rtg_cpu)


@@ -176,6 +176,7 @@ unsigned int sysctl_sched_min_task_util_for_boost = 51;
 /* 0.68ms default for 20ms window size scaled to 1024 */
 unsigned int sysctl_sched_min_task_util_for_colocation = 35;
 unsigned int sched_task_filter_util = 35;
+__read_mostly unsigned int sysctl_sched_prefer_spread;
 #endif
 
 unsigned int sched_small_task_threshold = 102;
@@ -3928,16 +3929,27 @@ static inline bool task_demand_fits(struct task_struct *p, int cpu)
 }
 
 struct find_best_target_env {
-	bool is_rtg;
 	int placement_boost;
-	bool need_idle;
-	bool boosted;
+	int need_idle;
 	int fastpath;
 	int start_cpu;
-	bool strict_max;
 	int skip_cpu;
+	bool is_rtg;
+	bool boosted;
+	bool strict_max;
 };
 
+static inline bool prefer_spread_on_idle(int cpu)
+{
+	if (likely(!sysctl_sched_prefer_spread))
+		return false;
+
+	if (is_min_capacity_cpu(cpu))
+		return sysctl_sched_prefer_spread >= 1;
+
+	return sysctl_sched_prefer_spread > 1;
+}
+
 static inline void adjust_cpus_for_packing(struct task_struct *p,
 			int *target_cpu, int *best_idle_cpu,
 			int shallowest_idle_cstate,
@@ -3949,8 +3961,11 @@ static inline void adjust_cpus_for_packing(struct task_struct *p,
 	if (*best_idle_cpu == -1 || *target_cpu == -1)
 		return;
 
-	if (task_placement_boost_enabled(p) || fbt_env->need_idle || boosted ||
-		shallowest_idle_cstate <= 0) {
+	if (prefer_spread_on_idle(*best_idle_cpu))
+		fbt_env->need_idle |= 2;
+
+	if (fbt_env->need_idle || task_placement_boost_enabled(p) || boosted ||
+		shallowest_idle_cstate <= 0) {
 		*target_cpu = -1;
 		return;
 	}
@@ -7582,6 +7597,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
 	curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr);
 
 	fbt_env.fastpath = 0;
+	fbt_env.need_idle = need_idle;
 
 	if (trace_sched_task_util_enabled())
 		start_t = sched_clock();
@@ -7629,7 +7645,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
 	if (sched_feat(FIND_BEST_TARGET)) {
 		fbt_env.is_rtg = is_rtg;
 		fbt_env.placement_boost = placement_boost;
-		fbt_env.need_idle = need_idle;
 		fbt_env.start_cpu = start_cpu;
 		fbt_env.boosted = boosted;
 		fbt_env.strict_max = is_rtg &&
@@ -7659,7 +7674,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
 	if (p->state == TASK_WAKING)
 		delta = task_util(p);
 #endif
-	if (task_placement_boost_enabled(p) || need_idle || boosted ||
+	if (task_placement_boost_enabled(p) || fbt_env.need_idle || boosted ||
 		is_rtg || __cpu_overutilized(prev_cpu, delta) ||
 		!task_fits_max(p, prev_cpu) || cpu_isolated(prev_cpu)) {
 		best_energy_cpu = cpu;
@@ -7703,8 +7718,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
 done:
 	trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu,
-			sync, need_idle, fbt_env.fastpath, placement_boost,
-			start_t, boosted, is_rtg, get_rtg_status(p), start_cpu);
+			sync, fbt_env.need_idle, fbt_env.fastpath,
+			placement_boost, start_t, boosted, is_rtg,
+			get_rtg_status(p), start_cpu);
 
 	return best_energy_cpu;
@@ -8410,6 +8426,7 @@ struct lb_env {
 	unsigned int loop;
 	unsigned int loop_break;
 	unsigned int loop_max;
+	bool prefer_spread;
 
 	enum fbq_type fbq_type;
 	enum group_type src_grp_type;
@@ -8582,8 +8599,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		struct root_domain *rd = env->dst_rq->rd;
 
 		if ((rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) &&
-			env->idle == CPU_NEWLY_IDLE &&
-			!task_in_related_thread_group(p)) {
+			env->idle == CPU_NEWLY_IDLE && !env->prefer_spread &&
+			!task_in_related_thread_group(p)) {
 			long util_cum_dst, util_cum_src;
 			unsigned long demand;
@@ -8754,8 +8771,13 @@ static int detach_tasks(struct lb_env *env)
 		 * So only when there is other tasks can be balanced or
 		 * there is situation to ignore big task, it is needed
 		 * to skip the task load bigger than 2*imbalance.
+		 *
+		 * And load based checks are skipped for prefer_spread in
+		 * finding busiest group, ignore the task's h_load.
 		 */
-		if (((cpu_rq(env->src_cpu)->nr_running > 2) ||
+		if (!env->prefer_spread &&
+			((cpu_rq(env->src_cpu)->nr_running > 2) ||
 			(env->flags & LBF_IGNORE_BIG_TASKS)) &&
 			((load / 2) > env->imbalance))
 			goto next;
@@ -9549,6 +9571,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
+	if (env->prefer_spread && env->idle != CPU_NOT_IDLE &&
+		(sgs->sum_nr_running > busiest->sum_nr_running) &&
+		(sgs->group_util > busiest->group_util))
+		return true;
+
 	if (sgs->avg_load <= busiest->avg_load)
 		return false;
@@ -9582,6 +9609,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		return false;
 
 asym_packing:
+	if (env->prefer_spread &&
+		(sgs->sum_nr_running < busiest->sum_nr_running))
+		return false;
+
 	/* This is the busiest node in its class. */
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
@@ -10052,6 +10084,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		return fix_small_imbalance(env, sds);
 	}
+
+	/*
+	 * If we couldn't find any imbalance, then boost the imbalance
+	 * with the group util.
+	 */
+	if (env->prefer_spread && !env->imbalance &&
+		env->idle != CPU_NOT_IDLE &&
+		busiest->sum_nr_running > busiest->group_weight)
+		env->imbalance = busiest->group_util;
 }
 
 /******* find_busiest_group() helpers end here *********************/
@@ -10439,6 +10480,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop = 0,
 	};
 
+	env.prefer_spread = (prefer_spread_on_idle(this_cpu) &&
+				!((sd->flags & SD_ASYM_CPUCAPACITY) &&
+				!cpumask_test_cpu(this_cpu,
+						&asym_cap_sibling_cpus)));
+
 	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
 	schedstat_inc(sd->lb_count[idle]);
@@ -10728,7 +10774,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 				busiest ? busiest->nr_running : 0,
 				env.imbalance, env.flags, ld_moved,
 				sd->balance_interval, active_balance,
-				sd_overutilized(sd));
+				sd_overutilized(sd), env.prefer_spread);
 
 	return ld_moved;
 }
@@ -10961,7 +11007,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 		}
 		max_cost += sd->max_newidle_lb_cost;
 
-		if (!sd_overutilized(sd))
+		if (!sd_overutilized(sd) && !prefer_spread_on_idle(cpu))
 			continue;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -11214,7 +11260,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	 * happens from the tickpath.
 	 */
 	if (static_branch_likely(&sched_energy_present)) {
-		if (rq->nr_running >= 2 && cpu_overutilized(cpu))
+		if (rq->nr_running >= 2 && (cpu_overutilized(cpu) ||
+				prefer_spread_on_idle(cpu)))
 			flags = NOHZ_KICK_MASK;
 		goto out;
 	}
@@ -11584,6 +11631,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 	u64 avg_idle = this_rq->avg_idle;
+	bool prefer_spread = prefer_spread_on_idle(this_cpu);
+	bool force_lb = (!is_min_capacity_cpu(this_cpu) &&
+				silver_has_big_tasks() &&
+				(atomic_read(&this_rq->nr_iowait) == 0));
 
 	if (cpu_isolated(this_cpu))
 		return 0;
@@ -11599,8 +11651,8 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	 */
 	if (!cpu_active(this_cpu))
 		return 0;
 
-	if (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks()
-		&& (atomic_read(&this_rq->nr_iowait) == 0))
+	if (force_lb || prefer_spread)
 		avg_idle = ULLONG_MAX;
 
 	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
@@ -11635,6 +11687,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
+		if (prefer_spread && !force_lb &&
+			(sd->flags & SD_ASYM_CPUCAPACITY) &&
+			!(cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus)))
+			avg_idle = this_rq->avg_idle;
+
 		if (avg_idle < curr_cost + sd->max_newidle_lb_cost) {
 			update_next_balance(sd, &next_balance);
 			break;


@@ -551,6 +551,15 @@ static struct ctl_table kern_table[] = {
 		.mode = 0644,
 		.proc_handler = sched_updown_migrate_handler,
 	},
+	{
+		.procname = "sched_prefer_spread",
+		.data = &sysctl_sched_prefer_spread,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &zero,
+		.extra2 = &two,
+	},
 #endif
 #ifdef CONFIG_SCHED_DEBUG
 	{
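
The new ctl_table entry uses proc_dointvec_minmax with extra1/extra2 bounds of
zero and two, so writes outside the range 0..2 are rejected with EINVAL. A
quick userspace sanity check might look like the sketch below; this is
illustrative only and simply exercises the procfs path named in the commit
message:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a value to the new sysctl; returns 0 on success, -errno on failure. */
static int set_prefer_spread(int val)
{
	char buf[16];
	int fd = open("/proc/sys/kernel/sched_prefer_spread", O_WRONLY);

	if (fd < 0)
		return -errno;
	snprintf(buf, sizeof(buf), "%d\n", val);
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -errno;
	}
	close(fd);
	return 0;
}

int main(void)
{
	printf("set 2 -> %d\n", set_prefer_spread(2)); /* expected: 0 */
	printf("set 3 -> %d\n", set_prefer_spread(3)); /* expected: -EINVAL */
	return 0;
}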