From 9ade83ddc8a9ec85c4ef3c8c6099312c63b8064a Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Thu, 27 Jan 2022 10:29:33 +0800 Subject: [PATCH 001/113] Signed-off-by: zhizhimeimei6 Change-Id: I40131c671b6a977c3a7644d335dc2d08ae1c8428 --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/hungtask/Kconfig | 13 + drivers/staging/hungtask/Makefile | 2 + drivers/staging/hungtask/hungtask_base.c | 1041 ++++++++++++++++++++++ drivers/staging/hungtask/hungtask_user.c | 270 ++++++ drivers/staging/hungtask/hungtask_user.h | 47 + include/dfx/hungtask_base.h | 121 +++ kernel/hung_task.c | 35 +- 9 files changed, 1528 insertions(+), 4 deletions(-) create mode 100644 drivers/staging/hungtask/Kconfig create mode 100644 drivers/staging/hungtask/Makefile create mode 100644 drivers/staging/hungtask/hungtask_base.c create mode 100644 drivers/staging/hungtask/hungtask_user.c create mode 100644 drivers/staging/hungtask/hungtask_user.h create mode 100644 include/dfx/hungtask_base.h diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 4dbdc87e809c..e7cd80bb8761 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -122,4 +122,6 @@ source "drivers/staging/hilog/Kconfig" source "drivers/staging/hievent/Kconfig" +source "drivers/staging/hungtask/Kconfig" + endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index b33dfbdcce7e..dfa144064b94 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -51,3 +51,4 @@ obj-$(CONFIG_WFX) += wfx/ obj-y += hikey9xx/ obj-$(CONFIG_HILOG) += hilog/ obj-$(CONFIG_HIEVENT) += hievent/ +obj-$(CONFIG_DFX_HUNGTASK) += hungtask/ diff --git a/drivers/staging/hungtask/Kconfig b/drivers/staging/hungtask/Kconfig new file mode 100644 index 000000000000..c7b43fa6eb62 --- /dev/null +++ b/drivers/staging/hungtask/Kconfig @@ -0,0 +1,13 @@ +config DFX_HUNGTASK + bool "DFX hungtask" + depends on DETECT_HUNG_TASK + default n + help + Base DFX hungtask module + +config DFX_HUNGTASK_USER + bool "DFX hungtask user watchdog module" + depends on DFX_HUNGTASK + default n + help + DFX hungtask user watchdog module \ No newline at end of file diff --git a/drivers/staging/hungtask/Makefile b/drivers/staging/hungtask/Makefile new file mode 100644 index 000000000000..24951f2cf42c --- /dev/null +++ b/drivers/staging/hungtask/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_DFX_HUNGTASK) += hungtask_base.o +obj-$(CONFIG_DFX_HUNGTASK_USER) += hungtask_user.o diff --git a/drivers/staging/hungtask/hungtask_base.c b/drivers/staging/hungtask/hungtask_base.c new file mode 100644 index 000000000000..740a5d1e2578 --- /dev/null +++ b/drivers/staging/hungtask/hungtask_base.c @@ -0,0 +1,1041 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) "hungtask_base " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_DFX_ZEROHUNG +#include +#endif +#include +#include "hungtask_user.h" + +static struct rb_root list_tasks = RB_ROOT; +static DEFINE_SPINLOCK(list_tasks_lock); +static struct hlist_head whitelist[WHITELIST_LEN]; +static struct whitelist_item whitetmplist[WHITELIST_LEN]; +static bool whitelist_empty = true; +static int remove_cnt; +static struct task_item *remove_list[MAX_REMOVE_LIST_NUM + 1]; +static unsigned long __read_mostly hungtask_timeout_secs = + CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +static int did_panic; +static unsigned int hungtask_enable = HT_DISABLE; +static unsigned int whitelist_type = WHITE_LIST; +static int whitelist_dump_cnt = DEFAULT_WHITE_DUMP_CNT; +static int whitelist_panic_cnt = DEFAULT_WHITE_PANIC_CNT; +static int appspawn_pid; +static int dump_and_upload; +static int time_since_upload; +static int hung_task_must_panic; +static int report_zrhung_id; +static struct task_hung_upload upload; +static int do_refresh; +static char frozen_buf[FROZEN_BUF_LEN]; +static int frozen_used; +static bool frozed_head; +static unsigned long cur_heartbeat; +static struct work_struct send_work; +static char report_buf_text[REPORT_MSGLENGTH]; + +bool hashlist_find(struct hlist_head *head, int count, pid_t tgid) +{ + struct hashlist_node *hnode = NULL; + + if (count <= 0) + return false; + if (hlist_empty(&head[tgid % count])) + return false; + hlist_for_each_entry(hnode, &head[tgid % count], list) { + if (hnode->pid == tgid) + return true; + } + return false; +} + +void hashlist_clear(struct hlist_head *head, int count) +{ + int i = 0; + struct hlist_node *n = NULL; + struct hashlist_node *hnode = NULL; + + for (i = 0; i < count; i++) { + hlist_for_each_entry_safe(hnode, n, &head[i], list) { + hlist_del(&hnode->list); + kfree(hnode); + hnode = NULL; + } + } + for (i = 0; i < count; i++) + INIT_HLIST_HEAD(&head[i]); +} + +bool hashlist_insert(struct hlist_head *head, int count, pid_t tgid) +{ + struct hashlist_node *hnode = NULL; + + if (hashlist_find(head, count, tgid)) + return false; + hnode = kmalloc(sizeof(struct hashlist_node), GFP_ATOMIC); + if (!hnode) + return false; + INIT_HLIST_NODE(&hnode->list); + hnode->pid = tgid; + hlist_add_head(&hnode->list, &head[tgid % count]); + return true; +} + +static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + bool can_cont = false; + + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + can_cont = pid_alive(g) && pid_alive(t); + put_task_struct(t); + put_task_struct(g); + return can_cont; +} + +static bool rcu_break(int *max_count, int *batch_count, + struct task_struct *g, + struct task_struct *t) +{ + if (!(*max_count)--) + return true; + if (!--(*batch_count)) { + *batch_count = HUNG_TASK_BATCHING; + if (!rcu_lock_break(g, t)) + return true; + } + return false; +} + +static pid_t get_pid_by_name(const char *name) +{ + int max_count = PID_MAX_LIMIT; + int batch_count = HUNG_TASK_BATCHING; + struct task_struct *g = NULL; + struct task_struct *t = NULL; + int pid = 0; + + rcu_read_lock(); + do_each_thread(g, t) { + if (rcu_break(&max_count, &batch_count, g, t)) + goto unlock; + if (!strncmp(t->comm, name, TASK_COMM_LEN)) { + pid = t->tgid; + goto unlock; + } + } while_each_thread(g, t); + +unlock: + rcu_read_unlock(); + return pid; +} + +static unsigned int get_task_type(pid_t pid, pid_t tgid, struct 
task_struct *parent) +{ + unsigned int flag = TASK_TYPE_IGNORE; + /* check tgid of it's parent as PPID */ + if (parent) { + pid_t ppid = parent->tgid; + + if (ppid == PID_KTHREAD) + flag |= TASK_TYPE_KERNEL; + else if (ppid == appspawn_pid) + flag |= TASK_TYPE_APP; + else if (ppid == PID_INIT) + flag |= TASK_TYPE_NATIVE; + } + if (!whitelist_empty && hashlist_find(whitelist, WHITELIST_LEN, tgid)) + flag |= TASK_TYPE_WHITE | TASK_TYPE_JANK; + + return flag; +} + +static void refresh_appspawn_pids(void) +{ + int max_count = PID_MAX_LIMIT; + int batch_count = HUNG_TASK_BATCHING; + struct task_struct *g = NULL; + struct task_struct *t = NULL; + + rcu_read_lock(); + do_each_thread(g, t) { + if (rcu_break(&max_count, &batch_count, g, t)) + goto unlock; + if (!strncmp(t->comm, "appspawn", TASK_COMM_LEN)) + appspawn_pid = t->tgid; + } while_each_thread(g, t); +unlock: + rcu_read_unlock(); +} + +static void refresh_task_type(pid_t pid, int task_type) +{ + struct task_item *item = NULL; + struct rb_node *p = NULL; + + spin_lock(&list_tasks_lock); + for (p = rb_first(&list_tasks); p; p = rb_next(p)) { + item = rb_entry(p, struct task_item, node); + if (item->tgid == pid) + item->task_type = task_type; + } + spin_unlock(&list_tasks_lock); +} + +static void refresh_whitelist_pids(void) +{ + int i; + + hashlist_clear(whitelist, WHITELIST_LEN); + for (i = 0; i < WHITELIST_LEN; i++) { + if (!strlen(whitetmplist[i].name)) + continue; + whitetmplist[i].pid = + get_pid_by_name(whitetmplist[i].name); + if (!whitetmplist[i].pid) + continue; + refresh_task_type(whitetmplist[i].pid, + TASK_TYPE_WHITE | TASK_TYPE_JANK); + if (hashlist_insert(whitelist, WHITELIST_LEN, + whitetmplist[i].pid)) + pr_info("whitelist[%d]-%s-%d\n", i, + whitetmplist[i].name, whitetmplist[i].pid); + else + pr_info("can't find %s\n", whitetmplist[i].name); + } + refresh_appspawn_pids(); +} + +static struct task_item *find_task(pid_t pid, struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct task_item *cur = NULL; + struct rb_node *parent = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct task_item, node); + if (!cur) + return NULL; + if (pid < cur->pid) + p = &(*p)->rb_left; + else if (pid > cur->pid) + p = &(*p)->rb_right; + else + return cur; + } + return NULL; +} + +static bool insert_task(struct task_item *item, struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct task_item *cur = NULL; + + while (*p) { + parent = *p; + + cur = rb_entry(parent, struct task_item, node); + if (!cur) + return false; + if (item->pid < cur->pid) { + p = &(*p)->rb_left; + } else if (item->pid > cur->pid) { + p = &(*p)->rb_right; + } else { + pr_info("insert pid=%d,tgid=%d,name=%s,type=%d fail\n", + item->pid, item->tgid, + item->name, item->task_type); + return false; + } + } + rb_link_node(&item->node, parent, p); + rb_insert_color(&item->node, root); + return true; +} + +void show_block_task(struct task_item *taskitem, struct task_struct *p) +{ + unsigned long last_arrival; + unsigned long last_queued; + +#ifdef CONFIG_SCHED_INFO + last_arrival = p->sched_info.last_arrival; + last_queued = p->sched_info.last_queued; +#else + last_arrival = 0; + last_queued = 0; +#endif /* CONFIG_SCHED_INFO */ + if (unlikely(p->flags & PF_FROZEN)) { + if (taskitem) + pr_err("name=%s,PID=%d,tgid=%d,tgname=%s," + "FROZEN for %ds,type=%d,la%lu/lq%lu\n", + p->comm, p->pid, p->tgid, + p->group_leader->comm, + taskitem->d_state_time * HEARTBEAT_TIME, + taskitem->task_type, + 
last_arrival, last_queued); + else + pr_err("name=%s,PID=%d,tgid=%d,tgname=%s," + "just FROZE,la%lu/lq%lu\n", + p->comm, p->pid, p->tgid, + p->group_leader->comm, + last_arrival, last_queued); + } else { + if (taskitem) + pr_err("name=%s,PID=%d,tgid=%d,prio=%d,cpu=%d,tgname=%s," + "type=%d,blocked for %ds,la%lu/lq%lu\n", + taskitem->name, taskitem->pid, p->tgid, p->prio, + task_cpu(p), p->group_leader->comm, taskitem->task_type, + taskitem->d_state_time * HEARTBEAT_TIME, + last_arrival, last_queued); + else + pr_err("name=%s,PID=%d,tgid=%d,prio=%d,cpu=%d," + "tgname=%s,la%lu/lq%lu\n", + p->comm, p->pid, p->tgid, p->prio, task_cpu(p), + p->group_leader->comm, + last_arrival, last_queued); + + sched_show_task(p); + } +} + +void htbase_show_state_filter(unsigned long state_filter) +{ + struct task_struct *g = NULL; + struct task_struct *p = NULL; + struct task_item *taskitem = NULL; + +#if BITS_PER_LONG == 32 + pr_info(" task PC stack pid father\n"); +#else + pr_info(" task PC stack pid father\n"); +#endif + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + if ((p->state == TASK_RUNNING) || (p->state & state_filter)) { + spin_lock(&list_tasks_lock); + taskitem = find_task(p->pid, &list_tasks); + spin_unlock(&list_tasks_lock); + show_block_task(taskitem, p); + } + } + touch_all_softlockup_watchdogs(); + rcu_read_unlock(); + /* Show locks if hungtask happen */ + if ((state_filter == TASK_UNINTERRUPTIBLE) || !state_filter) + debug_show_all_locks(); +} + +void hungtask_show_state_filter(unsigned long state_filter) +{ + pr_err("BinderChain_SysRq start\n"); + htbase_show_state_filter(state_filter); + pr_err("BinderChain_SysRq end\n"); +} + +void do_dump_task(struct task_struct *task) +{ + sched_show_task(task); + debug_show_held_locks(task); +} + +void do_show_task(struct task_struct *task, unsigned int flag, int d_state_time) +{ + pr_err("%s, flag=%d\n", __func__, flag); + rcu_read_lock(); + if (!pid_alive(task)) { + rcu_read_unlock(); + return; + } + if (flag & (FLAG_DUMP_WHITE | FLAG_DUMP_APP)) { + int cnt = 0; + + trace_sched_process_hang(task); + cnt = d_state_time; + pr_err("INFO: task %s:%d tgid:%d blocked for %ds in %s\n", + task->comm, task->pid, task->tgid, + (HEARTBEAT_TIME * cnt), + (flag & FLAG_DUMP_WHITE) ? 
"whitelist" : "applist"); + pr_err(" %s %s %.*s\n", + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + do_dump_task(task); + touch_nmi_watchdog(); + if (flag & FLAG_DUMP_WHITE && (!dump_and_upload)) { + dump_and_upload++; + upload.pid = task->pid; + upload.tgid = task->tgid; + upload.duration = d_state_time; + memset(upload.name, 0, sizeof(upload.name)); + strncpy(upload.name, task->comm, sizeof(task->comm)); + upload.flag = flag; + if (task->flags & PF_FROZEN) + upload.flag = (upload.flag | FLAG_PF_FROZEN); + } + } + rcu_read_unlock(); +} + +static void do_panic(void) +{ + if (sysctl_hung_task_panic) { + trigger_all_cpu_backtrace(); + panic("hungtask: blocked tasks"); + } +} + +static void create_taskitem(struct task_item *taskitem, + struct task_struct *task) +{ + taskitem->pid = task->pid; + taskitem->tgid = task->tgid; + memset(taskitem->name, 0, sizeof(taskitem->name)); + strncpy(taskitem->name, task->comm, sizeof(task->comm)); + taskitem->switch_count = task->nvcsw + task->nivcsw; + taskitem->dump_wa = 0; /* whitelist or applist task dump times */ + taskitem->panic_wa = 0; /* whitelist or applist task panic times */ + taskitem->d_state_time = -1; + taskitem->isdone_wa = true; /* if task in white or app dealed */ +} + +static bool refresh_task(struct task_item *taskitem, struct task_struct *task) +{ + bool is_called = false; + + if (taskitem->switch_count != (task->nvcsw + task->nivcsw)) { + taskitem->switch_count = task->nvcsw + task->nivcsw; + is_called = true; + return is_called; + } + if (taskitem->task_type & TASK_TYPE_WHITE) { + taskitem->isdone_wa = false; + taskitem->dump_wa++; + taskitem->panic_wa++; + } + taskitem->d_state_time++; + if (task->flags & PF_FROZEN) + taskitem->task_type |= TASK_TYPE_FROZEN; + return is_called; +} + +static void remove_list_tasks(struct task_item *item) +{ + rb_erase(&item->node, &list_tasks); + kfree(item); +} + +static void shrink_process_item(struct task_item *item, bool *is_finish) +{ + if (remove_cnt >= MAX_REMOVE_LIST_NUM) { + int i; + + remove_list[remove_cnt++] = item; + for (i = 0; i < remove_cnt; i++) + remove_list_tasks(remove_list[i]); + remove_cnt = 0; + *is_finish = false; + } else { + remove_list[remove_cnt++] = item; + } +} + +static void shrink_list_tasks(void) +{ + int i; + bool is_finish = false; + struct rb_node *n = NULL; + struct task_item *item = NULL; + + spin_lock(&list_tasks_lock); + while (!is_finish) { + is_finish = true; + for (n = rb_first(&list_tasks); n != NULL; n = rb_next(n)) { + item = rb_entry(n, struct task_item, node); + if (!item) + continue; + if (item->isdone_wa) { + shrink_process_item(item, &is_finish); + if (!is_finish) + break; + } + } + } + for (i = 0; i < remove_cnt; i++) + remove_list_tasks(remove_list[i]); + remove_cnt = 0; + spin_unlock(&list_tasks_lock); +} + +static void check_parameters(void) +{ + if ((whitelist_dump_cnt < 0) || + (whitelist_dump_cnt > DEFAULT_WHITE_DUMP_CNT)) + whitelist_dump_cnt = DEFAULT_WHITE_DUMP_CNT; + if ((whitelist_panic_cnt <= 0) || + (whitelist_panic_cnt > DEFAULT_WHITE_PANIC_CNT)) + whitelist_panic_cnt = DEFAULT_WHITE_PANIC_CNT; +} + +static void send_work_handler(struct work_struct *data) +{ +#ifdef CONFIG_DFX_ZEROHUNG + zrhung_send_event(HUNGTASK_DOMAIN, HUNGTASK_NAME, + report_buf_text); +#endif +} + +static void htbase_report_zrhung_event(const char *report_buf_tag) +{ + htbase_show_state_filter(TASK_UNINTERRUPTIBLE); + pr_err("%s end\n", report_buf_tag); + schedule_work(&send_work); + 
report_zrhung_id++; +} + +static void htbase_report_zrhung(unsigned int event) +{ + bool report_load = false; + char report_buf_tag[REPORT_MSGLENGTH] = {0}; + char report_name[TASK_COMM_LEN + 1] = {0}; + int report_pid = 0; + int report_hungtime = 0; + int report_tasktype = 0; + + if (!event) + return; + if (event & HUNGTASK_EVENT_WHITELIST) { + snprintf(report_buf_tag, sizeof(report_buf_tag), + "hungtask_whitelist_%d", report_zrhung_id); + strncpy(report_name, upload.name, TASK_COMM_LEN); + report_pid = upload.pid; + report_tasktype = TASK_TYPE_WHITE; + report_hungtime = whitelist_dump_cnt * HEARTBEAT_TIME; + report_load = true; + } else { + pr_err("No such event report to zerohung!"); + } + pr_err("%s start\n", report_buf_tag); + if (event & HUNGTASK_EVENT_WHITELIST) + pr_err("report HUNGTASK_EVENT_WHITELIST to zrhung\n"); + if (upload.flag & FLAG_PF_FROZEN) + snprintf(report_buf_text, sizeof(report_buf_text), + "Task %s(%s) pid %d type %d blocked %ds.", + report_name, "FROZEN", report_pid, report_tasktype, report_hungtime); + else + snprintf(report_buf_text, sizeof(report_buf_text), + "Task %s pid %d type %d blocked %ds.", + report_name, report_pid, report_tasktype, report_hungtime); + if (report_load) + htbase_report_zrhung_event(report_buf_tag); +} + +static int print_frozen_list_item(int pid) +{ + int tmp; + + if (!frozed_head) { + tmp = snprintf(frozen_buf, FROZEN_BUF_LEN, "%s", "FROZEN Pid:"); + if (tmp < 0) + return -1; + frozen_used += min(tmp, FROZEN_BUF_LEN - 1); + frozed_head = true; + } + tmp = snprintf(frozen_buf + frozen_used, FROZEN_BUF_LEN - frozen_used, "%d,", + pid); + if (tmp < 0) + return -1; + frozen_used += min(tmp, FROZEN_BUF_LEN - frozen_used - 1); + return frozen_used; +} + +int dump_task_wa(struct task_item *item, int dump_cnt, + struct task_struct *task, unsigned int flag) +{ + int ret = 0; + + if ((item->d_state_time > TWO_MINUTES) && + (item->d_state_time % TWO_MINUTES != 0)) + return ret; + if ((item->d_state_time > HUNG_TEN_MINUTES) && + (item->d_state_time % HUNG_TEN_MINUTES != 0)) + return ret; + if ((item->d_state_time > HUNG_ONE_HOUR) && + (item->d_state_time % HUNG_ONE_HOUR != 0)) + return ret; + if (dump_cnt && (item->dump_wa > dump_cnt)) { + item->dump_wa = 1; + if (!dump_and_upload && task->flags & PF_FROZEN) { + int tmp = print_frozen_list_item(item->pid); + if (tmp < 0) + return ret; + if (tmp >= FROZEN_BUF_LEN - 1) { + pr_err("%s", frozen_buf); + memset(frozen_buf, 0, sizeof(frozen_buf)); + frozen_used = 0; + frozed_head = false; + print_frozen_list_item(item->pid); + } + } else if (!dump_and_upload) { + pr_err("Ready to dump a task %s\n", item->name); + do_show_task(task, flag, item->d_state_time); + ret++; + } + } + return ret; +} + +static void update_panic_task(struct task_item *item) +{ + if (upload.pid != 0) + return; + + upload.pid = item->pid; + upload.tgid = item->tgid; + memset(upload.name, 0, sizeof(upload.name)); + strncpy(upload.name, item->name, sizeof(item->name)); +} + +static void deal_task(struct task_item *item, struct task_struct *task, bool is_called) +{ + int any_dumped_num = 0; + + if (is_called) { + item->dump_wa = 1; + item->panic_wa = 1; + item->d_state_time = 0; + return; + } + if (item->task_type & TASK_TYPE_WHITE) + any_dumped_num = dump_task_wa(item, whitelist_dump_cnt, task, + FLAG_DUMP_WHITE); + if (!is_called && (item->task_type & TASK_TYPE_WHITE)) { + if (whitelist_panic_cnt && item->panic_wa > whitelist_panic_cnt) { + pr_err("Task %s is causing panic\n", item->name); + update_panic_task(item); + item->panic_wa = 
0; + hung_task_must_panic++; + } else { + item->isdone_wa = false; + } + } + if (item->isdone_wa) + remove_list_tasks(item); +} + +static bool check_conditions(struct task_struct *task, unsigned int task_type) +{ + bool no_check = true; + + if (task->flags & PF_FROZEN) + return no_check; + if (task_type & TASK_TYPE_WHITE && + (whitelist_dump_cnt || whitelist_panic_cnt)) + no_check = false; + return no_check; +} + +static void htbase_check_one_task(struct task_struct *t) +{ + unsigned int task_type = TASK_TYPE_IGNORE; + unsigned long switch_count = t->nvcsw + t->nivcsw; + struct task_item *taskitem = NULL; + bool is_called = false; + + if (unlikely(!switch_count)) { + pr_info("skip one's switch_count is zero\n"); + return; + } + + taskitem = find_task(t->pid, &list_tasks); + if (taskitem) { + if (check_conditions(t, taskitem->task_type)) + return; + is_called = refresh_task(taskitem, t); + } else { + task_type = get_task_type(t->pid, t->tgid, t->real_parent); + if (check_conditions(t, task_type)) + return; + taskitem = kmalloc(sizeof(*taskitem), GFP_ATOMIC); + if (!taskitem) { + pr_err("kmalloc failed"); + return; + } + memset(taskitem, 0, sizeof(*taskitem)); + taskitem->task_type = task_type; + create_taskitem(taskitem, t); + is_called = refresh_task(taskitem, t); + insert_task(taskitem, &list_tasks); + } + deal_task(taskitem, t, is_called); +} + +static void htbase_pre_process(void) +{ + htbase_set_timeout_secs(sysctl_hung_task_timeout_secs); + cur_heartbeat++; + if ((cur_heartbeat % REFRESH_INTERVAL) == 0) + do_refresh = 1; + else + do_refresh = 0; + if (do_refresh || (cur_heartbeat < TIME_REFRESH_PIDS)) { + refresh_whitelist_pids(); + check_parameters(); + } +} + +static void htbase_post_process(void) +{ + struct rb_node *n = NULL; + unsigned int hungevent = 0; + + if (frozen_used) { + pr_err("%s", frozen_buf); + memset(frozen_buf, 0, sizeof(frozen_buf)); + frozen_used = 0; + frozed_head = false; + } + if (dump_and_upload == HUNG_TASK_UPLOAD_ONCE) { + hungevent |= HUNGTASK_EVENT_WHITELIST; + dump_and_upload++; + } + if (dump_and_upload > 0) { + time_since_upload++; + if (time_since_upload > (whitelist_panic_cnt - whitelist_dump_cnt)) { + dump_and_upload = 0; + time_since_upload = 0; + } + } + if (hung_task_must_panic) { + htbase_show_state_filter(TASK_UNINTERRUPTIBLE); + hung_task_must_panic = 0; + pr_err("Task %s:%d blocked for %ds is causing panic\n", + upload.name, upload.pid, + whitelist_panic_cnt * HEARTBEAT_TIME); + do_panic(); + } + htuser_post_process_userlist(); + shrink_list_tasks(); + for (n = rb_first(&list_tasks); n != NULL; n = rb_next(n)) { + struct task_item *item = rb_entry(n, struct task_item, node); + item->isdone_wa = true; + } + + if (hungevent) + htbase_report_zrhung(hungevent); +} + +void htbase_check_tasks(unsigned long timeout) +{ + int max_count = PID_MAX_LIMIT; + int batch_count = HUNG_TASK_BATCHING; + struct task_struct *g = NULL; + struct task_struct *t = NULL; + + if (!hungtask_enable) + return; + if (test_taint(TAINT_DIE) || did_panic) { + pr_err("already in doing panic\n"); + return; + } + + htbase_pre_process(); + rcu_read_lock(); + for_each_process_thread(g, t) { + if (!max_count--) + goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + if (!rcu_lock_break(g, t)) + goto unlock; + } + if ((t->state == TASK_UNINTERRUPTIBLE) || + (t->state == TASK_KILLABLE)) + htbase_check_one_task(t); + } +unlock: + rcu_read_unlock(); + htbase_post_process(); +} + +static ssize_t htbase_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, 
+ char *buf) +{ + if (hungtask_enable) + return snprintf(buf, ENABLE_SHOW_LEN, "on\n"); + else + return snprintf(buf, ENABLE_SHOW_LEN, "off\n"); +} + +static ssize_t htbase_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + char tmp[6]; /* only storage "on" "off" "kick" and enter */ + size_t len; + char *p = NULL; + + if (!buf) + return -EINVAL; + if ((count < 2) || (count > (sizeof(tmp) - 1))) { + pr_err("string too long or too short\n"); + return -EINVAL; + } + + p = memchr(buf, '\n', count); + len = p ? (size_t)(p - buf) : count; + memset(tmp, 0, sizeof(tmp)); + strncpy(tmp, buf, len); + if (!strncmp(tmp, "on", strlen(tmp))) { + hungtask_enable = HT_ENABLE; + pr_info("set hungtask_enable to enable\n"); + } else if (!strncmp(tmp, "off", strlen(tmp))) { + hungtask_enable = HT_DISABLE; + pr_info("set hungtask_enable to disable\n"); + } else { + pr_err("only accept on or off\n"); + } + return (ssize_t) count; +} + +static ssize_t htbase_monitorlist_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + char all_buf[WHITELIST_STORE_LEN - 20]; /* exclude extra header len 20*/ + unsigned long len = 0; + + memset(all_buf, 0, sizeof(all_buf)); + for (i = 0; i < WHITELIST_LEN; i++) { + if (whitetmplist[i].pid > 0) { + len += snprintf(all_buf + len, sizeof(all_buf) - len, + "%s-%d,", whitetmplist[i].name, whitetmplist[i].pid); + if (!(len < sizeof(all_buf))) { + len = sizeof(all_buf) - 1; + break; + } + } + } + if (len > 0) + all_buf[len] = 0; + if (whitelist_type == WHITE_LIST) + buf += snprintf(buf, WHITELIST_STORE_LEN, "whitelist:[%s]\n", all_buf); + else if (whitelist_type == BLACK_LIST) + buf += snprintf(buf, WHITELIST_STORE_LEN, "blacklist:[%s]\n", all_buf); + else + buf += snprintf(buf, WHITELIST_STORE_LEN, "\n"); + return buf - start; +} + +static void htbase_monitorlist_update(char **cur) +{ + int index = 0; + char *token = NULL; + + hashlist_clear(whitelist, WHITELIST_LEN); + memset(whitetmplist, 0, sizeof(whitetmplist)); + /* generate the new whitelist */ + for (; ; ) { + token = strsep(cur, ","); + if (token && strlen(token)) { + strncpy(whitetmplist[index].name, token, TASK_COMM_LEN); + if (strlen(whitetmplist[index].name) > 0) + whitelist_empty = false; + index++; + if (index >= WHITELIST_LEN) + break; + } + if (!(*cur)) + break; + } +} + +/* + * monitorlist_store - Called when 'write/echo' method is + * used on entry '/sys/kernel/hungtask/monitorlist'. + */ +static ssize_t htbase_monitorlist_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + size_t len; + char *p = NULL; + char all_buf[WHITELIST_STORE_LEN]; + char *cur = all_buf; + + + if ((n < 2) || (n > (sizeof(all_buf) - 1))) { + pr_err("whitelist input string illegal\n"); + return -EINVAL; + } + if (!buf) + return -EINVAL; + /* + * input format: + * write /sys/kernel/hungtask/monitorlist "whitelist, + * system_server,surfaceflinger" + */ + p = memchr(buf, '\n', n); + len = p ? (size_t)(p - buf) : n; /* exclude the '\n' */ + + memset(all_buf, 0, sizeof(all_buf)); + len = len > WHITELIST_STORE_LEN ? 
WHITELIST_STORE_LEN : len; + strncpy(all_buf, buf, len); + p = strsep(&cur, ","); + if (!cur) { + pr_err("string is not correct\n"); + return -EINVAL; + } + if (!strncmp(p, "whitelist", n)) { + whitelist_type = WHITE_LIST; + } else { + if (!strncmp(p, "blacklist", n)) + pr_err("blacklist is not support\n"); + else + pr_err("wrong list type is set\n"); + return -EINVAL; + } + if (!strlen(cur)) { + pr_err("at least one process need to be set\n"); + return -EINVAL; + } + pr_err("whitelist is %s\n", cur); + + htbase_monitorlist_update(&cur); + /* check again in case user input "whitelist,,,,,," */ + if (whitelist_empty) { + pr_err("at least one process need to be set\n"); + return -EINVAL; + } + return (ssize_t) n; +} + +/* used for sysctl at "/proc/sys/kernel/hung_task_timeout_secs" */ +void htbase_set_timeout_secs(unsigned long new_hungtask_timeout_secs) +{ + if ((new_hungtask_timeout_secs > CONFIG_DEFAULT_HUNG_TASK_TIMEOUT) || + (new_hungtask_timeout_secs % HEARTBEAT_TIME)) + return; + hungtask_timeout_secs = new_hungtask_timeout_secs; + /* + * if user change panic timeout value, we sync it to dump value + * defaultly, user can set it diffrently + */ + whitelist_panic_cnt = (int)(hungtask_timeout_secs / HEARTBEAT_TIME); + if (whitelist_panic_cnt > THIRTY_SECONDS) + whitelist_dump_cnt = whitelist_panic_cnt / HT_DUMP_IN_PANIC_LOOSE; + else + whitelist_dump_cnt = whitelist_panic_cnt / HT_DUMP_IN_PANIC_STRICT; +} + +void htbase_set_panic(int new_did_panic) +{ + did_panic = new_did_panic; +} + +static struct kobj_attribute timeout_attribute = { + .attr = { + .name = "enable", + .mode = 0640, + }, + .show = htbase_enable_show, + .store = htbase_enable_store, +}; + +static struct kobj_attribute monitorlist_attr = { + .attr = { + .name = "monitorlist", + .mode = 0640, + }, + .show = htbase_monitorlist_show, + .store = htbase_monitorlist_store, +}; + +#ifdef CONFIG_DFX_HUNGTASK_USER +static struct kobj_attribute userlist_attr = { + .attr = { + .name = "userlist", + .mode = 0640, + }, + .show = htuser_list_show, + .store = htuser_list_store, +}; +#endif + +static struct attribute *attrs[] = { + &timeout_attribute.attr, + &monitorlist_attr.attr, +#ifdef CONFIG_DFX_HUNGTASK_USER + &userlist_attr.attr, +#endif + NULL +}; + +static struct attribute_group hungtask_attr_group = { + .attrs = attrs, +}; + +static struct kobject *hungtask_kobj; +int htbase_create_sysfs(void) +{ + int i; + int ret; + + /* sleep 1000ms and wait /sys/kernel ready */ + while (!kernel_kobj) + msleep(1000); + + /* Create kobject named "hungtask" located at /sys/kernel/huangtask */ + hungtask_kobj = kobject_create_and_add("hungtask", kernel_kobj); + if (!hungtask_kobj) + return -ENOMEM; + ret = sysfs_create_group(hungtask_kobj, &hungtask_attr_group); + if (ret) + kobject_put(hungtask_kobj); + + for (i = 0; i < WHITELIST_LEN; i++) + INIT_HLIST_HEAD(&whitelist[i]); + memset(whitetmplist, 0, sizeof(whitetmplist)); + + INIT_WORK(&send_work, send_work_handler); + + return ret; +} diff --git a/drivers/staging/hungtask/hungtask_user.c b/drivers/staging/hungtask/hungtask_user.c new file mode 100644 index 000000000000..7070ba197d9b --- /dev/null +++ b/drivers/staging/hungtask/hungtask_user.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) "hungtask_user " fmt + +#include +#include +#include +#include +#include + +#include + +#define CMD_MIN_LEN 3 +#define CMD_MAX_LEN 20 +#define USERLIST_NUM 10 +#define MAX_USER_TIMEOUT 120 +#define MAX_SHOW_LEN 512 + +struct user_item { + pid_t pid; + int cur_cnt; + int panic_cnt; +}; + +static struct user_item userlist[USERLIST_NUM]; +static int userlist_count; +static DEFINE_SPINLOCK(userlist_lock); +static bool is_registered; +static bool need_panic; +static bool need_dump; +static int block_time; +static int block_pid; + +static void htuser_show_task(int pid) +{ + struct task_struct *p = NULL; + + p = pid_task(find_vpid(pid), PIDTYPE_PID); + if (p == NULL) { + pr_err("can not find pid %d\n", pid); + return; + } + + if (p->flags & PF_FROZEN) { + pr_info("process %d is frozen\n", pid); + return; + } + if (p->state == TASK_UNINTERRUPTIBLE) { + pr_err("UserList_KernelStack start\n"); + sched_show_task(p); + pr_err("UserList_KernelStack end\n"); + } +} + +static void htuser_list_insert(int pid, int count) +{ + spin_lock(&userlist_lock); + if (userlist_count >= USERLIST_NUM) { + pr_err("list is full\n"); + spin_unlock(&userlist_lock); + return; + } + userlist[userlist_count].pid = pid; + userlist[userlist_count].cur_cnt = 0; + userlist[userlist_count].panic_cnt = count; + userlist_count++; + spin_unlock(&userlist_lock); +} + +static int htuser_list_remove(int pid) +{ + int i; + + spin_lock(&userlist_lock); + for (i = 0; i < userlist_count; i++) { + if (userlist[i].pid == pid) { + if (i == userlist_count - 1) { + memset(&userlist[i], 0, sizeof(userlist[i])); + } else { + int len = sizeof(userlist[0]) * (userlist_count - i - 1); + memmove(&userlist[i], &userlist[i + 1], len); + } + userlist_count--; + spin_unlock(&userlist_lock); + return 0; + } + } + spin_unlock(&userlist_lock); + return -ENOENT; +} + +static void htuser_list_update(void) +{ + int i; + + need_panic = false; + need_dump = false; + spin_lock(&userlist_lock); + for (i = 0; i < userlist_count; i++) { + userlist[i].cur_cnt++; + if ((userlist[i].cur_cnt >= userlist[i].panic_cnt) || + (userlist[i].cur_cnt == userlist[i].panic_cnt / 2)) { + htuser_show_task(userlist[i].pid); + pr_err("process %d not scheduled for %ds\n", + userlist[i].pid, + userlist[i].cur_cnt * HEARTBEAT_TIME); + } + if (userlist[i].cur_cnt == userlist[i].panic_cnt) { + need_dump = true; + need_panic = true; + block_time = userlist[i].cur_cnt * HEARTBEAT_TIME; + block_pid = userlist[i].pid; + } + } + spin_unlock(&userlist_lock); +} + +static void htuser_list_kick(int pid) +{ + int i; + + spin_lock(&userlist_lock); + for (i = 0; i < userlist_count; i++) { + if (userlist[i].pid == pid) { + userlist[i].cur_cnt = 0; + spin_unlock(&userlist_lock); + return; + } + } + spin_unlock(&userlist_lock); +} + +void htuser_post_process_userlist(void) +{ + htuser_list_update(); + if (need_dump) { + pr_err("print all cpu stack and D state stack\n"); + hungtask_show_state_filter(TASK_UNINTERRUPTIBLE); + } + if (need_panic) + panic("UserList Process %d blocked for %ds causing panic", block_pid, block_time); +} + +static int htuser_process_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + struct task_struct *task = v; + + if (task == NULL) + return NOTIFY_OK; + + if ((task->tgid == 
task->pid) && (!htuser_list_remove(task->tgid))) + pr_err("remove success due to process %d die\n", task->tgid); + + return NOTIFY_OK; +} + +static struct notifier_block htuser_process_notify = { + .notifier_call = htuser_process_notifier, +}; + +ssize_t htuser_list_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i; + char tmp[MAX_SHOW_LEN] = {0}; + int len = 0; + + len += snprintf(tmp + len, MAX_SHOW_LEN - len, + " Pid Current(sec) Expired(sec)\n"); + + spin_lock(&userlist_lock); + for (i = 0; i < userlist_count; i++) { + len += snprintf(tmp + len, MAX_SHOW_LEN - len, + "%5d %5d %5d", userlist[i].pid, + userlist[i].cur_cnt * HEARTBEAT_TIME, + userlist[i].panic_cnt * HEARTBEAT_TIME); + if (len >= MAX_SHOW_LEN) { + len = MAX_SHOW_LEN - 1; + break; + } + } + spin_unlock(&userlist_lock); + pr_info("%s\n", tmp); + strncpy(buf, tmp, len); + + return len; +} + +static int htuser_list_store_on(char *tmp, size_t len, int pid) +{ + unsigned long sec = 0; + + if (kstrtoul(tmp + 3, 10, &sec)) { + pr_err("invalid timeout value\n"); + return -EINVAL; + } + if ((sec > MAX_USER_TIMEOUT) || !sec) { + pr_err("invalid timeout value, should be in 0-%d\n", MAX_USER_TIMEOUT); + return -EINVAL; + } + if (sec % HEARTBEAT_TIME) { + pr_err("invalid timeout value, should be devided by %d\n", HEARTBEAT_TIME); + return -EINVAL; + } + pr_info("process %d set to enable, timeout=%ld\n", pid, sec); + htuser_list_insert(pid, sec / HEARTBEAT_TIME); + if (!is_registered) { + profile_event_register(PROFILE_TASK_EXIT, + &htuser_process_notify); + is_registered = true; + } + + return 0; +} + +ssize_t htuser_list_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + char tmp[CMD_MAX_LEN]; /* on/off/kick */ + size_t len; + char *p = NULL; + int pid = current->tgid; + int uid = current->cred->euid.val; + + if (uid >= 10000) + pr_err("non-system process %d(uid=%d) can not be added to hungtask userlist\n", + pid, uid); + if ((count < CMD_MIN_LEN) || (count > CMD_MAX_LEN)) { + pr_err("string too long or too short\n"); + return -EINVAL; + } + if (!buf) + return -EINVAL; + + memset(tmp, 0, sizeof(tmp)); + p = memchr(buf, '\n', count); + len = p ? (size_t)(p - buf) : count; + strncpy(tmp, buf, len); + + if (strncmp(tmp, "on", CMD_MIN_LEN) == 0) { + if (htuser_list_store_on(tmp, len, pid)) + return -EINVAL; + } else if (unlikely(strncmp(tmp, "off", CMD_MIN_LEN) == 0)) { + pr_info("process %d set to disable\n", pid); + if (!htuser_list_remove(pid)) + pr_err("remove success duet to process %d call off\n", pid); + } else if (likely(strncmp(tmp, "kick", CMD_MIN_LEN) == 0)) { + pr_info("process %d is kicked\n", pid); + htuser_list_kick(pid); + } else { + pr_err("only accept on off or kick\n"); + } + return (ssize_t)count; +} + diff --git a/drivers/staging/hungtask/hungtask_user.h b/drivers/staging/hungtask/hungtask_user.h new file mode 100644 index 000000000000..3cd655cac2d5 --- /dev/null +++ b/drivers/staging/hungtask/hungtask_user.h @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + */ + +#ifndef DFX_HUNGTASK_USER_H +#define DFX_HUNGTASK_USER_H + +#include + +#ifdef CONFIG_DFX_HUNGTASK_USER +void htuser_post_process_userlist(void); +ssize_t htuser_list_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count); +ssize_t htuser_list_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +#else +static inline void htuser_post_process_userlist(void) +{ +} + +static inline ssize_t htuser_list_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return 0; +} +static inline ssize_t htuser_list_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return 0; +} + +#endif + +#endif /* DFX_HUNGTASK_USER_H */ diff --git a/include/dfx/hungtask_base.h b/include/dfx/hungtask_base.h new file mode 100644 index 000000000000..5c280b5b21b5 --- /dev/null +++ b/include/dfx/hungtask_base.h @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef DFX_HUNGTASK_BASE_H +#define DFX_HUNGTASK_BASE_H + +#include +#include +#include + +#define ENABLE_SHOW_LEN 8 +#define WHITELIST_STORE_LEN 400 +#define WHITELIST_LEN 61 +#define WHITE_LIST 1 +#define BLACK_LIST 2 +#define HT_ENABLE 1 +#define HT_DISABLE 0 +#define HEARTBEAT_TIME 3 +#define MAX_LOOP_NUM (CONFIG_DEFAULT_HUNG_TASK_TIMEOUT / HEARTBEAT_TIME) +#define ONE_MINUTE (60 / HEARTBEAT_TIME) +#define ONE_AND_HALF_MINUTE (90 / HEARTBEAT_TIME) +#define TWO_MINUTES (120 / HEARTBEAT_TIME) +#define THREE_MINUTES (180 / HEARTBEAT_TIME) +#define TWENTY_SECONDS (21 / HEARTBEAT_TIME) +#define THIRTY_SECONDS (30 / HEARTBEAT_TIME) +#define HUNG_ONE_HOUR (3600 / HEARTBEAT_TIME) +#define HUNG_TEN_MINUTES (600 / HEARTBEAT_TIME) +#define HUNGTASK_REPORT_TIMECOST TWENTY_SECONDS +#define HT_DUMP_IN_PANIC_LOOSE 5 +#define HT_DUMP_IN_PANIC_STRICT 2 +#define REFRESH_INTERVAL THREE_MINUTES +#define FLAG_DUMP_WHITE (1 << 0) +#define FLAG_DUMP_APP (1 << 1) +#define FLAG_DUMP_NOSCHEDULE (1 << 2) +#define FLAG_DUMP_JANK (1 << 3) +#define FLAG_PANIC (1 << 4) +#define FLAG_PF_FROZEN (1 << 6) +#define TASK_TYPE_IGNORE 0 +#define TASK_TYPE_WHITE (1 << 0) +#define TASK_TYPE_APP (1 << 1) +#define TASK_TYPE_JANK (1 << 2) +#define TASK_TYPE_KERNEL (1 << 3) +#define TASK_TYPE_NATIVE (1 << 4) +#define TASK_TYPE_FROZEN (1 << 6) +#define PID_INIT 1 +#define PID_KTHREAD 2 +#define DEFAULT_WHITE_DUMP_CNT MAX_LOOP_NUM +#define DEFAULT_WHITE_PANIC_CNT MAX_LOOP_NUM +#define HUNG_TASK_UPLOAD_ONCE 1 +#define FROZEN_BUF_LEN 1024 +#define MAX_REMOVE_LIST_NUM 200 +#define HUNGTASK_DOMAIN "KERNEL_VENDOR" +#define HUNGTASK_NAME "HUNGTASK" +#define INIT_FREEZE_NAME "INIT_FREEZE" +#define HUNG_TASK_BATCHING 1024 +#define TIME_REFRESH_PIDS 20 +#define PID_ERROR (-1) +#define HUNGTASK_EVENT_WHITELIST 1 +#define REPORT_MSGLENGTH 200 + +struct task_item { + struct rb_node node; + pid_t pid; + pid_t tgid; + char name[TASK_COMM_LEN + 1]; + unsigned long switch_count; + unsigned 
int task_type; + int dump_wa; + int panic_wa; + int dump_jank; + int d_state_time; + bool isdone_wa; +}; + +struct hashlist_node { + pid_t pid; + struct hlist_node list; +}; + +struct whitelist_item { + pid_t pid; + char name[TASK_COMM_LEN + 1]; +}; + +struct task_hung_upload { + char name[TASK_COMM_LEN + 1]; + pid_t pid; + pid_t tgid; + unsigned int flag; + int duration; +}; + +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned int sysctl_hung_task_panic; + +void do_dump_task(struct task_struct *task); +int dump_task_wa(struct task_item *item, int dump_cnt, + struct task_struct *task, unsigned int flag); +void do_show_task(struct task_struct *task, unsigned int flag, int d_state_time); +void hungtask_show_state_filter(unsigned long state_filter); +int htbase_create_sysfs(void); +void htbase_set_panic(int new_did_panic); +void htbase_set_timeout_secs(unsigned long new_hungtask_timeout_secs); +void htbase_check_tasks(unsigned long timeout); +bool hashlist_find(struct hlist_head *head, int count, pid_t tgid); +void hashlist_clear(struct hlist_head *head, int count); +bool hashlist_insert(struct hlist_head *head, int count, pid_t tgid); + +#endif /* DFX_HUNGTASK_BASE_H */ diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 396ebaebea3f..82cd7297c9c1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,10 @@ #include +#ifdef CONFIG_DFX_HUNGTASK +#include +#endif + /* * The number of tasks checked: */ @@ -51,9 +55,11 @@ unsigned long __read_mostly sysctl_hung_task_check_interval_secs; int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +#ifndef CONFIG_DFX_HUNGTASK static bool hung_task_show_lock; static bool hung_task_call_panic; static bool hung_task_show_all_bt; +#endif static struct task_struct *watchdog_task; @@ -76,7 +82,9 @@ static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { did_panic = 1; - +#ifdef CONFIG_DFX_HUNGTASK + htbase_set_panic(did_panic); +#endif return NOTIFY_DONE; } @@ -84,6 +92,7 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +#ifndef CONFIG_DFX_HUNGTASK static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -212,6 +221,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (hung_task_call_panic) panic("hung_task: blocked tasks"); } +#endif static long hung_timeout_jiffies(unsigned long last_checked, unsigned long timeout) @@ -235,7 +245,9 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, goto out; wake_up_process(watchdog_task); - +#ifdef CONFIG_DFX_HUNGTASK + htbase_set_timeout_secs(sysctl_hung_task_timeout_secs); +#endif out: return ret; } @@ -280,18 +292,26 @@ static int watchdog(void *dummy) set_user_nice(current, 0); for ( ; ; ) { +#ifdef CONFIG_DFX_HUNGTASK + unsigned long timeout = HEARTBEAT_TIME; +#else unsigned long timeout = sysctl_hung_task_timeout_secs; +#endif unsigned long interval = sysctl_hung_task_check_interval_secs; long t; if (interval == 0) interval = timeout; - interval = min_t(unsigned long, interval, timeout); - t = hung_timeout_jiffies(hung_last_checked, interval); + timeout = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, timeout); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0) && !hung_detector_suspended) +#ifdef CONFIG_DFX_HUNGTASK + htbase_check_tasks(timeout); +#else check_hung_uninterruptible_tasks(timeout); +#endif 
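+			/*
+			 * With CONFIG_DFX_HUNGTASK the periodic check is handed
+			 * to htbase_check_tasks(), driven every HEARTBEAT_TIME
+			 * seconds (the timeout chosen above), and the stock
+			 * check_hung_uninterruptible_tasks() pass is compiled out.
+			 */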
hung_last_checked = jiffies; continue; } @@ -303,6 +323,13 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { +#ifdef CONFIG_DFX_HUNGTASK + int ret = 0; + + ret = htbase_create_sysfs(); + if (ret) + pr_err("hungtask: create_sysfs_hungtask fail"); +#endif atomic_notifier_chain_register(&panic_notifier_list, &panic_block); /* Disable hung task detector on suspend */ -- Gitee From 30996666434496f843e2ef3683121726664c05b8 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 9 Dec 2021 11:49:23 +0800 Subject: [PATCH 002/113] sched:Add Window Assisted Load Tracking (WALT) codeaurora inclusion category: feature issue: #I4SA1E CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- A basic WALT patch Based on Code Aurora's latest msm-4.14 Those features are cropped: 1. top tasks 2. predicted load 3. RTG adaptation 4. some tunnable procfs Notes: The double locks are added in routines of move_queued_task and deatch_task due to fixup_busy_time was inserted to set_task_cpu routines. And the caller of fixup_busy_time must hold the double rq lock before calling fixup_busy_time. Or we will have nasty concurrency problems. Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- fs/proc/base.c | 68 ++ include/linux/sched.h | 66 ++ include/linux/sched/cpufreq.h | 2 + include/linux/sched/sysctl.h | 10 + include/trace/events/walt.h | 168 +++ init/Kconfig | 9 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 100 ++ kernel/sched/cpufreq_schedutil.c | 8 + kernel/sched/cputime.c | 15 + kernel/sched/deadline.c | 6 + kernel/sched/debug.c | 21 + kernel/sched/fair.c | 201 +++- kernel/sched/rt.c | 6 + kernel/sched/sched.h | 331 +++++- kernel/sched/stop_task.c | 6 + kernel/sched/walt.c | 1733 ++++++++++++++++++++++++++++++ kernel/sched/walt.h | 237 ++++ kernel/sysctl.c | 34 + 21 files changed, 3016 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/walt.h create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/fs/proc/base.c b/fs/proc/base.c index 9478b78f53ce..96d4ab81619e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -1573,6 +1574,70 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_SCHED_WALT +static int sched_init_task_load_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + seq_printf(m, "%d\n", sched_get_init_task_load(p)); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_init_task_load_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int init_task_load, err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &init_task_load); + if (err) + goto out; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = sched_set_init_task_load(p, init_task_load); + + put_task_struct(p); + +out: + return err < 0 ? 
err : count; +} + +static int sched_init_task_load_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_init_task_load_show, inode); +} + +static const struct file_operations proc_pid_sched_init_task_load_operations = { + .open = sched_init_task_load_open, + .read = seq_read, + .write = sched_init_task_load_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { @@ -3208,6 +3273,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_WALT + REG("sched_init_task_load", 00644, proc_pid_sched_init_task_load_operations), +#endif #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index d42f4addcaec..e4b281653f7c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -213,6 +213,15 @@ struct io_uring_task; /* Task command name length: */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -495,6 +504,53 @@ struct sched_entity { #endif }; +#ifdef CONFIG_SCHED_WALT +extern void sched_exit(struct task_struct *p); +extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct); +extern u32 sched_get_init_task_load(struct task_struct *p); +extern void free_task_load_ptrs(struct task_struct *p); +#define RAVG_HIST_SIZE_MAX 5 +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; + u32 curr_window, prev_window; + u16 active_windows; + u16 demand_scaled; +}; +#else +static inline void sched_exit(struct task_struct *p) { } +static inline void free_task_load_ptrs(struct task_struct *p) { } +#endif /* CONFIG_SCHED_WALT */ + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -700,6 +756,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..c7cf63236f5b 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,6 +9,8 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_WALT (1U << 1) +#define SCHED_CPUFREQ_CONTINUE (1U << 2) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca5..210909cd4141 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -30,6 +30,16 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_cpu_high_irqload; + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/walt.h b/include/trace/events/walt.h new file mode 100644 index 000000000000..e5328b75a8bd --- /dev/null +++ b/include/trace/events/walt.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM walt + +#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WALT_H + +#include +#include + +struct rq; +extern const char *task_event_names[]; + +#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT) +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} + +static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return 
rq->nt_curr_runnable_sum; + else + return rq->curr_runnable_sum; + else + if (new) + return rq->nt_prev_runnable_sum; + else + return rq->prev_runnable_sum; +} +#endif + +TRACE_EVENT(sched_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + enum task_event evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, runtime) + __field(int, samples) + __field(enum task_event, evt) + __field(unsigned int, demand) + __array(u32, hist, RAVG_HIST_SIZE_MAX) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %s demand %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, + task_event_names[__entry->evt], __entry->demand, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(sched_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(pid_t, cur_pid) + __field(unsigned int, cur_freq) + __field(u64, wallclock) + __field(u64, mark_start) + __field(u64, delta_m) + __field(u64, win_start) + __field(u64, delta) + __field(u64, irqtime) + __field(enum task_event, evt) + __field(unsigned int, demand) + __field(unsigned int, sum) + __field(int, cpu) + __field(u64, rq_cs) + __field(u64, rq_ps) + __field(u32, curr_window) + __field(u32, prev_window) + __dynamic_array(u32, curr_sum, nr_cpu_ids) + __dynamic_array(u32, prev_sum, nr_cpu_ids) + __field(u64, nt_cs) + __field(u64, nt_ps) + __field(u32, active_windows) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cluster->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->rq_cs = rq->curr_runnable_sum; + __entry->rq_ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu); + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, 
__entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows) +); + +#endif /* _TRACE_WALT_H */ + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index fc4c9f416fad..8b20edacf921 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,15 @@ config SCHED_THERMAL_PRESSURE This requires the architecture to implement arch_set_thermal_pressure() and arch_get_thermal_pressure(). +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/exit.c b/kernel/exit.c index d13d67fc5f4e..795e16ecc422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -765,6 +765,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/fork.c b/kernel/fork.c index 5d7a6821d59c..be79601c0e97 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2373,6 +2373,7 @@ static __latent_entropy struct task_struct *copy_process( perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..75ab238bde9d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,6 +27,7 @@ obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4551d1736fa..aed3b931e670 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include "pelt.h" #include "smp.h" +#include "walt.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -1745,8 +1746,15 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, lockdep_assert_held(&rq->lock); deactivate_task(rq, p, DEQUEUE_NOCLOCK); +#ifdef CONFIG_SCHED_WALT + double_lock_balance(rq, cpu_rq(new_cpu)); +#endif set_task_cpu(p, new_cpu); +#ifdef CONFIG_SCHED_WALT + double_rq_unlock(cpu_rq(new_cpu), rq); +#else rq_unlock(rq, rf); +#endif rq = cpu_rq(new_cpu); @@ -2007,6 +2015,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -2794,6 +2803,26 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * accesses to the task state; see try_to_wake_up() and set_current_state(). 
*/ +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +/* utility function to update walt signals at wakeup */ +static inline void walt_try_to_wake_up(struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + + rq_lock_irqsave(rq, &rf); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +#define walt_try_to_wake_up(a) {} +#endif +#endif + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -2928,6 +2957,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_acquire__after_ctrl_dep(); + walt_try_to_wake_up(p); + /* * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq * == 0), which means we need to do an enqueue, change p->state to @@ -3233,6 +3264,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; + init_new_task_load(p); __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -3363,6 +3395,8 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); + mark_task_starting(p); + activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3995,6 +4029,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + u64 wallclock; unsigned long thermal_pressure; arch_scale_freq_tick(); @@ -4002,6 +4037,9 @@ void scheduler_tick(void) rq_lock(rq, &rf); + set_window_start(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); @@ -4423,6 +4461,7 @@ static void __sched notrace __schedule(bool preempt) struct rq_flags rf; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -4505,7 +4544,14 @@ static void __sched notrace __schedule(bool preempt) clear_tsk_need_resched(prev); clear_preempt_need_resched(); + wallclock = sched_ktime_clock(); if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -4535,6 +4581,7 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } @@ -6949,6 +6996,11 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; update_max_interval(); @@ -6971,6 +7023,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7000,6 +7053,8 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + 
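+	/*
+	 * Set up WALT's CPU cluster topology (struct sched_cluster) now
+	 * that the scheduling domains are in place.
+	 */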
update_cluster_topology(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); @@ -7062,6 +7117,8 @@ void __init sched_init(void) wait_bit_init(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7173,6 +7230,7 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + walt_sched_init_rq(rq); INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7203,6 +7261,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -8481,3 +8540,44 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_WALT +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + u64 wallclock; + + rq = task_rq_lock(p, &rf); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + /* + * task's contribution is already removed from the + * cumulative window demand in dequeue. As the + * task's stats are reset, the next enqueue does + * not change the cumulative window demand. 
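+	 * Resetting the stats and stamping sum_history[0] with
+	 * EXITING_TASK_MARKER below stops further demand sampling for this
+	 * task (see the exiting_task() checks in the update paths).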
+ */ + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &rf); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae086..cb72dc5c2002 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -287,6 +287,10 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); +#ifdef CONFIG_SCHED_WALT + return cpu_util_freq_walt(sg_cpu->cpu); +#endif + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } @@ -520,7 +524,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) ignore_dl_rate_limit(sg_cpu, sg_policy); +#ifdef CONFIG_SCHED_WALT + if (sugov_should_update_freq(sg_policy, time) && !(flags & SCHED_CPUFREQ_CONTINUE)) { +#else if (sugov_should_update_freq(sg_policy, time)) { +#endif next_f = sugov_next_freq_shared(sg_cpu, time); if (sg_policy->policy->fast_switch_enabled) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300452..cf87d3fff5dd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ * Simple CPU accounting cgroup controller */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -52,11 +53,18 @@ void irqtime_account_irq(struct task_struct *curr) struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; @@ -70,6 +78,13 @@ void irqtime_account_irq(struct task_struct *curr) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); +#ifdef CONFIG_SCHED_WALT + else + account = false; + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); +#endif } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8255267ce323..2a64cced37a5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include "sched.h" #include "pelt.h" +#include "walt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1440,6 +1441,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1454,6 +1456,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -2547,6 +2550,9 @@ const struct sched_class dl_sched_class .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; int sched_dl_global_validate(void) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 70a578272436..e5af311230be 
100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -715,6 +715,17 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); +#ifdef CONFIG_SCHED_WALT + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg", + rq->walt_stats.cumulative_runnable_avg_scaled); +#endif #undef P #undef PN @@ -791,6 +802,12 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_WALT + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); +#endif #undef PN #undef P @@ -983,6 +1000,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_idle); +#ifdef CONFIG_SCHED_WALT + P(ravg.demand); +#endif + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c004e3b89c32..f30bd5d6d655 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,34 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "walt.h" + +#ifdef CONFIG_SCHED_WALT +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif + +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH) +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq); +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +#else +static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {} +static inline void +walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} +static inline void +walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +#define walt_inc_throttled_cfs_rq_stats(...) +#define walt_dec_throttled_cfs_rq_stats(...) 
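+/*
+ * With WALT or CFS bandwidth control disabled there is no per-cfs_rq
+ * throttle accounting to do, so the hooks above compile to no-ops.
+ */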
+ +#endif /* * Targeted preemption latency for CPU-bound tasks: @@ -1559,7 +1587,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum @@ -3902,6 +3929,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return READ_ONCE(p->se.avg.util_avg); } @@ -3914,6 +3945,10 @@ static inline unsigned long _task_util_est(struct task_struct *p) static inline unsigned long task_util_est(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return max(task_util(p), _task_util_est(p)); } @@ -4826,13 +4861,16 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq); + } /* * Note: distribution will already see us throttled via the @@ -4849,6 +4887,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4877,6 +4916,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4891,7 +4931,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; - + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4907,6 +4947,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); unthrottle_throttle: /* @@ -5291,6 +5332,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + walt_init_cfs_rq_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5470,8 +5512,6 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); @@ -5544,6 +5584,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; + walt_inc_cfs_rq_stats(cfs_rq, p); + flags = ENQUEUE_WAKEUP; } @@ -5556,6 +5598,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5571,7 
+5614,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); - + inc_rq_walt_stats(rq, p); /* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the @@ -5638,6 +5681,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; + walt_dec_cfs_rq_stats(cfs_rq, p); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5662,6 +5707,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5671,6 +5717,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + dec_rq_walt_stats(rq, p); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -6382,11 +6429,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * * Return: the (estimated) utilization for the specified CPU */ -static inline unsigned long cpu_util(int cpu) +unsigned long cpu_util(int cpu) { struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + u64 walt_cpu_util = + cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, + capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6414,10 +6471,29 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util) && + p->state == TASK_WAKING) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6523,6 +6599,18 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. 
compute_energy() predicts what will be the utilization @@ -7641,7 +7729,15 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); +#ifdef CONFIG_SCHED_WALT + double_lock_balance(env->src_rq, env->dst_rq); + if (!(env->src_rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(env->src_rq); +#endif set_task_cpu(p, env->dst_cpu); +#ifdef CONFIG_SCHED_WALT + double_unlock_balance(env->src_rq, env->dst_rq); +#endif } /* @@ -11269,6 +11365,9 @@ const struct sched_class fair_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG @@ -11321,6 +11420,94 @@ __init void init_sched_fair_class(void) } +/* WALT sched implementation begins here */ +#ifdef CONFIG_SCHED_WALT + +#ifdef CONFIG_CFS_BANDWIDTH + +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->walt_stats.cumulative_runnable_avg_scaled = 0; +} + +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + p->ravg.demand_scaled); +} + +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + -(s64)p->ravg.demand_scaled); +} + +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + +} + +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + /* + * We remove the throttled cfs_rq's tasks's contribution from the + * cumulative window demand so that the same can be added + * unconditionally when the cfs_rq is unthrottled. + */ + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); +} + +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->walt_stats, + task_load_delta); + walt_fixup_cum_window_demand(rq, task_load_delta); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + fixup_walt_sched_stats_common(rq, p, updated_demand_scaled); +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_SCHED_WALT */ + /* * Helper functions to facilitate extracting info from tracepoints. 
*/ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dae1e8eaa983..5938cf2e421b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include "pelt.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1389,6 +1390,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1400,6 +1402,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } @@ -2480,6 +2483,9 @@ const struct sched_class rt_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08db8e095e48..6fd06c16ee24 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -87,6 +87,47 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_ravg_window; +extern unsigned int walt_cpu_util_freq_divisor; + +struct walt_sched_stats { + u64 cumulative_runnable_avg_scaled; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +#define NUM_TRACKED_WINDOWS 2 + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; +}; + +extern unsigned int sched_disable_window_stats; +#endif /* CONFIG_SCHED_WALT */ + + /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -594,6 +635,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + struct walt_sched_stats walt_stats; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -604,6 +649,9 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1008,6 +1056,25 @@ struct rq { u64 max_idle_balance_cost; #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_WALT + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + unsigned long walt_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1837,6 +1904,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif +#ifdef CONFIG_SCHED_WALT 
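+	/*
+	 * Propagate a change in a task's WALT demand (demand_scaled) into
+	 * the class's cumulative runnable statistics; implemented by
+	 * fixup_walt_sched_stats_common() and walt_fixup_sched_stats_fair().
+	 */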
+ void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -2052,6 +2123,15 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifdef CONFIG_SCHED_WALT +u64 sched_ktime_clock(void); +#else +static inline u64 sched_ktime_clock(void) +{ + return sched_clock(); +} +#endif + #ifndef arch_scale_freq_tick static __always_inline void arch_scale_freq_tick(void) @@ -2077,7 +2157,14 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +unsigned long capacity_curr_of(int cpu); +unsigned long cpu_util(int cpu); + #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_disabled; +#endif #ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -2390,11 +2477,20 @@ DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; + u64 clock; +#ifdef CONFIG_SCHED_WALT + if (!(flags & SCHED_CPUFREQ_WALT)) + return; + + clock = sched_ktime_clock(); +#else + clock = rq_clock(rq); +#endif data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, clock, flags); } #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} @@ -2644,3 +2740,236 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_SCHED_WALT +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern struct mutex policy_mutex; +extern unsigned int sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int __read_mostly sched_init_task_load_windows; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + rq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)rq->cum_window_demand_scaled < 0)) + rq->cum_window_demand_scaled = 0; +} + +/* Is frequency of two cpus synchronized with each other? 
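+ * Two CPUs count as synchronized when dst_cpu is in src_cpu's
+ * rq->freq_domain_cpumask, i.e. they belong to the same frequency domain.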
*/ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +extern void reset_task_stats(struct task_struct *p); + +#define CPU_RESERVED 1 +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_and_set_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return cluster->max_freq; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static inline void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_possible_cpu(i) { + if (!cpu_active(i)) + continue; + + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static inline int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. 
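+ * E.g. with min_max_freq at 1.2GHz, a cluster whose max_freq is 1.8GHz
+ * gets 1024 * 1800 / 1200 = 1536.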
+ */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +static inline int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return cpu_max_possible_capacity(cpu); +} + +static inline unsigned long cpu_util_freq_walt(int cpu) +{ + u64 util; + struct rq *rq = cpu_rq(cpu); + unsigned long capacity = capacity_orig_of(cpu); + + if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util)) + return cpu_util(cpu); + + util = rq->prev_runnable_sum << SCHED_CAPACITY_SHIFT; + util = div_u64(util, sched_ravg_window); + + return (util >= capacity) ? capacity : util; +} +#else /* CONFIG_SCHED_WALT */ +static inline void walt_fixup_cum_window_demand(struct rq *rq, + s64 scaled_delta) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline int is_reserved(int cpu) +{ + return 0; +} + +static inline void clear_reserved(int cpu) { } +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..ae43901c57af 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_SMP static int @@ -47,12 +48,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) @@ -133,4 +136,7 @@ const struct sched_class stop_sched_class .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..753b852ab340 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1733 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * walt.c + * + * Window Assistant Load Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "sched.h" +#include "walt.h" +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 +#define SCHED_ACCOUNT_WAIT_TIME 1 + +static ktime_t ktime_last; +static bool sched_ktime_suspended; +DEFINE_MUTEX(cluster_lock); +static atomic64_t walt_irq_work_lastq_ws; +u64 walt_load_reported_window; + +static struct irq_work walt_cpufreq_irq_work; +static struct irq_work walt_migration_irq_work; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +static void acquire_rq_locks_irqsave(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + int level = 0; + + local_irq_save(*flags); + for_each_cpu(cpu, cpus) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } +} + +static void release_rq_locks_irqrestore(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + + for_each_cpu(cpu, cpus) + raw_spin_unlock(&cpu_rq(cpu)->lock); + local_irq_restore(*flags); +} + +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW 20000000 +#endif + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled; + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +__read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +__read_mostly unsigned int sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; + +static __read_mostly unsigned int sched_io_is_busy = 1; + +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +unsigned int sysctl_sched_walt_init_task_load_pct = 15; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + +/* + * A after-boot constant divisor for cpu_util_freq_walt() to apply the load + * boost. + */ +__read_mostly unsigned int walt_cpu_util_freq_divisor; + +/* Initial task load. Newly created tasks are assigned this load. 
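+ * Both sched_init_task_load_windows and the _scaled variant below are
+ * derived from sysctl_sched_init_task_load_pct (a percentage of the
+ * window, 15 by default).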
*/ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sched_init_task_load_windows_scaled; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. + */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} +early_param("sched_ravg_window", set_sched_ravg_window); + +__read_mostly unsigned int walt_scale_demand_divisor; +#define scale_demand(d) ((d)/walt_scale_demand_divisor) + +void inc_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_inc_cumulative_runnable_avg(rq, p); +} + +void dec_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_dec_cumulative_runnable_avg(rq, p); +} + +void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + fixup_cumulative_runnable_avg(&rq->walt_stats, task_load_delta); + + walt_fixup_cum_window_demand(rq, task_load_delta); +} + +static u64 +update_window_start(struct rq *rq, u64 wallclock, int event) +{ + s64 delta; + int nr_windows; + u64 old_window_start = rq->window_start; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return old_window_start; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; + + rq->cum_window_demand_scaled = + rq->walt_stats.cumulative_runnable_avg_scaled; + + return old_window_start; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. 
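+				 * In practice (3 * nr_windows) / (4 * nr_windows) reduces
+				 * to a single 3/4 step regardless of how many windows
+				 * elapsed; ten or more windows clear avg_irqload entirely
+				 * in the else branch.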
*/ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * The idle exit time is not accounted for the first task _picked_ up to + * run on the idle CPU. + */ + if (event == PICK_NEXT_TASK && rq->curr == rq->idle) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. + */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + 
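+	/*
+	 * @cpu itself is handled by the caller; for every other CPU in the
+	 * cluster, park the task's per-CPU window contributions in that rq's
+	 * load_subs[] so account_load_subtractions() can apply them to that
+	 * rq's runnable sums later, without taking every remote rq->lock here.
+	 */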
cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + bool new_task; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) + return; + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. 
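+	 * The last_sleep_ts check limits this to tasks that last slept inside
+	 * the source rq's current window; cum_window_demand_scaled is re-based
+	 * at every window rollover, so older sleepers no longer contribute.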
+ */ + if (p->state == TASK_WAKING && p->last_sleep_ts >= + src_rq->window_start) { + walt_fixup_cum_window_demand(src_rq, + -(s64)p->ravg.demand_scaled); + walt_fixup_cum_window_demand(dest_rq, p->ravg.demand_scaled); + } + + new_task = is_new_task(p); + + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + + if (!same_freq_domain(new_cpu, task_cpu(p))) + irq_work_queue(&walt_migration_irq_work); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (likely(rq->window_start)) + return; + + if (!sync_cpu_available) { + rq->window_start = 1; + sync_cpu_available = 1; + atomic64_set(&walt_irq_work_lastq_ws, rq->window_start); + walt_load_reported_window = + atomic64_read(&walt_irq_work_lastq_ws); + + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + u16 demand_scaled; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + demand_scaled = scale_demand(demand); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements walt stats + * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
+ */ + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p) + && p->sched_class->fixup_walt_sched_stats) + p->sched_class->fixup_walt_sched_stats(rq, p, + demand_scaled); + else if (rq->curr == p) + walt_fixup_cum_window_demand(rq, demand_scaled); + } + + p->ravg.demand = demand; + p->ravg.demand_scaled = demand_scaled; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); + + delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; + + return delta; +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. 
+ */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + int cpu = rq->cpu; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. + */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) + rollover_cpu_window(rq, full_window); + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. 
*/ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * window then that is all that need be accounted. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + return; +} + +static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq) +{ + u64 result; + + if (old_window_start == rq->window_start) + return; + + result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, + rq->window_start); + if (result == old_window_start) + irq_work_queue(&walt_cpufreq_irq_work); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 old_window_start; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + old_window_start = update_window_start(rq, wallclock, event); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime); + + if (exiting_task(p)) + goto done; + +done: + p->ravg.mark_start = wallclock; + + run_walt_irq_work(old_window_start, rq); +} + +int sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, length, ppos); + if (rc) + return rc; + + sysctl_sched_init_task_load_pct = sysctl_sched_walt_init_task_load_pct; + + return 0; +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; + u32 init_load_pct = current->init_load_pct; + + p->last_sleep_ts = 0; + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + init_load_windows_scaled = scale_demand(init_load_windows); + } + + p->ravg.demand = init_load_windows; + p->ravg.demand_scaled = init_load_windows_scaled; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. 
+ */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; +unsigned int max_power_cost = 1; + +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct list_head cluster_head; + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + pr_warn("Cluster allocation failed. 
Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = topology_get_cpu_scale(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +void walt_update_min_max_capacity(void) +{ + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int +compare_clusters(void *priv, const struct list_head *a, const struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .min_freq = 1, + .max_possible_freq = 1, + .exec_scale_factor = 1024, +}; + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +static unsigned long cpu_max_table_freq[NR_CPUS]; + +void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + unsigned long flags; + + cpumask_copy(&cpumask, cpus); + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + } + + __update_min_max_capacity(); + + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_CREATE_POLICY) + return 0; + + walt_update_min_max_capacity(); + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) + cpu_max_table_freq[i] = policy->cpuinfo.max_freq; + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, 
&policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->policy->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_DONE; + + if (cpu_cur_freq(cpu) == new_freq) + return NOTIFY_OK; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_walt_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_walt_callback); + +/* + * Runs in hard-irq context. This should ideally run just after the latest + * window roll-over. + */ +void walt_irq_work(struct irq_work *irq_work) +{ + struct sched_cluster *cluster; + struct rq *rq; + int cpu; + u64 wc; + bool is_migration = false; + int level = 0; + + /* Am I the window rollover work or the migration work? 
*/ + if (irq_work == &walt_migration_irq_work) + is_migration = true; + + for_each_cpu(cpu, cpu_possible_mask) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } + + wc = sched_ktime_clock(); + walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); + for_each_sched_cluster(cluster) { + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(cpu, &cluster->cpus) { + rq = cpu_rq(cpu); + if (rq->curr) { + update_task_ravg(rq->curr, rq, + TASK_UPDATE, wc, 0); + account_load_subtractions(rq); + } + } + + raw_spin_unlock(&cluster->load_lock); + } + + for_each_sched_cluster(cluster) { + cpumask_t cluster_online_cpus; + unsigned int num_cpus, i = 1; + + cpumask_and(&cluster_online_cpus, &cluster->cpus, + cpu_online_mask); + num_cpus = cpumask_weight(&cluster_online_cpus); + for_each_cpu(cpu, &cluster_online_cpus) { + int flag = SCHED_CPUFREQ_WALT; + + rq = cpu_rq(cpu); + + if (i == num_cpus) + cpufreq_update_util(cpu_rq(cpu), flag); + else + cpufreq_update_util(cpu_rq(cpu), flag | + SCHED_CPUFREQ_CONTINUE); + i++; + } + } + + for_each_cpu(cpu, cpu_possible_mask) + raw_spin_unlock(&cpu_rq(cpu)->lock); +} + +static void walt_init_once(void) +{ + init_irq_work(&walt_migration_irq_work, walt_irq_work); + init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); + + walt_cpu_util_freq_divisor = + (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100; + walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + sched_init_task_load_windows_scaled = + scale_demand(sched_init_task_load_windows); +} + +void walt_sched_init_rq(struct rq *rq) +{ + static bool init; + int j; + + if (!init) { + walt_init_once(); + init = true; + } + + cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask); + + rq->walt_stats.cumulative_runnable_avg_scaled = 0; + rq->window_start = 0; + rq->walt_flags = 0; + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; + + /* + * All cpus part of same cluster by default. This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + rq->cum_window_demand_scaled = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) + memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions)); +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..fcb1555d53f8 --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * walt.h + * + * head file for Window-Assistant-Load-Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +#include + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +#define SCHED_NEW_TASK_WINDOWS 5 + +extern unsigned int sched_ravg_window; +extern unsigned int sysctl_sched_walt_init_task_load_pct; + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; +} + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +static inline void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +extern void reset_task_stats(struct task_struct *p); +extern void update_cluster_topology(void); +extern void init_clusters(void); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); + +static inline void +fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); +} + +static inline void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled); + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled); +} + +static inline void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, + -(s64)p->ravg.demand_scaled); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled); +} +extern void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +extern void inc_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void dec_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void init_new_task_load(struct task_struct *p); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +void account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); + +void walt_irq_work(struct irq_work *irq_work); + +void walt_sched_init_rq(struct rq *rq); + +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +#else /* CONFIG_SCHED_WALT */ +static inline void walt_sched_init_rq(struct rq *rq) { } + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void +inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void init_new_task_load(struct task_struct *p) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void update_cluster_topology(void) { } +static inline void init_clusters(void) { } + +static inline void +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) { } + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } + +static inline u64 sched_irqload(int cpu) +{ + return 0; +} +static inline int sched_cpu_high_irqload(int cpu) +{ + return 0; +} +#endif /* CONFIG_SCHED_WALT */ + +#endif /* __WALT_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1d8b4358aa11..f13b9e456f50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,40 @@ static struct ctl_table kern_table[] = { .mode = 0644, 
.proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_walt_init_task_load_pct_sysctl_handler, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", -- Gitee From ed5ad643ba013e9df793b5d72ea71b260e97ce0f Mon Sep 17 00:00:00 2001 From: Xi_Yuhao Date: Sun, 30 Jan 2022 16:13:43 +0800 Subject: [PATCH 003/113] binder:Use ioctl cmd to support access token rather than changing existing struct ohos inclusion category: feature issue: #I4SCSR CVE: NA ----------- tokendid is used for special app security control Signed-off-by: Xi_Yuhao --- drivers/android/binder.c | 58 ++++++++++++++++++++++++++--- include/uapi/linux/android/binder.h | 26 ++++++------- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e58dd44eee3d..3604f0df6896 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -95,6 +95,15 @@ DEFINE_SHOW_ATTRIBUTE(proc); #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) +#ifdef CONFIG_ACCESS_TOKENID +#define ENABLE_ACCESS_TOKENID 1 +#else +#define ENABLE_ACCESS_TOKENID 0 +#endif /* CONFIG_ACCESS_TOKENID */ + +#define ACCESS_TOKENID_FEATURE_VALUE (ENABLE_ACCESS_TOKENID << 0) +#define BINDER_CURRENT_FEATURE_SET ACCESS_TOKENID_FEATURE_VALUE + enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, @@ -4455,10 +4464,6 @@ static int binder_thread_read(struct binder_proc *proc, trd->code = t->code; trd->flags = t->flags; trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); -#ifdef CONFIG_ACCESS_TOKENID - trd->sender_tokenid = t->sender_tokenid; - trd->first_tokenid = t->first_tokenid; -#endif /* CONFIG_ACCESS_TOKENID */ t_from = binder_get_txn_from(t); if (t_from) { @@ -5093,7 +5098,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ret = -EINVAL; goto err; } - if (put_user(BINDER_CURRENT_PROTOCOL_VERSION + BINDER_SUB_VERSION, + if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, &ver->protocol_version)) { ret = -EINVAL; goto err; @@ -5137,6 +5142,49 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_FEATURE_SET: { + struct binder_feature_set __user *features = ubuf; + + if (size != sizeof(struct binder_feature_set)) { + ret = -EINVAL; + goto err; + } + if (put_user(BINDER_CURRENT_FEATURE_SET, &features->feature_set)) { + ret = -EINVAL; + goto err; + } + break; + } +#ifdef CONFIG_ACCESS_TOKENID + case BINDER_GET_ACCESS_TOKEN: { + struct access_token __user *tokens = ubuf; + u64 token, ftoken; + + if (size != sizeof(struct access_token)) { + ret = -EINVAL; + goto err; + } + binder_inner_proc_lock(proc); + if (thread->transaction_stack == NULL) { + ret = -EFAULT; 
+ binder_inner_proc_unlock(proc); + goto err; + } + token = thread->transaction_stack->sender_tokenid; + ftoken = thread->transaction_stack->first_tokenid; + + binder_inner_proc_unlock(proc); + if (put_user(token, &tokens->sender_tokenid)) { + ret = -EINVAL; + goto err; + } + if (put_user(ftoken, &tokens->first_tokenid)) { + ret = -EINVAL; + goto err; + } + break; + } +#endif /* CONFIG_ACCESS_TOKENID */ default: ret = -EINVAL; goto err; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 1b75626269a3..3abb5b15aa71 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -195,15 +195,6 @@ struct binder_version { #define BINDER_CURRENT_PROTOCOL_VERSION 8 #endif -#ifdef CONFIG_ACCESS_TOKENID -#define ENABLE_ACCESS_TOKENID 1 -#else -#define ENABLE_ACCESS_TOKENID 0 -#endif /* CONFIG_ACCESS_TOKENID */ - -#define BINDER_SUB_VERSION_SHIFT_BASE 16 -#define BINDER_SUB_VERSION (ENABLE_ACCESS_TOKENID << BINDER_SUB_VERSION_SHIFT_BASE) - /* * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields. * Set ptr to NULL for the first call to get the info for the first node, and @@ -226,6 +217,16 @@ struct binder_node_info_for_ref { __u32 reserved3; }; +struct binder_feature_set { + __u64 feature_set; +}; + +struct access_token { + __u64 sender_tokenid; + __u64 first_tokenid; + __u64 reserved[2]; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -237,6 +238,9 @@ struct binder_node_info_for_ref { #define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) #define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object) +#define BINDER_FEATURE_SET _IOWR('b', 30, struct binder_feature_set) +#define BINDER_GET_ACCESS_TOKEN _IOWR('b', 31, struct access_token) + /* * NOTE: Two special error codes you should check for when calling * in to the driver are: @@ -293,10 +297,6 @@ struct binder_transaction_data { } ptr; __u8 buf[8]; } data; -#ifdef CONFIG_ACCESS_TOKENID - __u64 sender_tokenid; - __u64 first_tokenid; -#endif /* CONFIG_ACCESS_TOKENID */ }; struct binder_transaction_data_secctx { -- Gitee From a7b6fde865c95687c190e3f47b0ee47c27fcdf51 Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Mon, 10 Jan 2022 21:07:18 +0800 Subject: [PATCH 004/113] hyperhold: add hyperhold as zram backing store device ohos inclusion category: feature issue: #I4RXQ3 CVE: NA ----------------- This patch grouped zram objects by mem cgroups, and set hyperhold as a backing store device of zram which providing group swapout/swapin operations. The group swap operations would be invoked by "zswapd". 
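A minimal usage sketch of the per-device "group" control added by this patch. This is hedged: it assumes the attribute is exposed as /sys/block/zram0/group (the usual zram sysfs location for device attributes), and the group mode has to be chosen before the device is initialized, since group_store() returns -EBUSY once disksize is set. "disable", "readonly" and "readwrite" are the values accepted by group_store() below.

    # pick the group mode first, then initialize the device
    echo readwrite > /sys/block/zram0/group
    echo 2G > /sys/block/zram0/disksize
    # read back the current mode ("disable" / "readonly" / "readwrite")
    cat /sys/block/zram0/group
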
Signed-off-by: Chengke Wang --- drivers/Kconfig | 2 + drivers/Makefile | 3 + drivers/block/zram/Kconfig | 2 + drivers/block/zram/Makefile | 5 + drivers/block/zram/zram_drv.c | 154 ++-- drivers/block/zram/zram_drv.h | 102 +++ drivers/block/zram/zram_group/Kconfig | 24 + .../block/zram/zram_group/group_writeback.c | 704 ++++++++++++++++++ drivers/block/zram/zram_group/zlist.c | 227 ++++++ drivers/block/zram/zram_group/zlist.h | 96 +++ drivers/block/zram/zram_group/zram_group.c | 590 +++++++++++++++ drivers/block/zram/zram_group/zram_group.h | 96 +++ drivers/hyperhold/Kconfig | 12 + drivers/hyperhold/Makefile | 4 + drivers/hyperhold/hp_core.c | 654 ++++++++++++++++ drivers/hyperhold/hp_device.c | 78 ++ drivers/hyperhold/hp_device.h | 23 + drivers/hyperhold/hp_iotab.c | 271 +++++++ drivers/hyperhold/hp_iotab.h | 62 ++ drivers/hyperhold/hp_space.c | 122 +++ drivers/hyperhold/hp_space.h | 30 + drivers/hyperhold/hyperhold.h | 52 ++ include/linux/hyperhold_inf.h | 23 + 23 files changed, 3278 insertions(+), 58 deletions(-) create mode 100644 drivers/block/zram/zram_group/Kconfig create mode 100644 drivers/block/zram/zram_group/group_writeback.c create mode 100644 drivers/block/zram/zram_group/zlist.c create mode 100644 drivers/block/zram/zram_group/zlist.h create mode 100644 drivers/block/zram/zram_group/zram_group.c create mode 100644 drivers/block/zram/zram_group/zram_group.h create mode 100644 drivers/hyperhold/Kconfig create mode 100644 drivers/hyperhold/Makefile create mode 100644 drivers/hyperhold/hp_core.c create mode 100644 drivers/hyperhold/hp_device.c create mode 100644 drivers/hyperhold/hp_device.h create mode 100644 drivers/hyperhold/hp_iotab.c create mode 100644 drivers/hyperhold/hp_iotab.h create mode 100644 drivers/hyperhold/hp_space.c create mode 100644 drivers/hyperhold/hp_space.h create mode 100644 drivers/hyperhold/hyperhold.h create mode 100644 include/linux/hyperhold_inf.h diff --git a/drivers/Kconfig b/drivers/Kconfig index c9a22b041303..826b2b19d0b8 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -10,6 +10,8 @@ source "drivers/pcmcia/Kconfig" source "drivers/rapidio/Kconfig" +source "drivers/hyperhold/Kconfig" + source "drivers/base/Kconfig" source "drivers/bus/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 71da48160b09..ecc494918773 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -68,6 +68,9 @@ obj-$(CONFIG_CONNECTOR) += connector/ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +# Hyperhold driver +obj-$(CONFIG_HYPERHOLD) += hyperhold/ + obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index fe7a4b7d30cf..69719562f1b2 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -37,3 +37,5 @@ config ZRAM_MEMORY_TRACKING /sys/kernel/debug/zram/zramX/block_state. See Documentation/admin-guide/blockdev/zram.rst for more information. 
+ +source "drivers/block/zram/zram_group/Kconfig" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1..a8947f7faa98 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_GROUP) += zram_group/zram_group.o zram_group/zlist.o zram_group/group_writeback.o + obj-$(CONFIG_ZRAM) += zram.o + +ccflags-$(CONFIG_ZRAM_GROUP) += -I$(srctree)/drivers/block/zram/zram_group/ +ccflags-$(CONFIG_HYPERHOLD) += -I$(srctree)/drivers/hyperhold/ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7dce17fd59ba..8751ba2f63f2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -35,6 +35,10 @@ #include #include +#ifdef CONFIG_ZRAM_GROUP +#include +#endif + #include "zram_drv.h" static DEFINE_IDR(zram_index_idr); @@ -59,22 +63,6 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); - -static int zram_slot_trylock(struct zram *zram, u32 index) -{ - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); -} - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -85,35 +73,6 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) -{ - return zram->table[index].handle; -} - -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) -{ - zram->table[index].handle = handle; -} - -/* flag operations require table entry bit_spin_lock() being held */ -static bool zram_test_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - return zram->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags &= ~BIT(flag); -} - static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { @@ -125,19 +84,6 @@ static unsigned long zram_get_element(struct zram *zram, u32 index) return zram->table[index].element; } -static size_t zram_get_obj_size(struct zram *zram, u32 index) -{ - return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); -} - -static void zram_set_obj_size(struct zram *zram, - u32 index, size_t size) -{ - unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; - - zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; -} - static inline bool zram_allocated(struct zram *zram, u32 index) { return zram_get_obj_size(zram, index) || @@ -1135,6 +1081,65 @@ static DEVICE_ATTR_RO(bd_stat); #endif static DEVICE_ATTR_RO(debug_stat); +#ifdef CONFIG_ZRAM_GROUP +static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + if (zram->zgrp_ctrl == ZGRP_NONE) + strcpy(buf, "disable\n"); + else if (zram->zgrp_ctrl == ZGRP_TRACK) + strcpy(buf, "readonly\n"); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + 
else if (zram->zgrp_ctrl == ZGRP_WRITE) + strcpy(buf, "readwrite"); +#endif + up_read(&zram->init_lock); + + return strlen(buf); +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int ret; +#ifdef CONFIG_ZRAM_GROUP_DEBUG + u32 op, gid, index; + + ret = sscanf(buf, "%u %u %u", &op, &index, &gid); + if (ret == 3) { + pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid); + group_debug(zram, op, index, gid); + return len; + } +#endif + + ret = len; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Can't setup group ctrl for initialized device!\n"); + ret = -EBUSY; + goto out; + } + if (!strcmp(buf, "disable\n")) + zram->zgrp_ctrl = ZGRP_NONE; + else if (!strcmp(buf, "readonly\n")) + zram->zgrp_ctrl = ZGRP_TRACK; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (!strcmp(buf, "readwrite\n")) + zram->zgrp_ctrl = ZGRP_WRITE; +#endif + else + ret = -EINVAL; +out: + up_write(&zram->init_lock); + + return ret; +} +#endif + static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -1146,6 +1151,9 @@ static void zram_meta_free(struct zram *zram, u64 disksize) zs_destroy_pool(zram->mem_pool); vfree(zram->table); +#ifdef CONFIG_ZRAM_GROUP + zram_group_deinit(zram); +#endif } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -1165,6 +1173,10 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); +#ifdef CONFIG_ZRAM_GROUP + zram_group_init(zram, num_pages); +#endif + return true; } @@ -1177,6 +1189,10 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; +#ifdef CONFIG_ZRAM_GROUP + zram_group_untrack_obj(zram, index); +#endif + #ifdef CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; #endif @@ -1242,7 +1258,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_get_element(zram, index), bio, partial_io); } +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (!bio) { + ret = zram_group_fault_obj(zram, index); + if (ret) { + zram_slot_unlock(zram, index); + return ret; + } + } + if (zram_test_flag(zram, index, ZRAM_GWB)) { + zram_slot_unlock(zram, index); + return -EIO; + } +#endif handle = zram_get_handle(zram, index); if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { unsigned long value; @@ -1425,6 +1454,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); } +#ifdef CONFIG_ZRAM_GROUP + zram_group_track_obj(zram, index, page->mem_cgroup); +#endif zram_slot_unlock(zram, index); /* Update stats */ @@ -1850,6 +1882,9 @@ static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); #endif +#ifdef CONFIG_ZRAM_GROUP +static DEVICE_ATTR_RW(group); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1873,6 +1908,9 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_bd_stat.attr, #endif &dev_attr_debug_stat.attr, +#ifdef CONFIG_ZRAM_GROUP + &dev_attr_group.attr, +#endif NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index f2fd46daa760..ae2ec81c0f8a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,6 +21,10 @@ #include "zcomp.h" +#ifdef CONFIG_ZRAM_GROUP +#include "zram_group.h" +#endif + #define 
SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -39,7 +43,15 @@ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), * the higher bits is for zram_pageflags. */ +#ifdef CONFIG_ZRAM_GROUP +/* reserve 16 bits for group id */ +#define ZRAM_SIZE_SHIFT 24 +#define ZRAM_GRPID_SHIFT 16 +#define ZRAM_GRPID_MASK (((1UL << ZRAM_GRPID_SHIFT) - 1) << ZRAM_SIZE_SHIFT) +#define ZRAM_FLAG_SHIFT (ZRAM_SIZE_SHIFT + ZRAM_GRPID_SHIFT) +#else #define ZRAM_FLAG_SHIFT 24 +#endif /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { @@ -50,6 +62,10 @@ enum zram_pageflags { ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZRAM_GWB, /* obj is group writeback*/ + ZRAM_FAULT, /* obj is needed by a pagefault req */ +#endif __NR_ZRAM_PAGEFLAGS, }; @@ -91,6 +107,10 @@ struct zram_stats { struct zram { struct zram_table_entry *table; +#ifdef CONFIG_ZRAM_GROUP + struct zram_group *zgrp; + unsigned int zgrp_ctrl; +#endif struct zs_pool *mem_pool; struct zcomp *comp; struct gendisk *disk; @@ -126,4 +146,86 @@ struct zram { struct dentry *debugfs_dir; #endif }; + +static inline int zram_slot_trylock(struct zram *zram, u32 index) +{ + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static inline void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static inline bool zram_test_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + return zram->table[index].flags & BIT(flag); +} + +static inline void zram_set_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags |= BIT(flag); +} + +static inline void zram_clear_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags &= ~BIT(flag); +} +#ifdef CONFIG_ZRAM_GROUP +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_SIZE_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_SIZE_SHIFT; + + zram->table[index].flags = (flags << ZRAM_SIZE_SHIFT) | size; +} + +void zram_group_init(struct zram *zram, u32 nr_obj); +void zram_group_deinit(struct zram *zram); +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg); +void zram_group_untrack_obj(struct zram *zram, u32 index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +int zram_group_fault_obj(struct zram *zram, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid); +#endif + +#else +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned 
long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; + + zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; +} +#endif #endif diff --git a/drivers/block/zram/zram_group/Kconfig b/drivers/block/zram/zram_group/Kconfig new file mode 100644 index 000000000000..0eacf79fb259 --- /dev/null +++ b/drivers/block/zram/zram_group/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +config ZRAM_GROUP + bool "Manage Zram objs with mem_cgroup" + depends on ZRAM && MEMCG + help + Manage Zram objs with mem_cgroup. + +config ZRAM_GROUP_DEBUG + bool "Debug info for zram group" + depends on ZRAM_GROUP + help + Debug info for ZRAM_GROUP. + +config ZLIST_DEBUG + bool "Debug info for zram group list" + depends on ZRAM_GROUP + help + Debug info for zram group list. + +config ZRAM_GROUP_WRITEBACK + bool "Write back grouped zram objs to Hyperhold driver" + depends on ZRAM_GROUP && HYPERHOLD + help + Write back grouped zram objs to hyperhold. diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c new file mode 100644 index 000000000000..f1b2550c94ff --- /dev/null +++ b/drivers/block/zram/zram_group/group_writeback.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/group_writeback.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include + +#include "../zram_drv.h" +#include "zram_group.h" + +#ifdef CONFIG_HYPERHOLD +#include "hyperhold.h" +#endif + +#define CHECK(cond, ...) ((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +static u16 zram_get_memcg_id(struct zram *zram, u32 index) +{ + return (zram->table[index].flags & ZRAM_GRPID_MASK) >> ZRAM_SIZE_SHIFT; +} + +static void zram_set_memcg_id(struct zram *zram, u32 index, u16 gid) +{ + unsigned long old = zram->table[index].flags & (~ZRAM_GRPID_MASK); + + zram->table[index].flags = old | ((u64)gid << ZRAM_SIZE_SHIFT); +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static bool obj_can_wb(struct zram *zram, u32 index, u16 gid) +{ + /* overwrited obj, just skip */ + if (zram_get_memcg_id(zram, index) != gid) { + pr_info("obj %u is from group %u instead of group %u.\n", + index, zram_get_memcg_id(zram, index), gid); + return false; + } + if (!zgrp_obj_is_isolated(zram->zgrp, index)) { + pr_info("obj %u is not isolated.\n", index); + return false; + } + /* need not to writeback, put back the obj as HOTEST */ + if (zram_test_flag(zram, index, ZRAM_SAME)) { + pr_info("obj %u is filled with same element.\n", index); + goto insert; + } + if (zram_test_flag(zram, index, ZRAM_WB)) { + pr_info("obj %u is writeback.\n", index); + goto insert; + } + /* obj is needed by a pagefault req, do not writeback it. 
*/ + if (zram_test_flag(zram, index, ZRAM_FAULT)) { + pr_info("obj %u is needed by a pagefault request.\n", index); + goto insert; + } + /* should never happen */ + if (zram_test_flag(zram, index, ZRAM_GWB)) { + pr_info("obj %u is group writeback.\n", index); + BUG(); + return false; + } + + return true; +insert: + zgrp_obj_insert(zram->zgrp, index, gid); + + return false; +} + +static void copy_obj(struct hpio *hpio, u32 offset, char *obj, u32 size, bool to) +{ + u32 page_id, start; + char *buf = NULL; + + page_id = offset / PAGE_SIZE; + start = offset % PAGE_SIZE; + if (size + start <= PAGE_SIZE) { + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, size); + else + memcpy(obj, buf + start, size); + + return; + } + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, PAGE_SIZE - start); + else + memcpy(obj, buf + start, PAGE_SIZE - start); + buf = page_to_virt(hyperhold_io_page(hpio, page_id + 1)); + if (to) + memcpy(buf, obj + PAGE_SIZE - start, size + start - PAGE_SIZE); + else + memcpy(obj + PAGE_SIZE - start, buf, size + start - PAGE_SIZE); +} + +static u32 move_obj_to_hpio(struct zram *zram, u32 index, u16 gid, + struct hpio *hpio, u32 offset) +{ + u32 size = 0; + unsigned long handle; + char *src = NULL; + u32 ext_size; + u32 eid; + + eid = hyperhold_io_extent(hpio); + ext_size = hyperhold_extent_size(eid); + + zram_slot_lock(zram, index); + if (!obj_can_wb(zram, index, gid)) + goto unlock; + size = zram_get_obj_size(zram, index); + /* no space, put back the obj as COLDEST */ + if (size + offset > ext_size) { + pr_info("obj %u size is %u, but ext %u only %u space left.\n", + index, size, eid, ext_size - offset); + zgrp_obj_putback(zram->zgrp, index, gid); + size = 0; + goto unlock; + } + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + copy_obj(hpio, offset, src, size, true); + zs_unmap_object(zram->mem_pool, handle); + zs_free(zram->mem_pool, handle); + zram_set_handle(zram, index, hyperhold_address(eid, offset)); + zram_set_flag(zram, index, ZRAM_GWB); + wbgrp_obj_insert(zram->zgrp, index, eid); + wbgrp_obj_stats_inc(zram->zgrp, gid, eid, size); + zgrp_obj_stats_dec(zram->zgrp, gid, size); + pr_info("move obj %u of group %u to hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size); +unlock: + zram_slot_unlock(zram, index); + + return size; +} + +static void move_obj_from_hpio(struct zram *zram, int index, struct hpio *hpio) +{ + u32 size = 0; + unsigned long handle = 0; + u32 eid, offset; + u64 addr; + char *dst = NULL; + u16 gid; + + eid = hyperhold_io_extent(hpio); +retry: + zram_slot_lock(zram, index); + if (!zram_test_flag(zram, index, ZRAM_GWB)) + goto unlock; + addr = zram_get_handle(zram, index); + if (hyperhold_addr_extent(addr) != eid) + goto unlock; + size = zram_get_obj_size(zram, index); + if (handle) + goto move; + handle = zs_malloc(zram->mem_pool, size, GFP_NOWAIT); + if (handle) + goto move; + zram_slot_unlock(zram, index); + handle = zs_malloc(zram->mem_pool, size, GFP_NOIO | __GFP_NOFAIL); + if (handle) + goto retry; + BUG(); + + return; +move: + offset = hyperhold_addr_offset(addr); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + copy_obj(hpio, offset, dst, size, false); + zs_unmap_object(zram->mem_pool, handle); + zram_set_handle(zram, index, handle); + zram_clear_flag(zram, index, ZRAM_GWB); + gid = zram_get_memcg_id(zram, index); + zgrp_obj_insert(zram->zgrp, index, gid); + 
wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zgrp_obj_stats_inc(zram->zgrp, gid, size); + pr_info("move obj %u of group %u from hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size); +unlock: + zram_slot_unlock(zram, index); +} + + +#define NR_ISOLATE 32 +static bool move_extent_from_hpio(struct zram *zram, struct hpio *hpio) +{ + u32 idxs[NR_ISOLATE]; + u32 eid; + u32 nr; + int i; + bool last = false; + + eid = hyperhold_io_extent(hpio); +repeat: + nr = wbgrp_isolate_objs(zram->zgrp, eid, idxs, NR_ISOLATE, &last); + for (i = 0; i < nr; i++) + move_obj_from_hpio(zram, idxs[i], hpio); + if (last) + return true; + if (nr) + goto repeat; + + return false; +} + +struct hpio_priv { + struct zram *zram; + u16 gid; +}; + +static void write_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (hyperhold_io_success(hpio)) + goto out; + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u32 collect_objs(struct zram *zram, u16 gid, struct hpio *hpio, u32 ext_size) +{ + u32 offset = 0; + u32 last_offset; + u32 nr; + u32 idxs[NR_ISOLATE]; + int i; + +more: + last_offset = offset; + nr = zgrp_isolate_objs(zram->zgrp, gid, idxs, NR_ISOLATE, NULL); + for (i = 0; i < nr; i++) + offset += move_obj_to_hpio(zram, idxs[i], gid, hpio, offset); + pr_info("%u data attached, offset = %u.\n", offset - last_offset, offset); + if (offset < ext_size && offset != last_offset) + goto more; + + return offset; +} + +static u64 write_one_extent(struct zram *zram, u16 gid) +{ + int eid; + struct hpio *hpio = NULL; + struct hpio_priv *priv = NULL; + u32 size = 0; + int ret; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + return 0; + priv->gid = gid; + priv->zram = zram; + eid = hyperhold_alloc_extent(); + if (eid < 0) + goto err; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_WRITE); + if (!hpio) + goto free_extent; + + size = collect_objs(zram, gid, hpio, hyperhold_extent_size(eid)); + if (size == 0) { + pr_err("group %u has no data in zram.\n", gid); + goto put_hpio; + } + zgrp_ext_insert(zram->zgrp, eid, gid); + + ret = hyperhold_write_async(hpio, write_endio, priv); + if (ret) + goto move_back; + + return size; +move_back: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + eid = -EINVAL; +put_hpio: + hyperhold_io_put(hpio); +free_extent: + if (eid >= 0) + hyperhold_free_extent(eid); +err: + kfree(priv); + + return 0; +} + +static void read_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (!hyperhold_io_success(hpio)) { + BUG(); + goto out; + } + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u64 read_one_extent(struct zram *zram, u32 eid, u16 gid) +{ + struct hpio *hpio = NULL; + u32 ext_size = 0; + int ret; + struct hpio_priv *priv = NULL; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + goto err; + priv->gid = gid; + priv->zram = zram; + hpio = hyperhold_io_get(eid, GFP_NOIO, 
REQ_OP_READ); + if (!hpio) + goto err; + ext_size = hyperhold_extent_size(eid); + ret = hyperhold_read_async(hpio, read_endio, priv); + if (ret) + goto err; + + return ext_size; +err: + hyperhold_io_put(hpio); + kfree(priv); + + return 0; +} + +static void sync_read_endio(struct hpio *hpio) +{ + hyperhold_io_complete(hpio); +} + +static int read_one_obj_sync(struct zram *zram, u32 index) +{ + struct hpio *hpio = NULL; + int ret; + u32 eid; + u16 gid; + u32 size; + + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + + pr_info("read obj %u.\n", index); + + gid = zram_get_memcg_id(zram, index); + eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + size = zram_get_obj_size(zram, index); + wbgrp_fault_stats_inc(zram->zgrp, gid, eid, size); +check: + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto read; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +read: + zram_set_flag(zram, index, ZRAM_FAULT); + zram_slot_unlock(zram, index); + + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) { + ret = -ENOMEM; + goto out; + } + ret = hyperhold_read_async(hpio, sync_read_endio, NULL); + /* io submit error */ + if (ret && ret != -EAGAIN) + goto out; + + hyperhold_io_wait(hpio); + /* get a write io, data is ready, copy the pages even write failed */ + if (op_is_write(hyperhold_io_operate(hpio))) + goto move; + /* read io failed, return -EIO */ + if (!hyperhold_io_success(hpio)) { + ret = -EIO; + goto out; + } + /* success, copy the data and free extent */ +move: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_put(hpio); + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_FAULT); + wake_up(&zram->zgrp->wbgrp.fault_wq); + + return ret; +} + +u64 read_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u32 eid; + u64 read_size = 0; + u32 nr; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_info("read %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > read_size) { + nr = zgrp_isolate_exts(zram->zgrp, gid, &eid, 1, NULL); + if (!nr) + break; + read_size += read_one_extent(zram, eid, gid); + } + + return read_size; +} + +u64 write_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u64 write_size = 0; + u64 size = 0; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_info("write %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > write_size) { + size = write_one_extent(zram, gid); + if (!size) + break; + write_size += size; + } + + return write_size; +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +#include +#define ZGRP_TEST_MAX_GRP 101 +#endif + +int zram_group_fault_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return 0; + + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + zgrp_fault_stats_inc(zram->zgrp, gid, size); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + return read_one_obj_sync(zram, index); +#else + return 0; +#endif +} + +void zram_group_track_obj(struct zram *zram, u32 index, 
struct mem_cgroup *memcg) +{ + u16 gid; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + if (!CHECK(memcg || !memcg->id.id, "obj %u has no memcg!\n", index)) + return; + gid = zram_get_memcg_id(zram, index); + if (!CHECK(!gid, "obj %u has gid %u.\n", index, gid)) + BUG(); + + gid = memcg->id.id; + zram_set_memcg_id(zram, index, gid); + zgrp_obj_insert(zram->zgrp, index, gid); + zgrp_obj_stats_inc(zram->zgrp, gid, zram_get_obj_size(zram, index)); +} + +void zram_group_untrack_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +check: + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto clear; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +clear: +#endif + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + if (!gid) + return; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (zram_test_flag(zram, index, ZRAM_GWB)) { + u32 eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + + if (wbgrp_obj_delete(zram->zgrp, index, eid)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + zram_clear_flag(zram, index, ZRAM_GWB); + zram_set_memcg_id(zram, index, 0); + wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zram_set_handle(zram, index, 0); + return; + } +#endif + zgrp_obj_delete(zram->zgrp, index, gid); + zram_set_memcg_id(zram, index, 0); + zgrp_obj_stats_dec(zram->zgrp, gid, size); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid) +{ + if (op == 0) + zram_group_dump(zram->zgrp, gid, index); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (op == 22) + read_group_objs(zram, gid, index); + if (op == 23) + write_group_objs(zram, gid, index); + if (op == 20) { + if (index) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); + else + zram_group_remove_writeback(zram->zgrp); + } +#endif +} +#endif + +static u64 group_obj_stats(struct zram *zram, u16 gid, int type) +{ + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 0, zram->zgrp->nr_grp - 1)) + return 0; + + if (type == CACHE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].zram_size); + else if (type == CACHE_PAGE) + return atomic_read(&zram->zgrp->stats[gid].zram_pages); + else if (type == CACHE_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].zram_fault); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (type == SWAP_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].wb_size); + else if (type == SWAP_PAGE) + return atomic_read(&zram->zgrp->stats[gid].wb_pages); + else if (type == READ_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].read_size); + else if (type == WRITE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].write_size); + else if (type == SWAP_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].wb_fault); + BUG(); +#endif + + return 0; +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return read_group_objs((struct zram *)priv, gid, req_size); +} + +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + 
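+ /* req_size == 0 asks the backend to write back everything this group still holds in zram */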
return write_group_objs((struct zram *)priv, gid, req_size); +} +#else +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +#endif + + +static u64 zram_group_data_size(u16 gid, int type, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return group_obj_stats((struct zram *)priv, gid, type); +} + +struct group_swap_ops zram_group_ops = { + .group_read = zram_group_read, + .group_write = zram_group_write, + .group_data_size = zram_group_data_size, +}; + +static int register_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return -EINVAL; + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return -EINVAL; + + zram->zgrp->gsdev = register_group_swap(&zram_group_ops, zram); + if (!zram->zgrp->gsdev) { + pr_err("register zram group failed!\n"); + return -ENOMEM; + } + + return 0; +} + +static void unregister_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return; + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + + unregister_group_swap(zram->zgrp->gsdev); + zram->zgrp->gsdev = NULL; +} + +void zram_group_init(struct zram *zram, u32 nr_obj) +{ + unsigned int ctrl = zram->zgrp_ctrl; + + if (ctrl == ZGRP_NONE) + return; + zram->zgrp = zram_group_meta_alloc(nr_obj, ZGRP_MAX_GRP - 1); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (ctrl == ZGRP_WRITE) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); +#endif + register_zram_group(zram); +} + +void zram_group_deinit(struct zram *zram) +{ + unregister_zram_group(zram); + zram_group_meta_free(zram->zgrp); + zram->zgrp = NULL; +} diff --git a/drivers/block/zram/zram_group/zlist.c b/drivers/block/zram/zram_group/zlist.c new file mode 100644 index 000000000000..d1fe60875949 --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zlist.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[ZLIST]" fmt + +#include +#include +#include + +#include "zlist.h" + +#define assert(expr) \ + do { \ + if (expr) \ + break; \ + pr_err("assertion [%s] failed: in func<%s> at %s:%d\n", \ + #expr, __func__, __FILE__, __LINE__); \ + BUG(); \ + } while (0) + +static inline void zlist_node_lock(struct zlist_node *node) +{ + bit_spin_lock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +static inline void zlist_node_unlock(struct zlist_node *node) +{ + bit_spin_unlock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +#ifdef CONFIG_ZLIST_DEBUG +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} + +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} +#else +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +#endif + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp) +{ + struct zlist_table *tab = kmalloc(sizeof(struct zlist_table), gfp); + + if (!tab) + return NULL; + tab->idx2node = i2n; + tab->private = private; + + return tab; +} + +void zlist_lock(u32 idx, struct zlist_table *tab) +{ + zlist_node_lock(idx2node(idx, tab)); +} + +void zlist_unlock(u32 idx, struct zlist_table *tab) +{ + zlist_node_unlock(idx2node(idx, tab)); +} + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 nid = head->next; + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_add_check(tab, head, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = hid; + node->next = nid; + if (idx != hid) + zlist_node_unlock(node); + head->next = idx; + if (nid != hid) + zlist_node_lock(next); + next->prev = idx; + if (nid != hid) + zlist_node_unlock(next); + 
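+ /* node is now the first entry after the list head */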
zlist_after_add_check(tab, head, node, next); +} + +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 tid = head->prev; + struct zlist_node *tail = idx2node(tid, tab); + + zlist_before_add_check(tab, tail, node, head); + if (idx != hid) + zlist_node_lock(node); + node->prev = tid; + node->next = hid; + if (idx != hid) + zlist_node_unlock(node); + head->prev = idx; + if (tid != hid) + zlist_node_lock(tail); + tail->next = idx; + if (tid != hid) + zlist_node_unlock(tail); + zlist_after_add_check(tab, tail, node, head); +} + +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + u32 pid = node->prev; + u32 nid = node->next; + struct zlist_node *prev = idx2node(pid, tab); + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_del_check(tab, prev, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = idx; + node->next = idx; + if (idx != hid) + zlist_node_unlock(node); + if (pid != hid) + zlist_node_lock(prev); + prev->next = nid; + if (pid != hid) + zlist_node_unlock(prev); + if (nid != hid) + zlist_node_lock(next); + next->prev = pid; + if (nid != hid) + zlist_node_unlock(next); + zlist_after_del_check(tab, prev, node, next); + + return zlist_is_isolated_nolock(hid, tab); +} + +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + return (node->prev == idx) && (node->next == idx); +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_set_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +bool zlist_clr_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_clear_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +void zlist_node_init(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + memset(node, 0, sizeof(struct zlist_node)); + node->prev = idx; + node->next = idx; +} diff --git a/drivers/block/zram/zram_group/zlist.h b/drivers/block/zram/zram_group/zlist.h new file mode 100644 index 000000000000..430b079bcd49 --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zlist.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZLIST_H_ +#define _ZLIST_H_ + +#define ZLIST_IDX_SHIFT 30 +#define ZLIST_LOCK_BIT ZLIST_IDX_SHIFT +#define ZLIST_PRIV_BIT ((ZLIST_IDX_SHIFT << 1) + 1) + +#define ZLIST_IDX_MAX (1 << ZLIST_IDX_SHIFT) + +struct zlist_node { + u32 prev : ZLIST_IDX_SHIFT; + u32 lock : 1; + u32 next : ZLIST_IDX_SHIFT; + u32 priv : 1; +}; + +struct zlist_table { + struct zlist_node *(*idx2node)(u32 idx, void *priv); + void *private; +}; + +static inline struct zlist_node *idx2node(u32 idx, struct zlist_table *tab) +{ + return tab->idx2node(idx, tab->private); +} + +static inline u32 next_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->next; +} + +static inline u32 prev_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->prev; +} + +static inline void zlist_table_free(struct zlist_table *tab) +{ + kfree(tab); +} + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp); + +void zlist_lock(u32 idx, struct zlist_table *tab); +void zlist_unlock(u32 idx, struct zlist_table *tab); + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab); +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab); + +static inline void zlist_add(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline void zlist_add_tail(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_tail_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline bool zlist_del(u32 hid, u32 idx, struct zlist_table *tab) +{ + bool ret = false; + + zlist_lock(hid, tab); + ret = zlist_del_nolock(hid, idx, tab); + zlist_unlock(hid, tab); + + return ret; +} + +bool zlist_get_priv(u32 idx, struct zlist_table *tab); +bool zlist_clr_priv(u32 idx, struct zlist_table *tab); + +void zlist_node_init(u32 idx, struct zlist_table *tab); + +#define zlist_for_each_entry(idx, hid, tab) \ + for ((idx) = next_idx(hid, tab); (idx) != (hid); \ + (idx) = next_idx(idx, tab)) +#define zlist_for_each_entry_reverse(idx, hid, tab) \ + for ((idx) = prev_idx(hid, tab); (idx) != (hid); \ + (idx) = prev_idx(idx, tab)) +#endif diff --git a/drivers/block/zram/zram_group/zram_group.c b/drivers/block/zram/zram_group/zram_group.c new file mode 100644 index 000000000000..ea0cdcfadc7b --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zram_group.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[ZRAM_GROUP]" fmt + +#include +#include +#include "zram_group.h" + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +/* + * idx2node for obj table + */ +static struct zlist_node *get_obj(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->nr_obj) + return &zgrp->obj[index]; + + index -= zgrp->nr_obj; + BUG_ON(!index); + if (index < zgrp->nr_grp) + return &zgrp->grp_obj_head[index]; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + index -= zgrp->nr_grp; + BUG_ON(index >= zgrp->wbgrp.nr_ext); + return &zgrp->wbgrp.ext_obj_head[index]; +#endif + BUG(); +} + +void zram_group_meta_free(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zram_group_remove_writeback(zgrp); +#endif + vfree(zgrp->grp_obj_head); + vfree(zgrp->obj); + zlist_table_free(zgrp->obj_tab); + vfree(zgrp->stats); + kfree(zgrp); + + pr_info("zram group freed.\n"); +} + +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp) +{ + struct zram_group *zgrp = NULL; + u32 i; + + if (!CHECK_BOUND(nr_grp, 1, ZGRP_MAX_GRP - 1)) + return NULL; + + /* reserve gid 0 */ + nr_grp++; + if (!CHECK_BOUND(nr_obj, 1, ZGRP_MAX_OBJ)) + return NULL; + zgrp = kzalloc(sizeof(struct zram_group), GFP_KERNEL); + if (!zgrp) + goto err; + zgrp->nr_obj = nr_obj; + zgrp->nr_grp = nr_grp; + zgrp->grp_obj_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!zgrp->grp_obj_head) + goto err; + zgrp->obj = vmalloc(sizeof(struct zlist_node) * zgrp->nr_obj); + if (!zgrp->obj) + goto err; + zgrp->obj_tab = zlist_table_alloc(get_obj, zgrp, GFP_KERNEL); + if (!zgrp->obj_tab) + goto err; + zgrp->stats = vzalloc(sizeof(struct zram_group_stats) * zgrp->nr_grp); + if (!zgrp->stats) + goto err; + zgrp->gsdev = NULL; + + for (i = 0; i < zgrp->nr_obj; i++) + zlist_node_init(i, zgrp->obj_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + zgrp->nr_obj, zgrp->obj_tab); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zgrp->wbgrp.enable = false; + mutex_init(&zgrp->wbgrp.init_lock); +#endif + pr_info("zram_group alloc succ.\n"); + return zgrp; +err: + pr_err("zram_group alloc failed!\n"); + zram_group_meta_free(zgrp); + + return NULL; +} + +/* + * insert obj at @index into group @gid as the HOTTEST obj + */ +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add(hid, index, zgrp->obj_tab); + pr_info("insert obj %u to group %u\n", index, gid); +} + +/* + * remove obj at @index from group @gid + */ +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + pr_info("delete obj %u from group %u\n", index, gid); + hid = gid + zgrp->nr_obj; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the last @nr objs of @gid, store their indexes in array @idxs + * and @return the obj cnt actually isolated. isolate all objs if nr is 0. 
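+ * objs are taken from the tail (coldest) end while the group list lock is held.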
+ */ +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = gid + zgrp->nr_obj; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("isolated %u objs from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * check if the obj at @index is isolate from zram groups + */ +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index) +{ + bool ret = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + + zlist_lock(index, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(index, zgrp->obj_tab); + zlist_unlock(index, zgrp->obj_tab); + + return ret; +} +/* + * insert obj at @index into group @gid as the COLDEST obj + */ +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_info("putback obj %u to group %u\n", index, gid); +} + +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_inc(&zgrp->stats[gid].zram_pages); + atomic64_add(size, &zgrp->stats[gid].zram_size); + atomic_inc(&zgrp->stats[0].zram_pages); + atomic64_add(size, &zgrp->stats[0].zram_size); +} + +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_dec(&zgrp->stats[gid].zram_pages); + atomic64_sub(size, &zgrp->stats[gid].zram_size); + atomic_dec(&zgrp->stats[0].zram_pages); + atomic64_sub(size, &zgrp->stats[0].zram_size); +} + +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].zram_fault); + atomic64_inc(&zgrp->stats[0].zram_fault); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index) +{ + u32 hid, idx; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + hid = gid + zgrp->nr_obj; + if (gid == 0) { + struct zlist_node *node = NULL; + + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + node = idx2node(index, zgrp->obj_tab); + pr_err("dump index %u = %u %u %u %u\n", index, + node->prev, node->next, + node->lock, node->priv); + } else { + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + pr_err("dump index of group %u\n", gid); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) + pr_err("%u\n", idx); + } +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +/* + * idx2node for ext table + */ +static struct zlist_node *get_ext(u32 index, void *private) +{ + 
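+ /* index layout: [0, nr_ext) are extent nodes, nr_ext + gid (gid >= 1) are per-group extent list heads */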
struct zram_group *zgrp = private; + + if (index < zgrp->wbgrp.nr_ext) + return &zgrp->wbgrp.ext[index]; + + index -= zgrp->wbgrp.nr_ext; + BUG_ON(!index); + return &zgrp->wbgrp.grp_ext_head[index]; +} + +/* + * disable writeback for zram group @zgrp + */ +void zram_group_remove_writeback(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + zgrp->wbgrp.enable = false; + vfree(zgrp->wbgrp.grp_ext_head); + vfree(zgrp->wbgrp.ext); + zlist_table_free(zgrp->wbgrp.ext_tab); + vfree(zgrp->wbgrp.ext_obj_head); + pr_info("zram group writeback is removed.\n"); +} + +/* + * init & enable writeback on exist zram group @zgrp with a backing device of + * @nr_ext extents. + */ +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext) +{ + struct writeback_group *wbgrp = NULL; + u32 i; + int ret = 0; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return -EINVAL; + + mutex_lock(&zgrp->wbgrp.init_lock); + if (!CHECK(!zgrp->wbgrp.enable, "zram group writeback is already enable!\n")) + goto out; + if (!CHECK_BOUND(nr_ext, 1, ZGRP_MAX_EXT)) { + ret = -EINVAL; + goto out; + } + wbgrp = &zgrp->wbgrp; + wbgrp->nr_ext = nr_ext; + wbgrp->grp_ext_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!wbgrp->grp_ext_head) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext_obj_head = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext_obj_head) { + ret = -ENOMEM; + goto out; + } + + wbgrp->ext_tab = zlist_table_alloc(get_ext, zgrp, GFP_KERNEL); + if (!wbgrp->ext_tab) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i, wbgrp->ext_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + wbgrp->nr_ext, wbgrp->ext_tab); + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i + zgrp->nr_obj + zgrp->nr_grp, zgrp->obj_tab); + + init_waitqueue_head(&wbgrp->fault_wq); + wbgrp->enable = true; + pr_info("zram group writeback is enabled.\n"); +out: + mutex_unlock(&zgrp->wbgrp.init_lock); + + if (ret) { + zram_group_remove_writeback(zgrp); + pr_err("zram group writeback enable failed!\n"); + } + + return ret; +} + +/* + * attach extent at @eid to group @gid as the HOTTEST extent + */ +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_add(hid, eid, zgrp->wbgrp.ext_tab); + pr_info("insert extent %u to group %u\n", eid, gid); +} + +/* + * remove extent at @eid from group @gid + */ +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + bool isolated = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + + zlist_lock(eid, zgrp->wbgrp.ext_tab); + isolated = zlist_is_isolated_nolock(eid, zgrp->wbgrp.ext_tab); + zlist_unlock(eid, zgrp->wbgrp.ext_tab); + if (isolated) { + pr_info("extent %u is already 
isolated, skip delete.\n", eid); + return false; + } + + pr_info("delete extent %u from group %u\n", eid, gid); + hid = gid + zgrp->wbgrp.nr_ext; + return zlist_del(hid, eid, zgrp->wbgrp.ext_tab); +} + +/* + * try to isolate the first @nr exts of @gid, store their eids in array @eids + * and @return the cnt actually isolated. isolate all exts if nr is 0. + */ +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(eids, "return array eids is null!\n")) + return 0; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_lock(hid, zgrp->wbgrp.ext_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->wbgrp.ext_tab) { + eids[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, eids[i], zgrp->wbgrp.ext_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->wbgrp.ext_tab); + zlist_unlock(hid, zgrp->wbgrp.ext_tab); + + pr_info("isolated %u exts from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * insert obj at @index into extent @eid + */ +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_info("insert obj %u to extent %u\n", index, eid); +} + +/* + * remove obj at @index from extent @eid + */ +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + pr_info("delete obj %u from extent %u\n", index, eid); + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the first @nr writeback objs of @eid, store their indexes in + * array @idxs and @return the obj cnt actually isolated. isolate all objs if + * @nr is 0. 
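+ * unlike zgrp_isolate_objs(), the walk starts from the list head, i.e. oldest-inserted objs first.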
+ */ +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("isolated %u objs from extent %u.\n", cnt, eid); + + return cnt; +} + +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_inc(&zgrp->stats[gid].wb_pages); + atomic64_add(size, &zgrp->stats[gid].wb_size); + atomic_inc(&zgrp->stats[0].wb_pages); + atomic64_add(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_dec(&zgrp->stats[gid].wb_pages); + atomic64_sub(size, &zgrp->stats[gid].wb_size); + atomic_dec(&zgrp->stats[0].wb_pages); + atomic64_sub(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].wb_fault); + atomic64_inc(&zgrp->stats[0].wb_fault); +} +#endif diff --git a/drivers/block/zram/zram_group/zram_group.h b/drivers/block/zram/zram_group/zram_group.h new file mode 100644 index 000000000000..7ac16ba87703 --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zram_group.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZRAM_GROUP_H_ +#define _ZRAM_GROUP_H_ + +#include +#include + +#include "zlist.h" + +#define ZGRP_MAX_GRP USHRT_MAX +#define ZGRP_MAX_OBJ (1 << 30) + +enum { + ZGRP_NONE = 0, + ZGRP_TRACK, +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZGRP_WRITE, +#endif +}; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +#define ZGRP_MAX_EXT (ZLIST_IDX_MAX - ZGRP_MAX_GRP - ZGRP_MAX_OBJ) +struct writeback_group { + bool enable; + u32 nr_ext; + struct zlist_node *grp_ext_head; + struct zlist_node *ext; + struct zlist_table *ext_tab; + struct zlist_node *ext_obj_head; + struct mutex init_lock; + wait_queue_head_t fault_wq; +}; +#endif + +struct zram_group_stats { + atomic64_t zram_size; + atomic_t zram_pages; + atomic64_t zram_fault; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + atomic64_t wb_size; + atomic_t wb_pages; + atomic64_t wb_fault; + atomic_t wb_exts; + atomic64_t write_size; + atomic64_t read_size; +#endif +}; + +struct zram_group { + u32 nr_obj; + u32 nr_grp; + struct zlist_node *grp_obj_head; + struct zlist_node *obj; + struct zlist_table *obj_tab; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + struct writeback_group wbgrp; +#endif + struct group_swap_device *gsdev; + struct zram_group_stats *stats; +}; + +void zram_group_meta_free(struct zram_group *zgrp); +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp); +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid); +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid); +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last); +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index); +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid); +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +void zram_group_remove_writeback(struct zram_group *zgrp); +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext); +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid); +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid); +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last); +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid); +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid); +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last); +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +#endif +#endif diff --git a/drivers/hyperhold/Kconfig b/drivers/hyperhold/Kconfig new file mode 100644 index 000000000000..8e5e7a1ee957 --- /dev/null +++ b/drivers/hyperhold/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +config HYPERHOLD + bool "Hyperhold driver" + default n + help + Hyperhold driver. + +config HYPERHOLD_DEBUG + bool "Debug info for Hyperhold driver" + depends on HYPERHOLD + help + Debug info for Hyperhold driver. 
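As an illustrative sketch (not part of the patch), a minimal client of the hyperhold extent I/O API implemented below might look as follows, with declarations taken from hyperhold.h; fill_extent_data() is a hypothetical stand-in for collect_objs(), and the error path is simplified compared to write_one_extent():

static void example_write_done(struct hpio *hpio)
{
	/* runs from the hyperhold write workqueue once the extent bio completes */
	hyperhold_io_complete(hpio);	/* wake threads blocked in hyperhold_io_wait() */
	hyperhold_io_put(hpio);		/* drop the reference taken by hyperhold_io_get() */
}

static int example_write_extent(void (*fill_extent_data)(struct hpio *hpio))
{
	struct hpio *hpio;
	int eid, ret;

	eid = hyperhold_alloc_extent();
	if (eid < 0)
		return eid;
	hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_WRITE);
	if (!hpio) {
		hyperhold_free_extent(eid);
		return -ENOMEM;
	}
	fill_extent_data(hpio);		/* copy payload into hyperhold_io_page(hpio, i) */
	ret = hyperhold_write_async(hpio, example_write_done, NULL);
	if (ret)
		hyperhold_io_put(hpio);	/* write_one_extent() additionally moves the data back */
	return ret;
}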
diff --git a/drivers/hyperhold/Makefile b/drivers/hyperhold/Makefile new file mode 100644 index 000000000000..b45a1a678466 --- /dev/null +++ b/drivers/hyperhold/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +hyperhold-y := hp_core.o hp_device.o hp_space.o hp_iotab.o + +obj-$(CONFIG_HYPERHOLD) += hyperhold.o diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c new file mode 100644 index 000000000000..86a9e4704f2e --- /dev/null +++ b/drivers/hyperhold/hp_core.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_core.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + + #define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include +#include + +#include "hyperhold.h" +#include "hp_device.h" +#include "hp_space.h" +#include "hp_iotab.h" + +#ifdef CONFIG_HYPERHOLD_DEBUG +#define HP_DFLT_DEVICE "/dev/loop6" +#else +#define HP_DFLT_DEVICE "/dev/by-name/hyperhold" +#endif +#define HP_DFLT_EXT_SIZE (1 << 15) +#define HP_DEV_NAME_LEN 256 +#define HP_STATE_LEN 10 + +#define CHECK(cond, ...) ((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", #var, (var), (min), (max)) +#define CHECK_INITED CHECK(hyperhold.inited, "hyperhold is not enable!\n") +#define CHECK_ENABLE (CHECK_INITED && CHECK(hyperhold.enable, "hyperhold is readonly!\n")) + +struct hyperhold { + bool enable; + bool inited; + + char device_name[HP_DEV_NAME_LEN]; + u32 extent_size; + + struct hp_device dev; + struct hp_space spc; + + struct workqueue_struct *read_wq; + struct workqueue_struct *write_wq; + + struct mutex init_lock; +}; + +struct hyperhold hyperhold; + +atomic64_t mem_used = ATOMIC64_INIT(0); +#ifdef CONFIG_HYPERHOLD_DEBUG +/* + * return the memory overhead of hyperhold module + */ +u64 hyperhold_memory_used(void) +{ + return atomic64_read(&mem_used) + hpio_memory() + space_memory(); +} +#endif + +void hyperhold_disable(bool force) +{ + if (!CHECK_INITED) + return; + if (!force && !CHECK_ENABLE) + return; + + mutex_lock(&hyperhold.init_lock); + hyperhold.enable = false; + if (!wait_for_space_empty(&hyperhold.spc, force)) + goto out; + hyperhold.inited = false; + wait_for_iotab_empty(); + if (hyperhold.read_wq) + destroy_workqueue(hyperhold.read_wq); + if (hyperhold.write_wq) + destroy_workqueue(hyperhold.write_wq); + deinit_space(&hyperhold.spc); + unbind_bdev(&hyperhold.dev); +out: + if (hyperhold.inited) + pr_info("hyperhold is disabled, read only.\n"); + else + pr_info("hyperhold is totally disabled!\n"); + mutex_unlock(&hyperhold.init_lock); +} +EXPORT_SYMBOL(hyperhold_disable); + +void hyperhold_enable(void) +{ + bool enable = true; + + if (hyperhold.inited) + goto out; + + mutex_lock(&hyperhold.init_lock); + if (hyperhold.inited) + goto unlock; + if (!bind_bdev(&hyperhold.dev, hyperhold.device_name)) + goto err; + if (!init_space(&hyperhold.spc, hyperhold.dev.dev_size, hyperhold.extent_size)) + goto err; + hyperhold.read_wq = alloc_workqueue("hyperhold_read", WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!hyperhold.read_wq) + goto err; + hyperhold.write_wq = alloc_workqueue("hyperhold_write", 0, 0); + if (!hyperhold.write_wq) + goto err; + hyperhold.inited = true; + goto unlock; +err: + if (hyperhold.read_wq) + destroy_workqueue(hyperhold.read_wq); + if (hyperhold.write_wq) + destroy_workqueue(hyperhold.write_wq); + deinit_space(&hyperhold.spc); + unbind_bdev(&hyperhold.dev); + enable = false; +unlock: + mutex_unlock(&hyperhold.init_lock); +out: 
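+ /* enable is still true here unless one of the init steps above failed */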
+ if (enable) { + hyperhold.enable = true; + pr_info("hyperhold is enabled.\n"); + } else { + hyperhold.enable = false; + pr_err("hyperhold enable failed!\n"); + } +} +EXPORT_SYMBOL(hyperhold_enable); + +static int hyperhold_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) { + if (!strcmp(buffer, "enable\n")) + hyperhold_enable(); + else if (!strcmp(buffer, "disable\n")) + hyperhold_disable(false); + else if (!strcmp(buffer, "force_disable\n")) + hyperhold_disable(true); + } else { + if (*lenp < HP_STATE_LEN || *ppos) { + *lenp = 0; + return 0; + } + if (hyperhold.enable) + strcpy(buffer, "enable\n"); + else if (hyperhold.inited) + strcpy(buffer, "readonly\n"); + else + strcpy(buffer, "disable\n"); + *lenp = strlen(buffer); + *ppos += *lenp; +#ifdef CONFIG_HYPERHOLD_DEBUG + pr_info("hyperhold memory overhead = %llu.\n", hyperhold_memory_used()); +#endif + } + return 0; +} + +static struct ctl_table_header *hp_sysctl_header; +static struct ctl_table hp_table[] = { + { + .procname = "enable", + .mode = 0644, + .proc_handler = hyperhold_sysctl_handler, + }, + { + .procname = "device", + .data = &hyperhold.device_name, + .maxlen = sizeof(hyperhold.device_name), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "extent_size", + .data = &hyperhold.extent_size, + .maxlen = sizeof(hyperhold.extent_size), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + {} +}; +static struct ctl_table hp_kernel_table[] = { + { + .procname = "hyperhold", + .mode = 0555, + .child = hp_table, + }, + {} +}; +static struct ctl_table hp_sys_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = hp_kernel_table, + }, + {} +}; + +bool is_hyperhold_enable(void) +{ + return CHECK_ENABLE; +} + +static int __init hyperhold_init(void) +{ + strcpy(hyperhold.device_name, HP_DFLT_DEVICE); + hyperhold.extent_size = HP_DFLT_EXT_SIZE; + mutex_init(&hyperhold.init_lock); + hp_sysctl_header = register_sysctl_table(hp_sys_table); + if (!hp_sysctl_header) { + pr_err("register hyperhold sysctl table failed!\n"); + return -EINVAL; + } + + return 0; +} + +static void __exit hyperhold_exit(void) +{ + unregister_sysctl_table(hp_sysctl_header); + hyperhold_disable(true); +} + +static struct hp_space *space_of(u32 eid) +{ + return &hyperhold.spc; +} + +/* replace this func for multi devices */ +static struct hp_device *device_of(u32 eid) +{ + return &hyperhold.dev; +} + +/* replace this func for multi devices */ +u32 hyperhold_nr_extent(void) +{ + if (!CHECK_INITED) + return 0; + + return hyperhold.spc.nr_ext; +} +EXPORT_SYMBOL(hyperhold_nr_extent); + +u32 hyperhold_extent_size(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return 0; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return 0; + + return spc->ext_size; +} +EXPORT_SYMBOL(hyperhold_extent_size); + +/* replace this func for multi devices */ +long hyperhold_address(u32 eid, u32 offset) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return -EINVAL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return -EINVAL; + if (!CHECK_BOUND(offset, 0, spc->ext_size - 1)) + return -EINVAL; + + return (u64)eid * spc->ext_size + offset; +} +EXPORT_SYMBOL(hyperhold_address); + +/* replace this func for multi devices */ +int hyperhold_addr_extent(u64 addr) +{ + struct hp_space *spc = NULL; + u32 eid; + + if (!CHECK_INITED) + return -EINVAL; + eid = addr / hyperhold.spc.ext_size; + spc = space_of(eid); + if (!CHECK(spc, 
"invalid eid %u!\n", eid)) + return -EINVAL; + + return eid; +} +EXPORT_SYMBOL(hyperhold_addr_extent); + +/* replace this func for multi devices */ +int hyperhold_addr_offset(u64 addr) +{ + if (!CHECK_INITED) + return -EINVAL; + + return addr % hyperhold.spc.ext_size; +} +EXPORT_SYMBOL(hyperhold_addr_offset); + +/* replace this func for multi devices */ +int hyperhold_alloc_extent(void) +{ + if (!CHECK_ENABLE) + return -EINVAL; + + return alloc_eid(&hyperhold.spc); +} +EXPORT_SYMBOL(hyperhold_alloc_extent); + +void hyperhold_free_extent(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return; + + free_eid(spc, eid); +} +EXPORT_SYMBOL(hyperhold_free_extent); + +void hyperhold_should_free_extent(u32 eid) +{ + struct hpio *hpio = NULL; + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return; + + hpio = hpio_get(eid); + if (!hpio) { + free_eid(spc, eid); + return; + } + hpio->free_extent = hyperhold_free_extent; + hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_should_free_extent); + +/* + * alloc hpio struct for r/w extent at @eid, will fill hpio with new alloced + * pages if @new_page. @return NULL on fail. + */ +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + struct hp_space *spc; + u32 nr_page; + + if (!CHECK_ENABLE) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + hpio = hpio_alloc(nr_page, gfp, op, new_page); + if (!hpio) + goto err; + hpio->eid = eid; + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} +EXPORT_SYMBOL(hyperhold_io_alloc); + +void hyperhold_io_free(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_free(hpio); +} +EXPORT_SYMBOL(hyperhold_io_free); + +/* + * find exist read hpio of the extent @eid in iotab and inc its refcnt, + * alloc a new hpio and insert it into iotab if there is no hpio for @eid + */ +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op) +{ + struct hp_space *spc = NULL; + u32 nr_page; + + if (!CHECK_INITED) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + return hpio_get_alloc(eid, nr_page, gfp, op); +} +EXPORT_SYMBOL(hyperhold_io_get); + +bool hyperhold_io_put(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_io_put); + +/* + * notify all threads waiting for this hpio + */ +void hyperhold_io_complete(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_complete(hpio); +} +EXPORT_SYMBOL(hyperhold_io_complete); + +void hyperhold_io_wait(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_wait(hpio); +} +EXPORT_SYMBOL(hyperhold_io_wait); + +bool hyperhold_io_success(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_get_state(hpio) == HPIO_DONE; +} +EXPORT_SYMBOL(hyperhold_io_success); + +int hyperhold_io_extent(struct hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return 
hpio->eid; +} +EXPORT_SYMBOL(hyperhold_io_extent); + +int hyperhold_io_operate(struct hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return hpio->op; +} +EXPORT_SYMBOL(hyperhold_io_operate); + +struct page *hyperhold_io_page(struct hpio *hpio, u32 index) +{ + if (!CHECK_INITED) + return NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return NULL; + + return hpio->pages[index]; +} +EXPORT_SYMBOL(hyperhold_io_page); + +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + if (!CHECK(page, "page is null!\n")) + return false; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return false; + + get_page(page); + atomic64_add(PAGE_SIZE, &mem_used); + BUG_ON(hpio->pages[index]); + hpio->pages[index] = page; + + return true; +} +EXPORT_SYMBOL(hyperhold_io_add_page); + +u32 hyperhold_io_nr_page(struct hpio *hpio) +{ + if (!CHECK_INITED) + return 0; + if (!CHECK(hpio, "hpio is null!\n")) + return 0; + + return hpio->nr_page; +} +EXPORT_SYMBOL(hyperhold_io_nr_page); + +void *hyperhold_io_private(struct hpio *hpio) +{ + if (!CHECK_INITED) + return NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + + return hpio->private; +} +EXPORT_SYMBOL(hyperhold_io_private); + +static void hp_endio_work(struct work_struct *work) +{ + struct hpio *hpio = container_of(work, struct hpio, endio_work); + + if (hpio->endio) + hpio->endio(hpio); +} + +static void hpio_endio(struct bio *bio) +{ + struct hpio *hpio = bio->bi_private; + struct workqueue_struct *wq = NULL; + + pr_info("hpio %p for eid %u returned %d.\n", + hpio, hpio->eid, bio->bi_status); + hpio_set_state(hpio, bio->bi_status ? HPIO_FAIL : HPIO_DONE); + wq = op_is_write(hpio->op) ? 
hyperhold.write_wq : hyperhold.read_wq; + queue_work(wq, &hpio->endio_work); + bio_put(bio); + atomic64_sub(sizeof(struct bio), &mem_used); +} + +static int hpio_submit(struct hpio *hpio) +{ + struct hp_device *dev = NULL; + struct bio *bio = NULL; + u32 ext_size; + sector_t sec; + int i; + + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + if (!bio) { + pr_err("bio alloc failed!\n"); + return -ENOMEM; + } + atomic64_add(sizeof(struct bio), &mem_used); + + dev = device_of(hpio->eid); + bio_set_op_attrs(bio, hpio->op, 0); + bio_set_dev(bio, dev->bdev); + + ext_size = space_of(hpio->eid)->ext_size; + sec = (u64)hpio->eid * ext_size / dev->sec_size; + bio->bi_iter.bi_sector = sec; + for (i = 0; i < hpio->nr_page; i++) { + if (!hpio->pages[i]) + break; + hpio->pages[i]->index = sec; + if (!bio_add_page(bio, hpio->pages[i], PAGE_SIZE, 0)) + goto err; + sec += PAGE_SIZE / dev->sec_size; + } + + bio->bi_private = hpio; + bio->bi_end_io = hpio_endio; + submit_bio(bio); + pr_info("submit hpio %p for eid %u.\n", hpio, hpio->eid); + + return 0; +err: + bio_put(bio); + atomic64_sub(sizeof(struct bio), &mem_used); + return -EIO; +} + +static int rw_extent_async(struct hpio *hpio, hp_endio endio, void *priv, unsigned int op) +{ + int ret = 0; + + if (!hpio_change_state(hpio, HPIO_INIT, HPIO_SUBMIT)) + return -EAGAIN; + + hpio->private = priv; + hpio->endio = endio; + INIT_WORK(&hpio->endio_work, hp_endio_work); + + ret = hpio_submit(hpio); + if (ret) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + } + + return ret; +} + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_ENABLE) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + BUG_ON(!op_is_write(hpio->op)); + + return rw_extent_async(hpio, endio, priv, REQ_OP_WRITE); +} +EXPORT_SYMBOL(hyperhold_write_async); + +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_INITED) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + if (op_is_write(hpio->op)) + return -EAGAIN; + + return rw_extent_async(hpio, endio, priv, REQ_OP_READ); +} +EXPORT_SYMBOL(hyperhold_read_async); + +module_init(hyperhold_init) +module_exit(hyperhold_exit) diff --git a/drivers/hyperhold/hp_device.c b/drivers/hyperhold/hp_device.c new file mode 100644 index 000000000000..0fd81be5ffa8 --- /dev/null +++ b/drivers/hyperhold/hp_device.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_device.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include + +#include "hp_device.h" + +void unbind_bdev(struct hp_device *dev) +{ + int ret; + + if (!dev->bdev) + goto close; + if (!dev->old_block_size) + goto put; + ret = set_blocksize(dev->bdev, dev->old_block_size); + if (ret) + pr_err("set old block size %d failed, err = %d!\n", + dev->old_block_size, ret); + dev->old_block_size = 0; +put: + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + dev->bdev = NULL; +close: + if (dev->filp) + filp_close(dev->filp, NULL); + dev->filp = NULL; + + pr_info("hyperhold bdev unbinded.\n"); +} + +bool bind_bdev(struct hp_device *dev, const char *name) +{ + struct inode *inode = NULL; + int ret; + + dev->filp = filp_open(name, O_RDWR | O_LARGEFILE, 0); + if (IS_ERR(dev->filp)) { + pr_err("open file %s failed, err = %ld!\n", name, PTR_ERR(dev->filp)); + dev->filp = NULL; + goto err; + } + inode = dev->filp->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + pr_err("%s is not a block device!\n", name); + goto err; + } + dev->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, dev); + if (IS_ERR(dev->bdev)) { + ret = PTR_ERR(dev->bdev); + dev->bdev = NULL; + pr_err("get blkdev %s failed, err = %d!\n", name, ret); + goto err; + } + dev->old_block_size = block_size(dev->bdev); + ret = set_blocksize(dev->bdev, PAGE_SIZE); + if (ret) { + pr_err("set %s block size failed, err = %d!\n", name, ret); + goto err; + } + dev->dev_size = (u64)i_size_read(inode); + dev->sec_size = SECTOR_SIZE; + + pr_info("hyperhold bind bdev %s of size %llu / %u succ.\n", + name, dev->dev_size, dev->sec_size); + + return true; +err: + unbind_bdev(dev); + + return false; +} diff --git a/drivers/hyperhold/hp_device.h b/drivers/hyperhold/hp_device.h new file mode 100644 index 000000000000..52d5de370fda --- /dev/null +++ b/drivers/hyperhold/hp_device.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_device.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HP_DEVICE_H_ +#define _HP_DEVICE_H_ + +#include + +struct hp_device { + struct file *filp; + struct block_device *bdev; + u32 old_block_size; + u64 dev_size; + u32 sec_size; +}; + +void unbind_bdev(struct hp_device *dev); +bool bind_bdev(struct hp_device *dev, const char *name); +#endif diff --git a/drivers/hyperhold/hp_iotab.c b/drivers/hyperhold/hp_iotab.c new file mode 100644 index 000000000000..258cb83a16c3 --- /dev/null +++ b/drivers/hyperhold/hp_iotab.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_iotab.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include + +#include "hp_iotab.h" + +atomic64_t hpio_mem = ATOMIC64_INIT(0); +u64 hpio_memory(void) +{ + return atomic64_read(&hpio_mem); +} + +struct hp_iotab { + struct list_head io_list; + rwlock_t lock; + u32 io_cnt; + wait_queue_head_t empty_wq; +}; + +/* store all inflight hpio in iotab */ +struct hp_iotab iotab = { + .io_list = LIST_HEAD_INIT(iotab.io_list), + .lock = __RW_LOCK_UNLOCKED(iotab.lock), + .io_cnt = 0, + .empty_wq = __WAIT_QUEUE_HEAD_INITIALIZER(iotab.empty_wq), +}; + +static struct hpio *__iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + + list_for_each_entry(hpio, &iotab->io_list, list) + if (hpio->eid == eid && kref_get_unless_zero(&hpio->refcnt)) + return hpio; + + return NULL; +} + +static struct hpio *iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + read_lock_irqsave(&iotab->lock, flags); + hpio = __iotab_search_get(iotab, eid); + read_unlock_irqrestore(&iotab->lock, flags); + + pr_info("find hpio %p for eid %u.\n", hpio, eid); + + return hpio; +} + +/* + * insert @hpio into @iotab, cancel insertion if there is a hpio of the same + * @eid, inc the refcnt of duplicated hpio and return it + */ +static struct hpio *iotab_insert(struct hp_iotab *iotab, struct hpio *hpio) +{ + struct hpio *dup = NULL; + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + dup = __iotab_search_get(iotab, hpio->eid); + if (dup) { + pr_info("find exist hpio %p for eid %u, insert hpio %p failed.\n", + dup, hpio->eid, hpio); + goto unlock; + } + list_add(&hpio->list, &iotab->io_list); + iotab->io_cnt++; + pr_info("insert new hpio %p for eid %u.\n", hpio, hpio->eid); +unlock: + write_unlock_irqrestore(&iotab->lock, flags); + + return dup; +} + +static void iotab_delete(struct hp_iotab *iotab, struct hpio *hpio) +{ + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + list_del(&hpio->list); + iotab->io_cnt--; + if (!iotab->io_cnt) + wake_up(&iotab->empty_wq); + write_unlock_irqrestore(&iotab->lock, flags); + + pr_info("delete hpio %p for eid %u from iotab.\n", hpio, hpio->eid); +} + +static void hpio_clear_pages(struct hpio *hpio) +{ + int i; + + if (!hpio->pages) + return; + + for (i = 0; i < hpio->nr_page; i++) + if (hpio->pages[i]) { + put_page(hpio->pages[i]); + atomic64_sub(PAGE_SIZE, &hpio_mem); + } + kfree(hpio->pages); + atomic64_sub(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + hpio->nr_page = 0; + hpio->pages = NULL; +} + +/* + * alloc pages array for @hpio, fill in new alloced pages if @new_page + */ +static bool hpio_fill_pages(struct hpio *hpio, u32 nr_page, gfp_t gfp, bool new_page) +{ + int i; + + BUG_ON(hpio->pages); + hpio->nr_page = nr_page; + hpio->pages = kcalloc(hpio->nr_page, sizeof(struct page *), gfp); + if (!hpio->pages) + goto err; + atomic64_add(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + + if (!new_page) + goto out; + for (i = 0; i < hpio->nr_page; i++) { + hpio->pages[i] = alloc_page(gfp); + if (!hpio->pages[i]) + goto err; + atomic64_add(PAGE_SIZE, &hpio_mem); + } +out: + return true; +err: + hpio_clear_pages(hpio); + + return false; +} + +void hpio_free(struct hpio *hpio) +{ + if (!hpio) + return; + + pr_info("free hpio = %p.\n", hpio); + + hpio_clear_pages(hpio); + kfree(hpio); + atomic64_sub(sizeof(struct hpio), &hpio_mem); +} + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + + hpio = 
kzalloc(sizeof(struct hpio), gfp); + if (!hpio) + goto err; + atomic64_add(sizeof(struct hpio), &hpio_mem); + if (!hpio_fill_pages(hpio, nr_page, gfp, new_page)) + goto err; + hpio->op = op; + atomic_set(&hpio->state, HPIO_INIT); + kref_init(&hpio->refcnt); + init_completion(&hpio->wait); + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} + +struct hpio *hpio_get(u32 eid) +{ + return iotab_search_get(&iotab, eid); +} + +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op) +{ + struct hpio *hpio = NULL; + struct hpio *dup = NULL; + + hpio = iotab_search_get(&iotab, eid); + if (hpio) { + pr_info("find exist hpio %p for eid %u.\n", hpio, eid); + goto out; + } + hpio = hpio_alloc(nr_page, gfp, op, true); + if (!hpio) + goto out; + hpio->eid = eid; + + pr_info("alloc hpio %p for eid %u.\n", hpio, eid); + + dup = iotab_insert(&iotab, hpio); + if (dup) { + hpio_free(hpio); + hpio = dup; + } +out: + return hpio; +} + +static void hpio_release(struct kref *kref) +{ + struct hpio *hpio = container_of(kref, struct hpio, refcnt); + + iotab_delete(&iotab, hpio); + if (hpio->free_extent) + hpio->free_extent(hpio->eid); + hpio_free(hpio); +} + +bool hpio_put(struct hpio *hpio) +{ + pr_info("put hpio %p for eid %u, ref = %u.\n", hpio, hpio->eid, kref_read(&hpio->refcnt)); + return kref_put(&hpio->refcnt, hpio_release); +} + +void hpio_complete(struct hpio *hpio) +{ + pr_info("complete hpio %p for eid %u.\n", hpio, hpio->eid); + complete_all(&hpio->wait); +} + +void hpio_wait(struct hpio *hpio) +{ + wait_for_completion(&hpio->wait); +} + +enum hpio_state hpio_get_state(struct hpio *hpio) +{ + return atomic_read(&hpio->state); +} + +void hpio_set_state(struct hpio *hpio, enum hpio_state state) +{ + atomic_set(&hpio->state, state); +} + +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to) +{ + return atomic_cmpxchg(&hpio->state, from, to) == from; +} + +static void dump_iotab(struct hp_iotab *iotab) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + pr_info("dump inflight hpio in iotab.\n"); + read_lock_irqsave(&iotab->lock, flags); + list_for_each_entry(hpio, &iotab->io_list, list) + pr_info("hpio %p for eid %u is inflight.\n", hpio, hpio->eid); + read_unlock_irqrestore(&iotab->lock, flags); +} + +void wait_for_iotab_empty(void) +{ + dump_iotab(&iotab); + wait_event(iotab.empty_wq, !iotab.io_cnt); +} diff --git a/drivers/hyperhold/hp_iotab.h b/drivers/hyperhold/hp_iotab.h new file mode 100644 index 000000000000..a2f03620af13 --- /dev/null +++ b/drivers/hyperhold/hp_iotab.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_iotab.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
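+ *
+ * Public hpio interface. A typical lifecycle, as a sketch only (the
+ * real callers live outside this patch):
+ *
+ *	hpio = hpio_get_alloc(eid, nr_page, GFP_NOIO, REQ_OP_WRITE);
+ *	... fill the hpio pages, submit with hyperhold_write_async() ...
+ *	hpio_wait(hpio);	// or rely on the endio callback
+ *	hpio_put(hpio);		// the last put removes it from the iotab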
+ */ + +#ifndef _HP_IOTAB_H_ +#define _HP_IOTAB_H_ + +#include +#include +#include +#include + +enum hpio_state { + HPIO_INIT, + HPIO_SUBMIT, + HPIO_DONE, + HPIO_FAIL, +}; + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +struct hpio { + u32 eid; + struct page **pages; + u32 nr_page; + void *private; + + unsigned int op; + void (*free_extent)(u32 eid); + + atomic_t state; + struct kref refcnt; + struct completion wait; + hp_endio endio; + struct work_struct endio_work; + + struct list_head list; +}; + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page); +void hpio_free(struct hpio *hpio); + +struct hpio *hpio_get(u32 eid); +bool hpio_put(struct hpio *hpio); +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op); + +void hpio_complete(struct hpio *hpio); +void hpio_wait(struct hpio *hpio); + +enum hpio_state hpio_get_state(struct hpio *hpio); +void hpio_set_state(struct hpio *hpio, enum hpio_state state); +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to); + +void wait_for_iotab_empty(void); + +u64 hpio_memory(void); +#endif diff --git a/drivers/hyperhold/hp_space.c b/drivers/hyperhold/hp_space.c new file mode 100644 index 000000000000..95d42d064290 --- /dev/null +++ b/drivers/hyperhold/hp_space.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_space.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include + +#include "hp_space.h" + +atomic64_t spc_mem = ATOMIC64_INIT(0); + +u64 space_memory(void) +{ + return atomic64_read(&spc_mem); +} + +void deinit_space(struct hp_space *spc) +{ + kvfree(spc->bitmap); + atomic64_sub(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + spc->ext_size = 0; + spc->nr_ext = 0; + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + + pr_info("hyperhold space deinited.\n"); +} + +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size) +{ + if (ext_size & (PAGE_SIZE - 1)) { + pr_err("extent size %u do not align to page size %lu!", ext_size, PAGE_SIZE); + return false; + } + if (dev_size & (ext_size - 1)) { + pr_err("device size %llu do not align to extent size %u!", dev_size, ext_size); + return false; + } + spc->ext_size = ext_size; + spc->nr_ext = dev_size / ext_size; + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + init_waitqueue_head(&spc->empty_wq); + spc->bitmap = kvzalloc(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), GFP_KERNEL); + if (!spc->bitmap) { + pr_err("hyperhold bitmap alloc failed.\n"); + return false; + } + atomic64_add(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + + pr_info("hyperhold space init succ, capacity = %u x %u.\n", ext_size, spc->nr_ext); + + return true; +} + +int alloc_eid(struct hp_space *spc) +{ + u32 bit; + u32 last_bit; + +retry: + last_bit = atomic_read(&spc->last_alloc_bit); + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, last_bit); + if (bit == spc->nr_ext) + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, 0); + if (bit == spc->nr_ext) + goto full; + if (test_and_set_bit(bit, spc->bitmap)) + goto retry; + + atomic_set(&spc->last_alloc_bit, bit); + atomic_inc(&spc->nr_alloced); + + pr_info("hyperhold alloc extent %u.\n", bit); + + return bit; +full: + pr_err("hyperhold space is full.\n"); + + return -ENOSPC; +} + +void free_eid(struct hp_space *spc, u32 eid) +{ + if (!test_and_clear_bit(eid, spc->bitmap)) { + pr_err("eid is not alloced!\n"); + BUG(); + 
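+		/* keep nr_alloced untouched for an eid that was never allocated */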
return; + } + if (atomic_dec_and_test(&spc->nr_alloced)) { + pr_info("notify space empty.\n"); + wake_up(&spc->empty_wq); + } + pr_info("hyperhold free extent %u.\n", eid); +} + +static void dump_space(struct hp_space *spc) +{ + u32 i = 0; + + pr_info("dump alloced extent in space.\n"); + for (i = 0; i < spc->nr_ext; i++) + if (test_bit(i, spc->bitmap)) + pr_info("alloced eid %u.\n", i); +} + +bool wait_for_space_empty(struct hp_space *spc, bool force) +{ + if (!atomic_read(&spc->nr_alloced)) + return true; + if (!force) + return false; + + dump_space(spc); + wait_event(spc->empty_wq, !atomic_read(&spc->nr_alloced)); + + return true; +} diff --git a/drivers/hyperhold/hp_space.h b/drivers/hyperhold/hp_space.h new file mode 100644 index 000000000000..caaaf92a07f7 --- /dev/null +++ b/drivers/hyperhold/hp_space.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_space.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HP_SPACE_H_ +#define _HP_SPACE_H_ + +#include + +struct hp_space { + u32 ext_size; + u32 nr_ext; + unsigned long *bitmap; + atomic_t last_alloc_bit; + atomic_t nr_alloced; + wait_queue_head_t empty_wq; +}; + +void deinit_space(struct hp_space *spc); +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size); +int alloc_eid(struct hp_space *spc); +void free_eid(struct hp_space *spc, u32 eid); + +bool wait_for_space_empty(struct hp_space *spc, bool force); + +u64 space_memory(void); +#endif diff --git a/drivers/hyperhold/hyperhold.h b/drivers/hyperhold/hyperhold.h new file mode 100644 index 000000000000..b65ff5444513 --- /dev/null +++ b/drivers/hyperhold/hyperhold.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hyperhold.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
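+ *
+ * External interface of the hyperhold driver: enable/disable control,
+ * extent allocation (hyperhold_alloc_extent()/hyperhold_free_extent()),
+ * eid/offset address conversion helpers and the hpio-based asynchronous
+ * read/write API.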
+ */ + +#ifndef _HYPERHOLD_H_ +#define _HYPERHOLD_H_ + +#include + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +void hyperhold_disable(bool force); +void hyperhold_enable(void); +bool is_hyperhold_enable(void); + +u32 hyperhold_nr_extent(void); +u32 hyperhold_extent_size(u32 eid); +long hyperhold_address(u32 eid, u32 offset); +int hyperhold_addr_extent(u64 addr); +int hyperhold_addr_offset(u64 addr); + +int hyperhold_alloc_extent(void); +void hyperhold_free_extent(u32 eid); +void hyperhold_should_free_extent(u32 eid); + +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page); +void hyperhold_io_free(struct hpio *hpio); + +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op); +bool hyperhold_io_put(struct hpio *hpio); + +void hyperhold_io_complete(struct hpio *hpio); +void hyperhold_io_wait(struct hpio *hpio); + +bool hyperhold_io_success(struct hpio *hpio); + +int hyperhold_io_extent(struct hpio *hpio); +int hyperhold_io_operate(struct hpio *hpio); +struct page *hyperhold_io_page(struct hpio *hpio, u32 index); +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page); +u32 hyperhold_io_nr_page(struct hpio *hpio); +void *hyperhold_io_private(struct hpio *hpio); + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv); +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv); + +#endif diff --git a/include/linux/hyperhold_inf.h b/include/linux/hyperhold_inf.h new file mode 100644 index 000000000000..7d2bd1e88c1c --- /dev/null +++ b/include/linux/hyperhold_inf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/hyperhold_inf.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef HYPERHOLD_INF_H +#define HYPERHOLD_INF_H + +#ifdef CONFIG_HYPERHOLD + +extern bool is_hyperhold_enable(void); + +#else + +static inline is_hyperhold_enable(void) +{ + return false; +} +#endif + +#endif -- Gitee From 850fd45a70738814f4da04540d6c5da7c0845b0e Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 13 Jan 2022 16:06:00 +0800 Subject: [PATCH 005/113] hyperhold: add support hyperhold file lru and add memcg management for hyperhold ohos inclusion category: feature issue: #I4RXQ3 CVE: NA ----------------- Put file page into pgdat lru list instead of memcg lru list for management in hyperhold. Add some interfaces in memcg for management in hyperhold. Signed-off-by: Chen Wandun --- include/linux/memcg_policy.h | 40 +++ include/linux/memcontrol.h | 50 ++++ include/linux/mmzone.h | 12 + include/linux/swap.h | 15 + include/linux/vm_event_item.h | 4 + mm/Kconfig | 17 ++ mm/Makefile | 2 + mm/internal.h | 122 ++++++++ mm/memcg_control.c | 428 ++++++++++++++++++++++++++++ mm/memcg_reclaim.c | 516 ++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 64 ++++- mm/page_alloc.c | 3 + mm/swap.c | 6 + mm/vmscan.c | 223 +++++++-------- mm/vmstat.c | 4 + mm/workingset.c | 72 ++++- 16 files changed, 1460 insertions(+), 118 deletions(-) create mode 100644 include/linux/memcg_policy.h create mode 100644 mm/memcg_control.c create mode 100644 mm/memcg_reclaim.c diff --git a/include/linux/memcg_policy.h b/include/linux/memcg_policy.h new file mode 100644 index 000000000000..8dd856ead095 --- /dev/null +++ b/include/linux/memcg_policy.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/memcg_policy.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
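+ *
+ * Declarations for the hyperhold memcg reclaim policy: the score_list
+ * iterator (get_next_memcg()/get_next_memcg_break()), the per-memcg
+ * reclaim parameters in struct memcg_reclaim (app_score,
+ * ub_ufs2zram_ratio) and the shrink entry points called from vmscan.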
+ * + */ +#ifndef _MEMCG_POLICY_H +#define _MEMCG_POLICY_H + +struct mem_cgroup; +struct pglist_data; +struct scan_control; + + +extern struct list_head score_head; +extern bool score_head_inited; +extern spinlock_t score_list_lock; +extern struct cgroup_subsys memory_cgrp_subsys; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr); +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc); +#endif /* CONFIG_HYPERHOLD_FILE_LRU */ + +#ifdef CONFIG_HYPERHOLD_MEMCG +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev); +void get_next_memcg_break(struct mem_cgroup *memcg); +void memcg_app_score_update(struct mem_cgroup *target); + +struct memcg_reclaim { + atomic64_t app_score; + atomic64_t ub_ufs2zram_ratio; +}; +#define MAX_APP_SCORE 1000 +#endif + + +#endif /* _LINUX_MEMCG_POLICY_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4b975111b536..2469ca802798 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include #include #include +#include struct mem_cgroup; struct obj_cgroup; @@ -53,6 +54,11 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +static inline bool is_prot_page(struct page *page) +{ + return false; +} + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -295,6 +301,13 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; +#ifdef CONFIG_HYPERHOLD_MEMCG + struct list_head score_node; +#define MEM_CGROUP_NAME_MAX_LEN 100 + char name[MEM_CGROUP_NAME_MAX_LEN]; + struct memcg_reclaim memcg_reclaimed; +#endif + #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -549,6 +562,10 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return -1; +#endif return memcg->id.id; } @@ -566,6 +583,11 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) if (mem_cgroup_disabled()) return NULL; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return NULL; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } @@ -763,6 +785,10 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = atomic_long_read(&pn->lruvec_stat[idx]); #ifdef CONFIG_SMP @@ -782,6 +808,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); @@ -830,6 +861,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static __always_inline bool is_file_page(struct page *page) +{ + if (!PageUnevictable(page) && !PageSwapBacked(page) && page_mapping(page)) + return true; + + return false; + +} +#endif + static inline void __mod_lruvec_page_state(struct page 
*page, enum node_stat_item idx, int val) { @@ -837,6 +879,14 @@ static inline void __mod_lruvec_page_state(struct page *page, pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_page(page) && !is_prot_page(page)) { + __mod_node_page_state(pgdat, idx, val); + return; + + } +#endif + /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ac2799dcb4a..b73098363526 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -829,6 +829,11 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) +{ + return &pgdat->__lruvec; +} + static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -875,6 +880,13 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static inline int is_node_lruvec(struct lruvec *lruvec) +{ + return &lruvec_pgdat(lruvec)->__lruvec == lruvec; +} +#endif + extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORYLESS_NODES diff --git a/include/linux/swap.h b/include/linux/swap.h index fbc6805358da..970062c26311 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -380,7 +380,22 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +struct scan_control; + +extern unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc); +extern bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru); +extern bool cgroup_reclaim(struct scan_control *sc); extern void check_move_unevictable_pages(struct pagevec *pvec); +extern unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); +extern bool writeback_throttling_sane(struct scan_control *sc); +extern inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc); + +extern int current_may_throttle(void); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..c64c3d73893c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -120,6 +120,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + FREEZE_RECLAIMED, + FREEZE_RECLAIME_COUNT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/Kconfig b/mm/Kconfig index ed97e8ddd70b..27d50ee6e19f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -63,6 +63,23 @@ config SPARSEMEM_MANUAL endchoice +config HYPERHOLD_FILE_LRU + bool "Enable HyperHold FILE LRU" + depends on HYPERHOLD && MEMCG + select HYPERHOLD_MEMCG + default n + help + File-LRU is a mechanism that put file page in global lru list, + and anon page in memcg lru list(if MEMCG is enable), what's + more, recliam of anonymous pages and file page are separated. 
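+	  Since this option selects HYPERHOLD_MEMCG, a defconfig fragment
+	  only needs the line below (assuming HYPERHOLD and MEMCG are
+	  already enabled):
+	    CONFIG_HYPERHOLD_FILE_LRU=y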
+ +config HYPERHOLD_MEMCG + bool "Enable Memcg Management in HyperHold" + depends on HYPERHOLD && MEMCG + help + Add more attributes in memory cgroup, these attribute is used + to show information, shrink memory, swapin page and so on. + config DISCONTIGMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c..6a80a70eff30 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -120,3 +120,5 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o +obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o diff --git a/mm/internal.h b/mm/internal.h index 840b8a330b9a..cb3d736d1856 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* * The set of flags that only affect watermark checking and reclaim @@ -32,6 +34,121 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +enum reclaim_invoker { + ALL, + KSWAPD, + ZSWAPD, + DIRECT_RECLAIM, + NODE_RECLAIM, + SOFT_LIMIT, + RCC_RECLAIM, + FILE_RECLAIM, + ANON_RECLAIM +}; + +struct scan_control { + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; + + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + + /* Can active pages be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. 
+ */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file pages on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate pages for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; + + enum reclaim_invoker invoker; + u32 isolate_count; + unsigned long nr_scanned_anon; + unsigned long nr_scanned_file; + unsigned long nr_reclaimed_anon; + unsigned long nr_reclaimed_file; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + void page_writeback_init(void); vm_fault_t do_swap_page(struct vm_fault *vmf); @@ -110,6 +227,11 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + struct reclaim_stat *stat, + bool ignore_references); /* * in mm/rmap.c: diff --git a/mm/memcg_control.c b/mm/memcg_control.c new file mode 100644 index 000000000000..d56a2ba665b6 --- /dev/null +++ b/mm/memcg_control.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include +#include "internal.h" + +#include "zswapd_internal.h" + +#ifdef CONFIG_HYPERHOLD_MEMCG + +struct list_head score_head; +bool score_head_inited; +DEFINE_SPINLOCK(score_list_lock); +DEFINE_MUTEX(reclaim_para_lock); + +/** + * get_next_memcg - iterate over memory cgroup score_list + * @prev: previously returned memcg, NULL on first invocation + * + * Returns references to the next memg on score_list of @prev, + * or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use get_next_memcg_break() + * to cancel a walk before the round-trip is complete. 
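+ *
+ * Example of the iteration pattern (as used by
+ * memcg_total_info_per_app_show() and shrink_anon(); "done" stands for
+ * a hypothetical early-exit condition):
+ *
+ *	struct mem_cgroup *memcg = NULL;
+ *
+ *	while ((memcg = get_next_memcg(memcg))) {
+ *		if (done) {
+ *			get_next_memcg_break(memcg);
+ *			break;
+ *		}
+ *	}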
+ */ +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + spin_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!prev)) + pos = &score_head; + else + pos = &(prev->score_node); + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->next == &score_head) + goto unlock; + + memcg = list_entry(pos->next, + struct mem_cgroup, score_node); + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + spin_unlock_irqrestore(&score_list_lock, flags); + + if (prev) + css_put(&prev->css); + + return memcg; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + spin_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!next)) + pos = &score_head; + else + pos = &next->score_node; + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->prev == &score_head) + goto unlock; + + memcg = list_entry(pos->prev, + struct mem_cgroup, score_node); + + if (unlikely(!memcg)) + goto unlock; + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + spin_unlock_irqrestore(&score_list_lock, flags); + + if (next) + css_put(&next->css); + return memcg; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ + struct list_head *pos = NULL; + struct list_head *tmp; + unsigned long flags; + + spin_lock_irqsave(&score_list_lock, flags); + list_for_each_prev_safe(pos, tmp, &score_head) { + struct mem_cgroup *memcg = list_entry(pos, + struct mem_cgroup, score_node); + if (atomic64_read(&memcg->memcg_reclaimed.app_score) < + atomic64_read(&target->memcg_reclaimed.app_score)) + break; + } + list_move_tail(&target->score_node, pos); + spin_unlock_irqrestore(&score_list_lock, flags); +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.app_score); +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > MAX_APP_SCORE) + return -EINVAL; + + if (atomic64_read(&memcg->memcg_reclaimed.app_score) != val) { + atomic64_set(&memcg->memcg_reclaimed.app_score, val); + memcg_app_score_update(memcg); + } + + return 0; +} + +static unsigned long move_pages_to_page_list(struct lruvec *lruvec, enum lru_list lru, + struct list_head *page_list) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_isolated = 0; + struct page *page; + + while (!list_empty(src)) { + page = lru_to_page(src); + + if (PageUnevictable(page)) + continue; + + if (likely(get_page_unless_zero(page))) { + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + put_page(page); + + } else { + continue; + } + + + if (PageUnevictable(page)) { + putback_lru_page(page); + continue; + } + + if (PageAnon(page) && !PageSwapBacked(page)) { + putback_lru_page(page); + continue; + } + + list_add(&page->lru, page_list); + nr_isolated++; + } + + return nr_isolated; +} + + +unsigned long reclaim_all_anon_memcg(struct pglist_data 
*pgdat, struct mem_cgroup *memcg) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed; + LIST_HEAD(page_list); + struct page *page; + struct reclaim_stat stat = {}; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + count_vm_event(FREEZE_RECLAIME_COUNT); + move_pages_to_page_list(lruvec, LRU_INACTIVE_ANON, &page_list); + + nr_reclaimed = shrink_page_list(&page_list, pgdat, &sc, &stat, true); + count_vm_event(FREEZE_RECLAIMED); + + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +static ssize_t memcg_force_shrink_anon(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct pglist_data *pgdat; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + reclaim_all_anon_memcg(pgdat, memcg); + } + + return nbytes; +} + +static int memcg_name_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%s\n", memcg->name); + return 0; +} + +static ssize_t memcg_name_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + buf = strstrip(buf); + if (nbytes >= MEM_CGROUP_NAME_MAX_LEN) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + if (memcg) + strcpy(memcg->name, buf); + mutex_unlock(&reclaim_para_lock); + + return nbytes; +} + +static int memcg_total_info_per_app_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = NULL; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon_size; + unsigned long zram_compress_size; + unsigned long eswap_compress_size; + + + while ((memcg = get_next_memcg(memcg))) { + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + zram_compress_size = memcg_data_size(memcg, CACHE_SIZE); + eswap_compress_size = memcg_data_size(memcg, SWAP_SIZE); + anon_size *= PAGE_SIZE / SZ_1K; + zram_compress_size /= SZ_1K; + eswap_compress_size /= SZ_1K; + + if (!strlen(memcg->name)) + continue; + + seq_printf(m, "%s %lu %lu %lu\n", memcg->name, anon_size, + zram_compress_size, eswap_compress_size); + } + + return 0; +} + +static int memcg_ub_ufs2zram_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + const unsigned int ratio = 100; + + if (val > ratio) + return -EINVAL; + + atomic64_set(&memcg->memcg_reclaimed.ub_ufs2zram_ratio, val); + + return 0; +} + +static u64 memcg_ub_ufs2zram_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); +} + +static int memcg_force_swapin_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + unsigned long size; + const unsigned int ratio = 100; + + size = memcg_data_size(memcg, SWAP_SIZE); + size = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size / ratio; + + swapin_memcg(memcg, 
size); + + return 0; +} + +static struct cftype memcg_policy_files[] = { + { + .name = "name", + .write = memcg_name_write, + .seq_show = memcg_name_show, + }, + { + .name = "ub_ufs2zram_ratio", + .write_u64 = memcg_ub_ufs2zram_ratio_write, + .read_u64 = memcg_ub_ufs2zram_ratio_read, + }, + { + .name = "total_info_per_app", + .seq_show = memcg_total_info_per_app_show, + }, + { + .name = "app_score", + .write_u64 = mem_cgroup_app_score_write, + .read_u64 = mem_cgroup_app_score_read, + }, + { + .name = "force_shrink_anon", + .write = memcg_force_shrink_anon + }, + { + .name = "force_swapin", + .write_u64 = memcg_force_swapin_write, + }, + { }, /* terminate */ +}; + +static int __init memcg_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memcg_policy_files)); + + return 0; +} +subsys_initcall(memcg_policy_init); +#else +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + return NULL; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ +} + + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + return NULL; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return 0; +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return 0; +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ +} +#endif diff --git a/mm/memcg_reclaim.c b/mm/memcg_reclaim.c new file mode 100644 index 000000000000..f88826c13ae2 --- /dev/null +++ b/mm/memcg_reclaim.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_reclaim.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#include "internal.h" +#endif + +static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness) +{ + return !sc->may_swap || !swappiness || !get_nr_swap_pages(); +} + +/* + * From 0 .. 100. Higher means more swappy. + */ +#define HYPERHOLD_SWAPPINESS 100 + +static int get_hyperhold_swappiness(void) +{ + return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness; +} + +static void get_scan_count_hyperhold(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) +{ + int swappiness = get_hyperhold_swappiness(); + struct lruvec *lruvec = node_lruvec(pgdat); + u64 fraction[2]; + u64 denominator; + enum scan_balance scan_balance; + unsigned long ap, fp; + enum lru_list lru; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long anon_cost, file_cost, total_cost; + unsigned long total_high_wmark = 0; + + + if (cgroup_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). 
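+	 * (By then sc->priority has decayed to 0, so the full LRU size is
+	 * eligible for scanning in a single pass.)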
+ */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + if (!cgroup_reclaim(sc)) { + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) && + (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + sc->reclaim_idx) >> + (unsigned int)sc->priority)) { + scan_balance = SCAN_ANON; + goto out; + } + } + } + + /* + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. + */ + + if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) && + !inactive_is_low(lruvec, LRU_INACTIVE_FILE) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp; + +out: + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long lruvec_size; + unsigned long scan; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = lruvec_size; + *lru_pages += scan; + scan >>= sc->priority; + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. 
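+			 * For example, with swappiness == 100 (the value
+			 * hyperhold uses when enabled) and equal anon/file
+			 * cost, fraction[0] == fraction[1] and each list
+			 * ends up scanned at roughly scan / 2.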
+ */ + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + + nr[lru] = scan; + } +} + +#define ISOLATE_LIMIT_CNT 5 +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) + break; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_anon += nr_reclaimed; +} + +static void shrink_anon(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + unsigned long reclaimed; + unsigned long scanned; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + unsigned long nr_memcg[NR_LRU_LISTS]; + unsigned long nr_node_active = lruvec_lru_size( + node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES); + unsigned long nr_node_inactive = lruvec_lru_size( + node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES); + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_active + 1); + nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_inactive + 1); + nr_memcg[LRU_ACTIVE_FILE] = 0; + nr_memcg[LRU_INACTIVE_FILE] = 0; + + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + + mem_cgroup_calculate_protection(target_memcg, memcg); + + if (mem_cgroup_below_min(memcg)) { + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. 
+ */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + } + + shrink_anon_memcg(pgdat, memcg, sc, nr_memcg); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) { + get_next_memcg_break(memcg); + break; + } + } +} + +static void shrink_file(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = node_lruvec(pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, + nr_to_scan, + lruvec, sc); + } + } + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_file += nr_reclaimed; +} + +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc) +{ + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; + unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + do { + /* Get scan count for file and anon */ + unsigned long node_lru_pages = 0; + unsigned long nr[NR_LRU_LISTS] = {0}; + + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost; + sc->file_cost = node_lruvec(pgdat)->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + refaults = lruvec_page_state(node_lruvec(pgdat), + WORKINGSET_ACTIVATE_FILE); + if (refaults != node_lruvec(pgdat)->refaults[1] || + inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#else + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#endif + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. 
+ */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE); +#else + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +#endif + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } + + get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages); + + /* Shrink the Total-File-LRU */ + shrink_file(pgdat, sc, nr); + + /* Shrink Anon by iterating score_list */ + shrink_anon(pgdat, sc, nr); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. 
Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + + return reclaimable; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 167169b3907d..47f5d8d61d2a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -666,7 +666,15 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0); + struct lruvec *lruvec = &mz->lruvec; + unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES); +#else unsigned long nr_pages = page_counter_read(&memcg->memory); +#endif unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; @@ -854,8 +862,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ - if (!mem_cgroup_disabled()) + if (!mem_cgroup_disabled()) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif __mod_memcg_lruvec_state(lruvec, idx, val); + } } void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) @@ -906,6 +919,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return; +#endif x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { @@ -1350,6 +1367,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_lru(page_lru(page)) && + !is_prot_page(page)) { + lruvec = node_lruvec(pgdat); + goto out; + } +#endif memcg = page->mem_cgroup; /* * Swapcache readahead pages are added to the LRU - and @@ -1392,6 +1416,10 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, if (mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = &mz->lru_zone_size[zid][lru]; @@ -5191,6 +5219,10 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (id == -1) + return NULL; +#endif return idr_find(&mem_cgroup_idr, id); } @@ -5229,6 +5261,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; + pn->lruvec.pgdat = NODE_DATA(node); pn->on_tree = false; pn->memcg = memcg; @@ -5334,6 +5367,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 
memcg->deferred_split_queue.split_queue_len = 0; #endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + if (unlikely(!score_head_inited)) { + INIT_LIST_HEAD(&score_head); + score_head_inited = true; + } +#endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + INIT_LIST_HEAD(&memcg->score_node); +#endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: @@ -5355,6 +5399,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); +#ifdef CONFIG_HYPERHOLD_MEMCG + atomic64_set(&memcg->memcg_reclaimed.app_score, 300); +#endif + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); @@ -5421,6 +5469,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } +#ifdef CONFIG_HYPERHOLD_MEMCG + memcg_app_score_update(memcg); + css_get(css); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); @@ -5432,6 +5485,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; +#ifdef CONFIG_HYPERHOLD_MEMCG + unsigned long flags; + + spin_lock_irqsave(&score_list_lock, flags); + list_del_init(&memcg->score_node); + spin_unlock_irqrestore(&score_list_lock, flags); + css_put(css); +#endif + /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83c0146cb59e..843bf87e12a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6932,6 +6932,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pgdat->__lruvec.pgdat = pgdat; +#endif } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/swap.c b/mm/swap.c index 47a47681c86b..4ea819c7a9e4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -311,6 +311,12 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) void lru_note_cost_page(struct page *page) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (page_is_file_lru(page)) { + lru_note_cost(&(page_pgdat(page)->__lruvec), 1, thp_nr_pages(page)); + return; + } +#endif lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), page_is_file_lru(page), thp_nr_pages(page)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9f292132ed88..0c1c2c9d66fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,97 +63,9 @@ #define CREATE_TRACE_POINTS #include -struct scan_control { - /* How many pages shrink_list() should reclaim */ - unsigned long nr_to_reclaim; - - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; - - /* - * The memory cgroup that hit its limit and as a result is the - * primary target of this reclaim invocation. - */ - struct mem_cgroup *target_mem_cgroup; - - /* - * Scan pressure balancing between anon and file LRUs - */ - unsigned long anon_cost; - unsigned long file_cost; - - /* Can active pages be deactivated as part of reclaim? 
*/ -#define DEACTIVATE_ANON 1 -#define DEACTIVATE_FILE 2 - unsigned int may_deactivate:2; - unsigned int force_deactivate:1; - unsigned int skipped_deactivate:1; - - /* Writepage batching in laptop mode; RECLAIM_WRITE */ - unsigned int may_writepage:1; - - /* Can mapped pages be reclaimed? */ - unsigned int may_unmap:1; - - /* Can pages be swapped as part of reclaim? */ - unsigned int may_swap:1; - - /* - * Cgroup memory below memory.low is protected as long as we - * don't threaten to OOM. If any cgroup is reclaimed at - * reduced force or passed over entirely due to its memory.low - * setting (memcg_low_skipped), and nothing is reclaimed as a - * result, then go back for one more cycle that reclaims the protected - * memory (memcg_low_reclaim) to avert OOM. - */ - unsigned int memcg_low_reclaim:1; - unsigned int memcg_low_skipped:1; - - unsigned int hibernation_mode:1; - - /* One of the zones is ready for compaction */ - unsigned int compaction_ready:1; - - /* There is easily reclaimable cold cache in the current node */ - unsigned int cache_trim_mode:1; - - /* The file pages on the current node are dangerously low */ - unsigned int file_is_tiny:1; - - /* Allocation order */ - s8 order; - - /* Scan (total_size >> priority) pages at once */ - s8 priority; - - /* The highest zone to isolate pages for reclaim from */ - s8 reclaim_idx; - - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - struct { - unsigned int dirty; - unsigned int unqueued_dirty; - unsigned int congested; - unsigned int writeback; - unsigned int immediate; - unsigned int file_taken; - unsigned int taken; - } nr; - - /* for recording the reclaimed slab by now */ - struct reclaim_state reclaim_state; -}; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#endif #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_page(_page, _base, _field) \ @@ -169,6 +81,10 @@ struct scan_control { #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif +#ifdef CONFIG_HYPERHOLD_FILE_LRU +unsigned int enough_inactive_file = 1; +#endif + /* * From 0 .. 200. Higher means more swappy. */ @@ -230,7 +146,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } @@ -248,7 +164,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. 
*/ -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; @@ -268,12 +184,12 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return false; } -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { return true; } @@ -308,6 +224,20 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone unsigned long size = 0; int zid; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!mem_cgroup_disabled() && is_node_lruvec(lruvec)) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + + if (!managed_zone(zone)) + continue; + + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } + + return size; + } +#endif for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; @@ -638,9 +568,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * * Returns the number of reclaimed slab objects. */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) +unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; @@ -1064,11 +994,11 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) +unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1845,6 +1775,10 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, LIST_HEAD(pages_to_free); struct page *page; enum lru_list lru; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + bool prot; + bool file; +#endif while (!list_empty(list)) { page = lru_to_page(list); @@ -1878,8 +1812,23 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (PageActive(page)) { + prot = is_prot_page(page); + file = page_is_file_lru(page); + if (!prot && file) { + lruvec = node_lruvec(pgdat); + workingset_age_nonresident(lruvec, + nr_pages); + } else { + workingset_age_nonresident(lruvec, + nr_pages); + } + } +#else if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); +#endif } } @@ -1897,7 +1846,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ -static int current_may_throttle(void) +int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || @@ -1926,6 +1875,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stalled) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + sc->isolate_count++; +#endif /* wait a bit for the reclaimer. 
*/ msleep(100); stalled = true; @@ -1961,7 +1913,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (file) + lru_note_cost(node_lruvec(pgdat), file, stat.nr_pageout); + else + lru_note_cost(lruvec, file, stat.nr_pageout); +#else lru_note_cost(lruvec, file, stat.nr_pageout); + +#endif + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); @@ -2150,7 +2111,7 @@ unsigned long reclaim_pages(struct list_head *page_list) return nr_reclaimed; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, +unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { @@ -2192,7 +2153,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) +bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; @@ -2211,13 +2172,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) return inactive * inactive_ratio < active; } -enum scan_balance { - SCAN_EQUAL, - SCAN_FRACT, - SCAN_ANON, - SCAN_FILE, -}; - /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -2227,6 +2181,7 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { @@ -2536,6 +2491,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } +#endif /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) @@ -2555,9 +2511,9 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. 
*/ -static inline bool should_continue_reclaim(struct pglist_data *pgdat, - unsigned long nr_reclaimed, - struct scan_control *sc) +inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; @@ -2608,6 +2564,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2856,6 +2813,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (reclaimable) pgdat->kswapd_failures = 0; } +#endif /* * Returns true if compaction should go ahead for a costly-order request, or @@ -2972,7 +2930,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(zone->zone_pgdat, sc); +#else shrink_node(zone->zone_pgdat, sc); +#endif } /* @@ -2987,6 +2949,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct lruvec *lruvec; + + lruvec = node_lruvec(pgdat); + lruvec->refaults[0] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON); /* modified */ + lruvec->refaults[1] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE); /* modified */ +#endif + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; @@ -3291,6 +3261,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + unsigned long nr[NR_LRU_LISTS]; +#endif WARN_ON_ONCE(!current->reclaim_state); @@ -3307,7 +3280,17 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + nr[LRU_ACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_INACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + shrink_anon_memcg(pgdat, memcg, &sc, nr); +#else shrink_lruvec(lruvec, &sc); +#endif trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -3512,7 +3495,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, sc); +#else shrink_node(pgdat, sc); +#endif /* * Fragmentation may mean that the system cannot be rebalanced for @@ -4198,7 +4185,11 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. 
*/ do { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, &sc); +#else shrink_node(pgdat, &sc); +#endif } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..b362b22b30d8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1350,6 +1350,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_HYPERHOLD_MEMCG + "freeze_reclaimed", + "freeze_reclaim_count", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..28d9bf0c5e5d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -263,7 +263,16 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(!PageLocked(page), page); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && page_is_file_lru(page)) { + lruvec = node_lruvec(pgdat); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); +#endif /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); @@ -313,9 +322,19 @@ void workingset_refault(struct page *page, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (memcgid == -1) + eviction_lruvec = node_lruvec(pgdat); + else { + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) + goto out; + } +#else eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) goto out; +#endif eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -347,8 +366,15 @@ void workingset_refault(struct page *page, void *shadow) */ memcg = page_memcg(page); lruvec = mem_cgroup_lruvec(memcg, pgdat); - +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) + inc_lruvec_state(node_lruvec(pgdat), + WORKINGSET_REFAULT_BASE + file); + else + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); +#else inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); +#endif /* * Compare the distance to the existing workingset size. We @@ -357,10 +383,21 @@ void workingset_refault(struct page *page, void *shadow) * workingset competition needs to consider anon or not depends * on having swap. 
*/ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE); +#else workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); +#endif + if (!file) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size += lruvec_page_state(node_lruvec(pgdat), + NR_INACTIVE_FILE); +#else + workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); +#endif } if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, @@ -374,8 +411,19 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) { + workingset_age_nonresident(node_lruvec(pgdat), + thp_nr_pages(page)); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); +#endif /* Page was active prior to eviction */ if (workingset) { @@ -384,7 +432,14 @@ void workingset_refault(struct page *page, void *shadow) spin_lock_irq(&page_pgdat(page)->lru_lock); lru_note_cost_page(page); spin_unlock_irq(&page_pgdat(page)->lru_lock); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) + inc_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file); + else + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); +#else inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); +#endif } out: rcu_read_unlock(); @@ -411,7 +466,16 @@ void workingset_activation(struct page *page) if (!mem_cgroup_disabled() && !memcg) goto out; lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && page_is_file_lru(page)) { + lruvec = node_lruvec(page_pgdat(page)); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); +#endif out: rcu_read_unlock(); } @@ -487,6 +551,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG +#ifdef CONFIG_HYPERHOLD_FILE_LRU + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); +#else + if (sc->memcg) { struct lruvec *lruvec; int i; @@ -500,6 +569,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else +#endif #endif pages = node_present_pages(sc->nid); -- Gitee From 9f64322026fe2bd9367030889bf7cd8eb8d54a21 Mon Sep 17 00:00:00 2001 From: CY Fan Date: Sat, 22 Jan 2022 17:33:57 +0800 Subject: [PATCH 006/113] hyperhold: add zswapd to actively reclaim and swap out anonymous pages ohos inclusion category: feature issue: #I4RXQ3 CVE: NA ----------------- zswapd periodically reclaims anonymous pages in the background and swaps zram memory to eswap, thus maintaining the memory buffer in a healthy state. 
Signed-off-by: CY Fan --- include/linux/memcg_policy.h | 7 + include/linux/mmzone.h | 6 + include/linux/swap.h | 3 + include/linux/vm_event_item.h | 14 + include/linux/zswapd.h | 98 ++++ include/trace/events/vmscan.h | 30 ++ mm/Kconfig | 10 + mm/Makefile | 1 + mm/internal.h | 16 +- mm/memcontrol.c | 10 +- mm/memory_hotplug.c | 7 + mm/page_alloc.c | 9 + mm/swapfile.c | 23 + mm/vmscan.c | 14 +- mm/vmstat.c | 14 + mm/zswapd.c | 882 ++++++++++++++++++++++++++++++++++ mm/zswapd_control.c | 878 +++++++++++++++++++++++++++++++++ mm/zswapd_internal.h | 41 ++ 18 files changed, 2049 insertions(+), 14 deletions(-) create mode 100644 include/linux/zswapd.h create mode 100644 mm/zswapd.c create mode 100644 mm/zswapd_control.c create mode 100644 mm/zswapd_internal.h diff --git a/include/linux/memcg_policy.h b/include/linux/memcg_policy.h index 8dd856ead095..201b0e973e3c 100644 --- a/include/linux/memcg_policy.h +++ b/include/linux/memcg_policy.h @@ -32,6 +32,13 @@ void memcg_app_score_update(struct mem_cgroup *target); struct memcg_reclaim { atomic64_t app_score; atomic64_t ub_ufs2zram_ratio; +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_t ub_zram2ufs_ratio; + atomic_t ub_mem2zram_ratio; + atomic_t refault_threshold; + /* anon refault */ + unsigned long long reclaimed_pagefault; +#endif }; #define MAX_APP_SCORE 1000 #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b73098363526..855a598ff674 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -763,6 +763,12 @@ typedef struct pglist_data { int kswapd_failures; /* Number of 'reclaimed == 0' runs */ +#ifdef CONFIG_HYPERHOLD_ZSWAPD + wait_queue_head_t zswapd_wait; + atomic_t zswapd_wait_flag; + struct task_struct *zswapd; +#endif + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; diff --git a/include/linux/swap.h b/include/linux/swap.h index 970062c26311..517ab5adb973 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -458,6 +458,9 @@ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern bool free_swap_is_low(void); +#endif /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c64c3d73893c..add63d0bc703 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -121,6 +121,20 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, SWAP_RA, SWAP_RA_HIT, #endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + ZSWAPD_WAKEUP, + ZSWAPD_REFAULT, + ZSWAPD_MEDIUM_PRESS, + ZSWAPD_CRITICAL_PRESS, + ZSWAPD_MEMCG_RATIO_SKIP, + ZSWAPD_MEMCG_REFAULT_SKIP, + ZSWAPD_SWAPOUT, + ZSWAPD_EMPTY_ROUND, + ZSWAPD_EMPTY_ROUND_SKIP_TIMES, + ZSWAPD_SNAPSHOT_TIMES, + ZSWAPD_RECLAIMED, + ZSWAPD_SCANNED, +#endif #ifdef CONFIG_HYPERHOLD_MEMCG FREEZE_RECLAIMED, FREEZE_RECLAIME_COUNT, diff --git a/include/linux/zswapd.h b/include/linux/zswapd.h new file mode 100644 index 000000000000..44cd060b12e4 --- /dev/null +++ b/include/linux/zswapd.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/zswapd.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZSWAPD_H +#define _ZSWAPD_H + +enum { + CACHE_SIZE, + SWAP_SIZE, + CACHE_PAGE, + SWAP_PAGE, + CACHE_FAULT, + SWAP_FAULT, + READ_SIZE, + WRITE_SIZE, +}; + +struct group_swap_ops { + u64 (*group_read)(u16 gid, u64 req_size, void *priv); + u64 (*group_write)(u16 gid, u64 req_size, void *priv); + u64 (*group_data_size)(u16 gid, int type, void *priv); +}; + +struct group_swap_device { + void *priv; + struct group_swap_ops *ops; + struct list_head list; +}; + +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern int zswapd_run(int nid); +extern void zswapd_stop(int nid); +extern void wakeup_zswapd(pg_data_t *pgdat); +extern bool zram_watermark_ok(void); +extern void zswapd_status_show(struct seq_file *m); +extern void wake_all_zswapd(void); +extern void set_snapshotd_init_flag(unsigned int val); +extern pid_t get_zswapd_pid(void); +extern unsigned long long get_free_swap_threshold(void); +extern struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv); +extern void unregister_group_swap(struct group_swap_device *gsdev); +extern void memcg_eswap_info_show(struct seq_file *m); +#else +static inline int zswap_run(int nid) +{ + return 0; +} + +static inline void zswapd_stop(int nid) +{ +} + +static inline void wakeup_zswapd(pg_data_t *pgdat) +{ +} + +static inline bool zram_watermark_ok(void) +{ + return true; +} + +static inline void zswapd_status_show(struct seq_file *m) +{ +} + +static inline void wake_all_zswapd(void) +{ +} + +static inline void set_snapshotd_init_flag(unsigned int val) +{ +} + +static inline pid_t get_zswapd_pid(void) +{ + return -EINVAL; +} + +static inline u64 get_free_swap_threshold(void) +{ + return 0; +} + +static struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + return NULL; +} + +static void unregister_group_swap(struct group_swap_device *gsdev) +{ +} +#endif + +#endif /* _LINUX_ZSWAPD_H */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 2070df64958e..a71ba5860e56 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -336,6 +336,36 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +TRACE_EVENT(mm_vmscan_lru_zswapd_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_deactivated, int priority), + + TP_ARGS(nid, nr_taken, nr_deactivated, priority), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_deactivated) + __field(int, priority) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_deactivated = nr_deactivated; + __entry->priority = priority; + ), + + TP_printk("nid=%d nr_taken=%ld nr_deactivated=%ld priority=%d", + __entry->nid, + __entry->nr_taken, + __entry->nr_deactivated, + __entry->priority) +); +#endif + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/mm/Kconfig b/mm/Kconfig index 27d50ee6e19f..6760018a1c8c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -80,6 +80,16 @@ config HYPERHOLD_MEMCG Add more attributes in memory cgroup, these attribute is used to show information, shrink memory, swapin page and so on. +config HYPERHOLD_ZSWAPD + bool "Enable zswapd thread to reclaim anon pages in background" + depends on HYPERHOLD + default n + help + zswapd is a kernel thread that reclaim anonymous pages in the + background. 
When the use of swap pages reaches the watermark + and the refault of anonymous pages is high, the content of + zram will exchanged to eswap by a certain percentage. + config DISCONTIGMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL diff --git a/mm/Makefile b/mm/Makefile index 6a80a70eff30..56abb804cc19 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,3 +122,4 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o +obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o diff --git a/mm/internal.h b/mm/internal.h index cb3d736d1856..ccdee4a0368d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -227,11 +227,17 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references); +extern unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, + struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references); +extern unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, + struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, + enum lru_list lru); +extern unsigned move_pages_to_lru(struct lruvec *lruvec, struct list_head *list); +extern void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc); /* * in mm/rmap.c: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47f5d8d61d2a..30e068e95e21 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -65,6 +65,7 @@ #include "slab.h" #include +#include #include @@ -4196,6 +4197,9 @@ static int memcg_stat_show(struct seq_file *m, void *v) } #endif +#ifdef CONFIG_HYPERHOLD_DEBUG + memcg_eswap_info_show(m); +#endif return 0; } @@ -5402,7 +5406,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_HYPERHOLD_MEMCG atomic64_set(&memcg->memcg_reclaimed.app_score, 300); #endif - +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); +#endif page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6275b1c05f11..5da1c0299456 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -851,6 +852,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_run(nid); +#endif writeback_set_ratelimit(); @@ -1600,6 +1604,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_stop(node); +#endif } writeback_set_ratelimit(); diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index 843bf87e12a1..15d25006cfa0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4924,6 +4925,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_zswapd(); +#endif + if (should_fail_alloc_page(gfp_mask, order)) return false; @@ -6928,6 +6934,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + init_waitqueue_head(&pgdat->zswapd_wait); +#endif pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 5af6b0f770de..181cfc1b1296 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -43,6 +43,7 @@ #include #include #include +#include static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); @@ -3441,6 +3442,28 @@ void si_swapinfo(struct sysinfo *val) spin_unlock(&swap_lock); } +#ifdef CONFIG_HYPERHOLD_ZSWAPD +bool free_swap_is_low(void) +{ + unsigned int type; + unsigned long long freeswap = 0; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; + } + freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + spin_unlock(&swap_lock); + + return (freeswap < get_free_swap_threshold()); +} +EXPORT_SYMBOL(free_swap_is_low); +#endif + /* * Verify that a swap entry is valid and increment its swap map count. * diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c1c2c9d66fc..86da03e277c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,7 +1572,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -1767,8 +1767,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * Returns the number of pages moved to the given lruvec. */ -static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +unsigned move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; @@ -1857,9 +1856,8 @@ int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number * of reclaimed pages */ -static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1962,7 +1960,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } -static void shrink_active_list(unsigned long nr_to_scan, +void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) @@ -2378,7 +2376,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } } -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; diff --git a/mm/vmstat.c b/mm/vmstat.c index b362b22b30d8..a03aa6b3e4dc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1350,6 +1350,20 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + "zswapd_running", + "zswapd_hit_refaults", + "zswapd_medium_press", + "zswapd_critical_press", + "zswapd_memcg_ratio_skip", + "zswapd_memcg_refault_skip", + "zswapd_swapout", + "zswapd_empty_round", + "zswapd_empty_round_skip_times", + "zswapd_snapshot_times", + "zswapd_reclaimed", + "zswapd_scanned", +#endif #ifdef CONFIG_HYPERHOLD_MEMCG "freeze_reclaimed", "freeze_reclaim_count", diff --git a/mm/zswapd.c b/mm/zswapd.c new file mode 100644 index 000000000000..577d97974229 --- /dev/null +++ b/mm/zswapd.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" +#include "internal.h" + +#define UNSET_ZRAM_WM_RATIO 0 +#define DEFAULT_ZRAM_WM_RATIO 37 +#define SWAP_MORE_ZRAM (50 * (SZ_1M)) + +static wait_queue_head_t snapshotd_wait; +static atomic_t snapshotd_wait_flag; +static atomic_t snapshotd_init_flag = ATOMIC_INIT(0); +static struct task_struct *snapshotd_task; + +static pid_t zswapd_pid = -1; +static unsigned long long last_anon_pagefault; +static unsigned long long anon_refault_ratio; +static unsigned long long zswapd_skip_interval; +static unsigned long last_zswapd_time; +static unsigned long last_snapshot_time; +bool last_round_is_empty; + + +DECLARE_RWSEM(gs_lock); +LIST_HEAD(gs_list); + +void unregister_group_swap(struct group_swap_device *gsdev) +{ + down_write(&gs_lock); + list_del(&gsdev->list); + up_write(&gs_lock); + + kfree(gsdev); +} +EXPORT_SYMBOL(unregister_group_swap); + +struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL); + + if (!gsdev) + return NULL; + + gsdev->priv = priv; + gsdev->ops = ops; + + down_write(&gs_lock); + list_add(&gsdev->list, &gs_list); + up_write(&gs_lock); + + return gsdev; +} +EXPORT_SYMBOL(register_group_swap); + +u64 memcg_data_size(struct mem_cgroup *memcg, int type) +{ + struct group_swap_device *gsdev = NULL; + u64 size = 0; + + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) + size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv); + up_read(&gs_lock); + + return size; +} + +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 read_size = 0; + u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); + struct group_swap_device *gsdev = NULL; + + if (req_size > swap_size * ratio) + req_size = swap_size * ratio; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + read_size += gsdev->ops->group_write(memcg->id.id, req_size - read_size, + gsdev->priv); + if (read_size >= req_size) + break; + } + up_read(&gs_lock); + + return read_size; +} + +static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 cache_size = memcg_data_size(memcg, CACHE_SIZE); + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 all_size = cache_size + swap_size; + u64 write_size = 0; + u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio); + struct group_swap_device *gsdev = NULL; + + if (all_size * ratio <= swap_size) + return 0; + if (req_size > all_size * ratio - swap_size) + req_size = all_size * ratio - swap_size; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size, + gsdev->priv); + if (write_size >= req_size) + break; + } + up_read(&gs_lock); + + return write_size; +} + +static u64 swapout(u64 req_size) +{ + struct mem_cgroup *memcg = NULL; + u64 write_size = 0; + + while ((memcg = get_next_memcg(memcg))) { + write_size += swapout_memcg(memcg, req_size - write_size); + if (write_size >= req_size) + break; + } + + return write_size; +} + +static unsigned long long get_zram_used_pages(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long zram_pages = 0; + + while ((memcg = get_next_memcg(memcg))) + zram_pages += memcg_data_size(memcg, CACHE_PAGE); + + return zram_pages; +} + +static unsigned long long get_eswap_used_pages(void) +{ + struct 
mem_cgroup *memcg = NULL; + unsigned long long eswap_pages = 0; + + while ((memcg = get_next_memcg(memcg))) + eswap_pages += memcg_data_size(memcg, SWAP_PAGE); + + return eswap_pages; +} + +static unsigned long long get_zram_pagefault(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long cache_fault = 0; + + while ((memcg = get_next_memcg(memcg))) + cache_fault += memcg_data_size(memcg, CACHE_FAULT); + + return cache_fault; +} + +static unsigned int calc_sys_cur_avail_buffers(void) +{ + const unsigned int percent_constant = 100; + unsigned long freemem; + unsigned long active_file; + unsigned long inactive_file; + unsigned long inactive_anon; + unsigned long buffers; + + freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K; + active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_anon = global_node_page_state(NR_INACTIVE_ANON) * PAGE_SIZE / SZ_1K; + + buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant + + active_file * get_active_file_ratio() / percent_constant; + + return (buffers * SZ_1K / SZ_1M); /* kb to mb */ +} + +void zswapd_status_show(struct seq_file *m) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + seq_printf(m, "buffer_size:%u\n", buffers); + seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio); +} + +pid_t get_zswapd_pid(void) +{ + return zswapd_pid; +} + +static bool min_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_min_avail_buffers()) + return true; + + return false; +} + +static bool buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_avail_buffers()) + return true; + + return false; +} + +static bool high_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_high_avail_buffers()) + return true; + + return false; +} + +static void snapshot_anon_refaults(void) +{ + struct mem_cgroup *memcg = NULL; + + while (memcg = get_next_memcg(memcg)) + memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT); + + last_anon_pagefault = get_zram_pagefault(); + last_snapshot_time = jiffies; +} + +/* + * Return true if refault changes between two read operations. 
+ */ +static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) +{ + const unsigned int percent_constant = 100; + unsigned long long anon_pagefault; + unsigned long anon_total; + unsigned long long ratio; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + + if (!memcg) + return false; + + anon_pagefault = memcg_data_size(memcg, CACHE_FAULT); + if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault) + return false; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return false; + + lruvec = &mz->lruvec; + if (!lruvec) + return false; + + anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) + + memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE); + + ratio = (anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) * + percent_constant / (anon_total + 1); + if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold)) + return true; + + return false; +} + +static bool get_area_anon_refault_status(void) +{ + const unsigned int percent_constant = 1000; + unsigned long long anon_pagefault; + unsigned long long ratio; + unsigned long long time; + + anon_pagefault = get_zram_pagefault(); + time = jiffies; + if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time) + return false; + + ratio = (anon_pagefault - last_anon_pagefault) * percent_constant / + (jiffies_to_msecs(time - last_snapshot_time) + 1); + anon_refault_ratio = ratio; + + if (ratio > get_area_anon_refault_threshold()) + return true; + + return false; +} + +void wakeup_snapshotd(void) +{ + unsigned long snapshot_interval; + + snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time); + if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) { + atomic_set(&snapshotd_wait_flag, 1); + wake_up_interruptible(&snapshotd_wait); + } +} + +static int snapshotd(void *p) +{ + int ret; + + while (!kthread_should_stop()) { + ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag)); + if (ret) + continue; + + atomic_set(&snapshotd_wait_flag, 0); + + snapshot_anon_refaults(); + count_vm_event(ZSWAPD_SNAPSHOT_TIMES); + } + + return 0; +} + +void set_snapshotd_init_flag(unsigned int val) +{ + atomic_set(&snapshotd_init_flag, val); +} + +/* + * This snapshotd start function will be called by init. 
+ */ +int snapshotd_run(void) +{ + atomic_set(&snapshotd_wait_flag, 0); + init_waitqueue_head(&snapshotd_wait); + + snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd"); + if (IS_ERR(snapshotd_task)) { + pr_err("Failed to start snapshotd\n"); + return PTR_ERR(snapshotd_task); + } + + return 0; +} + +static int __init snapshotd_init(void) +{ + snapshotd_run(); + + return 0; +} +module_init(snapshotd_init); + +static int get_zswapd_eswap_policy(void) +{ + if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO) + return CHECK_BUFFER_ONLY; + else + return CHECK_BUFFER_ZRAMRATIO_BOTH; +} + +static unsigned int get_policy_zram_wm_ratio(void) +{ + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ONLY) + return DEFAULT_ZRAM_WM_RATIO; + else + return get_zram_wm_ratio(); +} + +int get_zram_current_watermark(void) +{ + long long diff_buffers; + const unsigned int percent_constant = 10; + u64 nr_total; + unsigned int zram_wm_ratio = get_policy_zram_wm_ratio(); + + nr_total = totalram_pages(); + /* B_target - B_current */ + diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers(); + /* MB to page */ + diff_buffers *= SZ_1M / PAGE_SIZE; + /* after_comp to before_comp */ + diff_buffers *= get_compress_ratio(); + /* page to ratio */ + diff_buffers = diff_buffers * percent_constant / nr_total; + + return min(zram_wm_ratio, zram_wm_ratio - diff_buffers); +} + +bool zram_watermark_ok(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = totalram_pages() * ratio / percent_constant; + if (nr_zram_used > nr_wm) + return true; + + return false; +} + +bool zram_watermark_exceed(void) +{ + u64 nr_zram_used; + const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE); + + if (!nr_wm) + return false; + + nr_zram_used = get_zram_used_pages(); + if (nr_zram_used > nr_wm) + return true; + return false; +} + +void wakeup_zswapd(pg_data_t *pgdat) +{ + unsigned long interval; + + if (IS_ERR(pgdat->zswapd)) + return; + + if (!wq_has_sleeper(&pgdat->zswapd_wait)) + return; + + /* + * make anon pagefault snapshots + * wake up snapshotd + */ + if (atomic_read(&snapshotd_init_flag) == 1) + wakeup_snapshotd(); + + /* wake up when the buffer is lower than min_avail_buffer */ + if (min_buffer_is_suitable()) + return; + + interval = jiffies_to_msecs(jiffies - last_zswapd_time); + if (interval < zswapd_skip_interval) { + count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES); + return; + } + + atomic_set(&pgdat->zswapd_wait_flag, 1); + wake_up_interruptible(&pgdat->zswapd_wait); +} + +void wake_all_zswapd(void) +{ + pg_data_t *pgdat = NULL; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + wakeup_zswapd(pgdat); + } +} + +static void zswapd_shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) +{ + unsigned int nr_deactivate; + unsigned long nr_scanned; + unsigned long nr_taken; + + struct page *page = NULL; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost; + unsigned long *anon_cost = &lruvec->anon_cost; + LIST_HEAD(l_inactive); + LIST_HEAD(l_hold); + + lru_add_drain(); + + spin_lock_irq(&pgdat->lru_lock); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken); + *anon_cost += nr_taken; + 
*node_anon_cost += nr_taken; + __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!page_evictable(page))) { + putback_lru_page(page); + continue; + } + + ClearPageActive(page); + SetPageWorkingset(page); + list_add(&page->lru, &l_inactive); + } + + spin_lock_irq(&pgdat->lru_lock); + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); + + mem_cgroup_uncharge_list(&l_inactive); + free_unref_page_list(&l_inactive); + + trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken, + nr_deactivate, sc->priority); +} + +static unsigned long zswapd_shrink_list(enum lru_list lru, + unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + +static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed = 0; + unsigned long nr_to_scan; + struct blk_plug plug; + enum lru_list lru; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += zswapd_shrink_list(lru, + nr_to_scan, lruvec, sc); + } + } + } + + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; +} + +static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) +{ + const unsigned int percent_constant = 100; + struct mem_cgroup *memcg = NULL; + unsigned long nr[NR_LRU_LISTS]; + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) { + get_next_memcg_break(memcg); + break; + } + + if (get_memcg_anon_refault_status(memcg)) { + count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP); + continue; + } + + nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr_zram = memcg_data_size(memcg, CACHE_PAGE); + nr_eswap = memcg_data_size(memcg, SWAP_PAGE); + + zram_ratio = (nr_zram + nr_eswap) * percent_constant / + (nr_inactive + nr_active + nr_zram + nr_eswap + 1); + if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) { + count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP); + continue; + } + + nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority; + nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority; + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + +#ifdef CONFIG_HYPERHOLD_FILE_LRU + zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr); +#else + shrink_lruvec(lruvec, sc); +#endif + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + get_next_memcg_break(memcg); + break; + } + } + + return sc->nr_scanned >= 
sc->nr_to_reclaim; +} + +static u64 __calc_nr_to_reclaim(void) +{ + unsigned int buffers; + unsigned int high_buffers; + unsigned int max_reclaim_size; + u64 reclaim_size = 0; + + high_buffers = get_high_avail_buffers(); + buffers = calc_sys_cur_avail_buffers(); + max_reclaim_size = get_zswapd_max_reclaim_size(); + if (buffers < high_buffers) + reclaim_size = high_buffers - buffers; + + /* once max reclaim target is max_reclaim_size */ + reclaim_size = min(reclaim_size, max_reclaim_size); + + /* MB to pages */ + return reclaim_size * SZ_1M / PAGE_SIZE; +} + +static void zswapd_shrink_node(pg_data_t *pgdat) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY / 2, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + const unsigned int increase_rate = 2; + + do { + unsigned long nr_reclaimed = sc.nr_reclaimed; + bool raise_priority = true; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) + break; + + sc.nr_scanned = 0; + sc.nr_to_reclaim = __calc_nr_to_reclaim(); + + if (zswapd_shrink_anon(pgdat, &sc)) + raise_priority = false; + count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned); + count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed); + if (try_to_freeze() || kthread_should_stop()) + break; + + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1); + + /* + * When meets the first empty round, set the interval to t. + * If the following round is still empty, set the intervall + * to 2t. If the round is always empty, then 4t, 8t, and so on. + * But make sure the interval is not more than the max_skip_interval. + * Once a non-empty round occurs, reset the interval to 0. 
+ */ + if (sc.nr_reclaimed < get_empty_round_check_threshold()) { + count_vm_event(ZSWAPD_EMPTY_ROUND); + if (last_round_is_empty) + zswapd_skip_interval = min(zswapd_skip_interval * + increase_rate, get_max_skip_interval()); + else + zswapd_skip_interval = get_empty_round_skip_interval(); + last_round_is_empty = true; + } else { + zswapd_skip_interval = 0; + last_round_is_empty = false; + } +} + +u64 zram_watermark_diff(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = totalram_pages() * ratio / percent_constant; + if (nr_zram_used > nr_wm) + return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM; + + return 0; +} + +u64 zswapd_buffer_diff(void) +{ + u64 buffers; + u64 avail; + + buffers = calc_sys_cur_avail_buffers(); + avail = get_high_avail_buffers(); + if (buffers < avail) + return (avail - buffers) * SZ_1M; + + return 0; +} + +u64 get_do_eswap_size(bool refault) +{ + u64 size = 0; + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH) + size = max(zram_watermark_diff(), zswapd_buffer_diff()); + else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault)) + size = zswapd_buffer_diff(); + + return size; +} + +static int zswapd(void *p) +{ + struct task_struct *tsk = current; + pg_data_t *pgdat = (pg_data_t *)p; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + /* save zswapd pid for schedule strategy */ + zswapd_pid = tsk->pid; + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + while (!kthread_should_stop()) { + bool refault = false; + u64 size = 0; + + (void)wait_event_freezable(pgdat->zswapd_wait, + atomic_read(&pgdat->zswapd_wait_flag)); + atomic_set(&pgdat->zswapd_wait_flag, 0); + count_vm_event(ZSWAPD_WAKEUP); + zswapd_pressure_report(LEVEL_LOW); + + if (get_area_anon_refault_status()) { + refault = true; + count_vm_event(ZSWAPD_REFAULT); + goto do_eswap; + } + + zswapd_shrink_node(pgdat); + last_zswapd_time = jiffies; + +do_eswap: + size = get_do_eswap_size(refault); + if (size >= SZ_1M) { + count_vm_event(ZSWAPD_SWAPOUT); + size = swapout(size); + } + + if (!buffer_is_suitable()) { + if (free_swap_is_low() || zram_watermark_exceed()) { + zswapd_pressure_report(LEVEL_CRITICAL); + count_vm_event(ZSWAPD_CRITICAL_PRESS); + pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__, + get_zram_used_pages(), get_eswap_used_pages()); + } else { + zswapd_pressure_report(LEVEL_MEDIUM); + count_vm_event(ZSWAPD_MEDIUM_PRESS); + } + } + } + + return 0; +} + +/* + * This zswapd start function will be called by init and node-hot-add. + */ +int zswapd_run(int nid) +{ + const unsigned int priority_less = 5; + struct sched_param param = { + .sched_priority = MAX_PRIO - priority_less, + }; + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->zswapd) + return 0; + + atomic_set(&pgdat->zswapd_wait_flag, 0); + pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid); + if (IS_ERR(pgdat->zswapd)) { + pr_err("Failed to start zswapd on node %d\n", nid); + return PTR_ERR(pgdat->zswapd); + } + + sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, ¶m); + set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority)); + wake_up_process(pgdat->zswapd); + + return 0; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). 
+ */ +void zswapd_stop(int nid) +{ + struct task_struct *zswapd = NODE_DATA(nid)->zswapd; + + if (zswapd) { + kthread_stop(zswapd); + NODE_DATA(nid)->zswapd = NULL; + } + + zswapd_pid = -1; +} + +/* + * It's optimal to keep kswapds on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes away, + * we get changed to run anywhere: as the first one comes back, restore + * their cpu bindings. + */ +static int zswapd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->zswapd, mask); + } + + return 0; +} + +static int __init zswapd_init(void) +{ + int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online", + zswapd_cpu_online, NULL); + if (ret < 0) { + pr_err("zswapd: failed to register hotplug callbacks.\n"); + return ret; + } + + for_each_node_state(nid, N_MEMORY) + zswapd_run(nid); + + return 0; +} +module_init(zswapd_init) diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c new file mode 100644 index 000000000000..934eff21f09b --- /dev/null +++ b/mm/zswapd_control.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" + +#define ANON_REFAULT_SNAPSHOT_MIN_INTERVAL 200 +#define AREA_ANON_REFAULT_THRESHOLD 22000 +#define EMPTY_ROUND_CHECK_THRESHOLD 10 +#define EMPTY_ROUND_SKIP_INTERVAL 20 +#define ZSWAPD_MAX_LEVEL_NUM 10 +#define MAX_SKIP_INTERVAL 1000 +#define MAX_RECLAIM_SIZE 100 + +#define INACTIVE_FILE_RATIO 90 +#define ACTIVE_FILE_RATIO 70 +#define COMPRESS_RATIO 30 +#define ZRAM_WM_RATIO 0 +#define MAX_RATIO 100 + +struct zswapd_param { + unsigned int min_score; + unsigned int max_score; + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; +}; + +static struct zswapd_param zswap_param[ZSWAPD_MAX_LEVEL_NUM]; +struct eventfd_ctx *zswapd_press_efd[LEVEL_COUNT]; +static DEFINE_MUTEX(pressure_event_lock); +static DEFINE_MUTEX(reclaim_para_lock); + +atomic_t avail_buffers = ATOMIC_INIT(0); +atomic_t min_avail_buffers = ATOMIC_INIT(0); +atomic_t high_avail_buffers = ATOMIC_INIT(0); +atomic_t max_reclaim_size = ATOMIC_INIT(MAX_RECLAIM_SIZE); + +atomic_t inactive_file_ratio = ATOMIC_INIT(INACTIVE_FILE_RATIO); +atomic_t active_file_ratio = ATOMIC_INIT(ACTIVE_FILE_RATIO); +atomic_t zram_wm_ratio = ATOMIC_INIT(ZRAM_WM_RATIO); +atomic_t compress_ratio = ATOMIC_INIT(COMPRESS_RATIO); + +atomic64_t zram_critical_threshold = ATOMIC_LONG_INIT(0); +atomic64_t free_swap_threshold = ATOMIC_LONG_INIT(0); +atomic64_t area_anon_refault_threshold = ATOMIC_LONG_INIT(AREA_ANON_REFAULT_THRESHOLD); +atomic64_t anon_refault_snapshot_min_interval = + ATOMIC_LONG_INIT(ANON_REFAULT_SNAPSHOT_MIN_INTERVAL); +atomic64_t empty_round_skip_interval = ATOMIC_LONG_INIT(EMPTY_ROUND_SKIP_INTERVAL); +atomic64_t max_skip_interval = ATOMIC_LONG_INIT(MAX_SKIP_INTERVAL); +atomic64_t empty_round_check_threshold = ATOMIC_LONG_INIT(EMPTY_ROUND_CHECK_THRESHOLD); + +inline unsigned int get_zram_wm_ratio(void) +{ + return atomic_read(&zram_wm_ratio); +} + +inline unsigned int get_compress_ratio(void) +{ + return atomic_read(&compress_ratio); +} + 
+inline unsigned int get_inactive_file_ratio(void) +{ + return atomic_read(&inactive_file_ratio); +} + +inline unsigned int get_active_file_ratio(void) +{ + return atomic_read(&active_file_ratio); +} + +inline unsigned int get_avail_buffers(void) +{ + return atomic_read(&avail_buffers); +} + +inline unsigned int get_min_avail_buffers(void) +{ + return atomic_read(&min_avail_buffers); +} + +inline unsigned int get_high_avail_buffers(void) +{ + return atomic_read(&high_avail_buffers); +} + +inline unsigned int get_zswapd_max_reclaim_size(void) +{ + return atomic_read(&max_reclaim_size); +} + +inline unsigned long long get_free_swap_threshold(void) +{ + return atomic64_read(&free_swap_threshold); +} + +inline unsigned long long get_area_anon_refault_threshold(void) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +inline unsigned long long get_anon_refault_snapshot_min_interval(void) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +inline unsigned long long get_empty_round_skip_interval(void) +{ + return atomic64_read(&empty_round_skip_interval); +} + +inline unsigned long long get_max_skip_interval(void) +{ + return atomic64_read(&max_skip_interval); +} + +inline unsigned long long get_empty_round_check_threshold(void) +{ + return atomic64_read(&empty_round_check_threshold); +} + +inline unsigned long long get_zram_critical_threshold(void) +{ + return atomic64_read(&zram_critical_threshold); +} + +static ssize_t avail_buffers_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned long long threshold; + unsigned int high_buffers; + unsigned int min_buffers; + unsigned int buffers; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u %llu", &buffers, &min_buffers, &high_buffers, &threshold) != 4) + return -EINVAL; + + atomic_set(&avail_buffers, buffers); + atomic_set(&min_avail_buffers, min_buffers); + atomic_set(&high_avail_buffers, high_buffers); + atomic64_set(&free_swap_threshold, (threshold * (SZ_1M / PAGE_SIZE))); + + if (atomic_read(&min_avail_buffers) == 0) + set_snapshotd_init_flag(0); + else + set_snapshotd_init_flag(1); + + wake_all_zswapd(); + + return nbytes; +} + +static ssize_t zswapd_max_reclaim_size_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + u32 max; + int ret; + + buf = strstrip(buf); + ret = kstrtouint(buf, 10, &max); + if (ret) + return -EINVAL; + + atomic_set(&max_reclaim_size, max); + + return nbytes; +} + +static ssize_t buffers_ratio_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int inactive; + unsigned int active; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u", &inactive, &active) != 2) + return -EINVAL; + + if (inactive > MAX_RATIO || active > MAX_RATIO) + return -EINVAL; + + atomic_set(&inactive_file_ratio, inactive); + atomic_set(&active_file_ratio, active); + + return nbytes; +} + +static int area_anon_refault_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&area_anon_refault_threshold, val); + + return 0; +} + +static int empty_round_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_skip_interval, val); + + return 0; +} + +static int max_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&max_skip_interval, val); + + return 0; +} + +static int empty_round_check_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 
val) +{ + atomic64_set(&empty_round_check_threshold, val); + + return 0; +} + +static int anon_refault_snapshot_min_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&anon_refault_snapshot_min_interval, val); + + return 0; +} + +static int zram_critical_thres_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&zram_critical_threshold, val); + + return 0; +} + +static ssize_t zswapd_pressure_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int level; + unsigned int efd; + struct fd efile; + int ret; + + buf = strstrip(buf); + if (sscanf(buf, "%u %u", &efd, &level) != 2) + return -EINVAL; + + if (level >= LEVEL_COUNT) + return -EINVAL; + + mutex_lock(&pressure_event_lock); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out; + } + + zswapd_press_efd[level] = eventfd_ctx_fileget(efile.file); + if (IS_ERR(zswapd_press_efd[level])) { + ret = PTR_ERR(zswapd_press_efd[level]); + goto out_put_efile; + } + fdput(efile); + mutex_unlock(&pressure_event_lock); + return nbytes; + +out_put_efile: + fdput(efile); +out: + mutex_unlock(&pressure_event_lock); + + return ret; +} + +void zswapd_pressure_report(enum zswapd_pressure_level level) +{ + int ret; + + if (zswapd_press_efd[level] == NULL) + return; + + ret = eventfd_signal(zswapd_press_efd[level], 1); + if (ret < 0) + pr_err("SWAP-MM: %s : level:%u, ret:%d ", __func__, level, ret); +} + +static u64 zswapd_pid_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return get_zswapd_pid(); +} + +static void zswapd_memcgs_param_parse(int level_num) +{ + struct mem_cgroup *memcg = NULL; + u64 score; + int i; + + while ((memcg = get_next_memcg(memcg))) { + score = atomic64_read(&memcg->memcg_reclaimed.app_score); + for (i = 0; i < level_num; ++i) + if (score >= zswap_param[i].min_score && + score <= zswap_param[i].max_score) + break; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + zswap_param[i].ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + zswap_param[i].ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + zswap_param[i].refault_threshold); + } +} + +static ssize_t zswapd_memcgs_param_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + char *token = NULL; + int level_num; + int i; + + buf = strstrip(buf); + token = strsep(&buf, " "); + + if (!token) + return -EINVAL; + + if (kstrtoint(token, 0, &level_num)) + return -EINVAL; + + if (level_num > ZSWAPD_MAX_LEVEL_NUM) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + for (i = 0; i < level_num; ++i) { + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].min_score) || + zswap_param[i].min_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].max_score) || + zswap_param[i].max_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_mem2zram_ratio) || + zswap_param[i].ub_mem2zram_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_zram2ufs_ratio) || + zswap_param[i].ub_zram2ufs_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].refault_threshold)) + goto out; + } + + 
zswapd_memcgs_param_parse(level_num); + mutex_unlock(&reclaim_para_lock); + + return nbytes; + +out: + mutex_unlock(&reclaim_para_lock); + return -EINVAL; +} + +static ssize_t zswapd_single_memcg_param_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u", &ub_mem2zram_ratio, &ub_zram2ufs_ratio, + &refault_threshold) != 3) + return -EINVAL; + + if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + refault_threshold); + + return nbytes; +} + +static ssize_t mem_cgroup_zram_wm_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&zram_wm_ratio, ratio); + + return nbytes; +} + +static ssize_t mem_cgroup_compress_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&compress_ratio, ratio); + + return nbytes; +} + +static int zswapd_pressure_show(struct seq_file *m, void *v) +{ + zswapd_status_show(m); + + return 0; +} + +static int memcg_active_app_info_list_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long anon_size; + unsigned long zram_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + if (!strlen(memcg->name)) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s %llu %lu %lu %lu %llu\n", memcg->name, score, + anon_size, zram_size, eswap_size, + memcg->memcg_reclaimed.reclaimed_pagefault); + } + return 0; +} + +static int report_app_info_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long zram_size; + unsigned long anon_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + 
MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s, %llu, %lu, %lu, %lu\n", + strlen(memcg->name) ? memcg->name : "root", + score, anon_size, zram_size, eswap_size); + } + return 0; +} + +#ifdef CONFIG_HYPERHOLD_DEBUG +static int avail_buffers_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "avail_buffers: %u\n", atomic_read(&avail_buffers)); + seq_printf(m, "min_avail_buffers: %u\n", atomic_read(&min_avail_buffers)); + seq_printf(m, "high_avail_buffers: %u\n", atomic_read(&high_avail_buffers)); + seq_printf(m, "free_swap_threshold: %llu\n", + atomic64_read(&free_swap_threshold) * PAGE_SIZE / SZ_1M); + + return 0; +} + +static int zswapd_max_reclaim_size_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zswapd_max_reclaim_size: %u\n", + atomic_read(&max_reclaim_size)); + + return 0; +} + +static int buffers_ratio_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "inactive_file_ratio: %u\n", atomic_read(&inactive_file_ratio)); + seq_printf(m, "active_file_ratio: %u\n", atomic_read(&active_file_ratio)); + + return 0; +} + +static u64 area_anon_refault_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +static u64 empty_round_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_skip_interval); +} + +static u64 max_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&max_skip_interval); +} + +static u64 empty_round_check_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_check_threshold); +} + +static u64 anon_refault_snapshot_min_interval_read( + struct cgroup_subsys_state *css, struct cftype *cft) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +static u64 zram_critical_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&zram_critical_threshold); +} + +static int zswapd_memcgs_param_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ZSWAPD_MAX_LEVEL_NUM; ++i) { + seq_printf(m, "level %d min score: %u\n", i, + zswap_param[i].min_score); + seq_printf(m, "level %d max score: %u\n", i, + zswap_param[i].max_score); + seq_printf(m, "level %d ub_mem2zram_ratio: %u\n", i, + zswap_param[i].ub_mem2zram_ratio); + seq_printf(m, "level %d ub_zram2ufs_ratio: %u\n", i, + zswap_param[i].ub_zram2ufs_ratio); + seq_printf(m, "level %d refault_threshold: %u\n", i, + zswap_param[i].refault_threshold); + } + + return 0; +} + +static int zswapd_single_memcg_param_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "memcg score: %llu\n", + atomic64_read(&memcg->memcg_reclaimed.app_score)); + seq_printf(m, "memcg ub_mem2zram_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)); + seq_printf(m, "memcg ub_zram2ufs_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio)); + seq_printf(m, "memcg refault_threshold: %u\n", + atomic_read(&memcg->memcg_reclaimed.refault_threshold)); + + return 0; +} + +static int zram_wm_ratio_show(struct seq_file *m, void *v) +{ + 
seq_printf(m, "zram_wm_ratio: %u\n", atomic_read(&zram_wm_ratio)); + + return 0; +} + +static int compress_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "compress_ratio: %u\n", atomic_read(&compress_ratio)); + + return 0; +} +static int zswapd_vmstat_show(struct seq_file *m, void *v) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *vm_buf = NULL; + + vm_buf = kzalloc(sizeof(struct vm_event_state), GFP_KERNEL); + if (!vm_buf) + return -ENOMEM; + all_vm_events(vm_buf); + + seq_printf(m, "zswapd_wake_up:%lu\n", vm_buf[ZSWAPD_WAKEUP]); + seq_printf(m, "zswapd_area_refault:%lu\n", vm_buf[ZSWAPD_REFAULT]); + seq_printf(m, "zswapd_medium_press:%lu\n", vm_buf[ZSWAPD_MEDIUM_PRESS]); + seq_printf(m, "zswapd_critical_press:%lu\n", vm_buf[ZSWAPD_CRITICAL_PRESS]); + seq_printf(m, "zswapd_memcg_ratio_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_RATIO_SKIP]); + seq_printf(m, "zswapd_memcg_refault_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_REFAULT_SKIP]); + seq_printf(m, "zswapd_swapout:%lu\n", vm_buf[ZSWAPD_SWAPOUT]); + seq_printf(m, "zswapd_snapshot_times:%lu\n", vm_buf[ZSWAPD_SNAPSHOT_TIMES]); + seq_printf(m, "zswapd_reclaimed:%lu\n", vm_buf[ZSWAPD_RECLAIMED]); + seq_printf(m, "zswapd_scanned:%lu\n", vm_buf[ZSWAPD_SCANNED]); + + kfree(vm_buf); +#endif + + return 0; +} + +void memcg_eswap_info_show(struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon; + unsigned long file; + unsigned long zram; + unsigned long eswap; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return; + + lruvec = &mz->lruvec; + if (!lruvec) + return; + + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + zram = memcg_data_size(memcg, CACHE_SIZE) / SZ_1K; + eswap = memcg_data_size(memcg, SWAP_SIZE) / SZ_1K; + anon *= PAGE_SIZE / SZ_1K; + file *= PAGE_SIZE / SZ_1K; + seq_printf(m, "Anon:\t%12lu kB\nFile:\t%12lu kB\nzram:\t%12lu kB\nEswap:\t%12lu kB\n", + anon, file, zram, eswap); +} +#endif + +static struct cftype zswapd_policy_files[] = { + { + .name = "active_app_info_list", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = memcg_active_app_info_list_show, + }, + { + .name = "zram_wm_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_zram_wm_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zram_wm_ratio_show, +#endif + }, + { + .name = "compress_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_compress_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = compress_ratio_show, +#endif + }, + { + .name = "zswapd_pressure", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_pressure_event_control, + }, + { + .name = "zswapd_pid", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = zswapd_pid_read, + }, + { + .name = "avail_buffers", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = avail_buffers_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = avail_buffers_params_show, +#endif + }, + { + .name = "zswapd_max_reclaim_size", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_max_reclaim_size_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_max_reclaim_size_show, +#endif + }, + { + .name = "area_anon_refault_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = area_anon_refault_threshold_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = area_anon_refault_threshold_read, 
+#endif + }, + { + .name = "empty_round_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_skip_interval_read, +#endif + }, + { + .name = "max_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = max_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = max_skip_interval_read, +#endif + }, + { + .name = "empty_round_check_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_check_threshold_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_check_threshold_read, +#endif + }, + { + .name = "anon_refault_snapshot_min_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = anon_refault_snapshot_min_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = anon_refault_snapshot_min_interval_read, +#endif + }, + { + .name = "zswapd_memcgs_param", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_memcgs_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_memcgs_param_show, +#endif + }, + { + .name = "zswapd_single_memcg_param", + .write = zswapd_single_memcg_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_single_memcg_param_show, +#endif + }, + { + .name = "buffer_ratio_params", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = buffers_ratio_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = buffers_ratio_params_show, +#endif + }, + { + .name = "zswapd_pressure_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_pressure_show, + }, + { + .name = "zram_critical_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = zram_critical_thres_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = zram_critical_threshold_read, +#endif + }, + +#ifdef CONFIG_HYPERHOLD_DEBUG + { + .name = "zswapd_vmstat_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_vmstat_show, + }, +#endif + + { }, /* terminate */ +}; + +static int __init zswapd_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zswapd_policy_files)); + + return 0; +} +subsys_initcall(zswapd_policy_init); diff --git a/mm/zswapd_internal.h b/mm/zswapd_internal.h new file mode 100644 index 000000000000..1447882ae497 --- /dev/null +++ b/mm/zswapd_internal.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * mm/zswapd_internal.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZSWAPD_INTERNAL_H +#define _ZSWAPD_INTERNAL_H + +enum zswapd_pressure_level { + LEVEL_LOW = 0, + LEVEL_MEDIUM, + LEVEL_CRITICAL, + LEVEL_COUNT +}; + +enum zswapd_eswap_policy { + CHECK_BUFFER_ONLY = 0, + CHECK_BUFFER_ZRAMRATIO_BOTH +}; + +void zswapd_pressure_report(enum zswapd_pressure_level level); +inline unsigned int get_zram_wm_ratio(void); +inline unsigned int get_compress_ratio(void); +inline unsigned int get_avail_buffers(void); +inline unsigned int get_min_avail_buffers(void); +inline unsigned int get_high_avail_buffers(void); +inline unsigned int get_zswapd_max_reclaim_size(void); +inline unsigned int get_inactive_file_ratio(void); +inline unsigned int get_active_file_ratio(void); +inline unsigned long long get_area_anon_refault_threshold(void); +inline unsigned long long get_anon_refault_snapshot_min_interval(void); +inline unsigned long long get_empty_round_skip_interval(void); +inline unsigned long long get_max_skip_interval(void); +inline unsigned long long get_empty_round_check_threshold(void); +inline unsigned long long get_zram_critical_threshold(void); +u64 memcg_data_size(struct mem_cgroup *memcg, int type); +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size); + +#endif /* MM_ZSWAPD_INTERNAL_H */ -- Gitee From 4db23b1b6a8b74987c808de81d67bea0f7419522 Mon Sep 17 00:00:00 2001 From: xianghengliang Date: Tue, 8 Feb 2022 16:18:10 +0800 Subject: [PATCH 007/113] ohos inclusion category: feature issue: #I4SV6F CVE: NA Signed-off-by: xianghengliang ---------------------------------------------- change /sys/fs/hmdfs/xxx uid&git to system --- fs/hmdfs/comm/device_node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hmdfs/comm/device_node.c b/fs/hmdfs/comm/device_node.c index 54eaaf06f223..183f3b7172c5 100644 --- a/fs/hmdfs/comm/device_node.c +++ b/fs/hmdfs/comm/device_node.c @@ -1421,6 +1421,7 @@ int hmdfs_register_sysfs(const char *name, struct hmdfs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); ret = kobject_init_and_add(&sbi->kobj, &sbi_ktype, &hmdfs_kset->kobj, "%s", name); + sysfs_change_owner(&sbi->kobj, KUIDT_INIT(1000), KGIDT_INIT(1000)); mutex_unlock(&hmdfs_sysfs_mutex); if (ret) { -- Gitee From 0a9b9822606f0a10864da9cb0a9881a1b3b7bfed Mon Sep 17 00:00:00 2001 From: waterwin Date: Tue, 8 Feb 2022 21:18:30 +0800 Subject: [PATCH 008/113] hmdfs: Introduce authority to hmdfs ohos inclusion category: feature issue: #I4SW8O CVE: NA ---------------------------------------------- hmdfs manage permission by configfs, bundle access its bundle directory but other directory. 
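Concretely, the new configfs tree lets a userspace daemon create one item per bundle and store its ID there; get_bid() then resolves directory names to that ID when hmdfs fixes up ownership. The sketch below shows the expected sequence. The /config mount point, the bundle name and the numeric ID are illustrative assumptions; the "hmdfs" group and the "bid" attribute come from authority/config.c in this patch.

/* Illustrative only: publish a bundle id for "com.example.demo" through the
 * configfs interface added below.  Assumes configfs is mounted at /config.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/config/hmdfs/com.example.demo";
	char path[128];
	int fd;

	/* mkdir triggers hmdfs_make_bitem() and creates the config item */
	if (mkdir(dir, 0770) && errno != EEXIST)
		return 1;

	snprintf(path, sizeof(path), "%s/bid", dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return 1;

	/* stored by hmdfs_bid_store() into the hash table, later read back
	 * by get_bid() when ownership is fixed up */
	if (write(fd, "10010", 5) != 5) {
		close(fd);
		return 1;
	}
	return close(fd);
}

Removing the directory releases the item and drops the hash entry through hmdfs_config_bitem_release().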
Signed-off-by: qianjiaxing --- fs/hmdfs/Makefile | 1 + fs/hmdfs/authority/authentication.c | 94 +++---- fs/hmdfs/authority/authentication.h | 38 ++- fs/hmdfs/authority/config.c | 377 ++++++++++++++++++++++++++++ fs/hmdfs/inode_local.c | 8 +- fs/hmdfs/main.c | 7 + 6 files changed, 451 insertions(+), 74 deletions(-) create mode 100644 fs/hmdfs/authority/config.c diff --git a/fs/hmdfs/Makefile b/fs/hmdfs/Makefile index 25c3eef3dd9d..6f38c843664e 100644 --- a/fs/hmdfs/Makefile +++ b/fs/hmdfs/Makefile @@ -11,5 +11,6 @@ hmdfs-y += comm/connection.o comm/socket_adapter.o comm/transport.o hmdfs-$(CONFIG_HMDFS_FS_ENCRYPTION) += comm/crypto.o hmdfs-$(CONFIG_HMDFS_FS_PERMISSION) += authority/authentication.o +hmdfs-$(CONFIG_HMDFS_FS_PERMISSION) += authority/config.o hmdfs-$(CONFIG_FS_FAULT_INJECTION) += comm/fault_inject.o diff --git a/fs/hmdfs/authority/authentication.c b/fs/hmdfs/authority/authentication.c index 97d842147050..85ac3c96c5b1 100644 --- a/fs/hmdfs/authority/authentication.c +++ b/fs/hmdfs/authority/authentication.c @@ -48,9 +48,8 @@ const struct cred *hmdfs_override_fsids(bool is_recv_thread) if (!cred) return NULL; - cred->fsuid = MEDIA_RW_UID; - cred->fsgid = is_recv_thread ? - KGIDT_INIT((gid_t)AID_EVERYBODY) : MEDIA_RW_GID; + cred->fsuid = is_recv_thread ? SYSTEM_UID : USER_DATA_RW_UID; + cred->fsgid = is_recv_thread ? SYSTEM_GID : USER_DATA_RW_GID; old_cred = override_creds(cred); @@ -73,7 +72,8 @@ const struct cred *hmdfs_override_dir_fsids(struct inode *dir, switch (level) { case HMDFS_PERM_MNT: /* system : media_rw */ - cred->fsuid = SYSTEM_UID; + cred->fsuid = USER_DATA_RW_UID; + cred->fsgid = USER_DATA_RW_GID; perm = (hii->perm & HMDFS_DIR_TYPE_MASK) | level; break; case HMDFS_PERM_DFS: @@ -83,15 +83,12 @@ const struct cred *hmdfs_override_dir_fsids(struct inode *dir, * other : media_rw : media_rw **/ if (!strcmp(dentry->d_name.name, PKG_ROOT_NAME)) { - cred->fsuid = SYSTEM_UID; perm = HMDFS_DIR_DATA | level; - } else if (!strcmp(dentry->d_name.name, SYSTEM_NAME)) { - cred->fsuid = SYSTEM_UID; - perm = AUTH_SYSTEM | HMDFS_DIR_SYSTEM | level; } else { - cred->fsuid = MEDIA_RW_UID; perm = HMDFS_DIR_PUBLIC | level; } + cred->fsuid = USER_DATA_RW_UID; + cred->fsgid = USER_DATA_RW_GID; break; case HMDFS_PERM_PKG: if (is_data_dir(hii->perm)) { @@ -102,20 +99,25 @@ const struct cred *hmdfs_override_dir_fsids(struct inode *dir, * local uninstall. * Set appid + media_rw for local install. 
*/ - uid_t app_id = 0; + int bid = get_bid(dentry->d_name.name); - if (app_id != 0) - cred->fsuid = KUIDT_INIT(app_id); - else + if (bid != 0) { + cred->fsuid = KUIDT_INIT(bid); + cred->fsgid = KGIDT_INIT(bid); + } else { cred->fsuid = ROOT_UID; + cred->fsgid = ROOT_GID; + } perm = AUTH_PKG | HMDFS_DIR_PKG | level; } else { cred->fsuid = dir->i_uid; + cred->fsgid = dir->i_gid; perm = (hii->perm & AUTH_MASK) | HMDFS_DIR_DEFAULT | level; } break; case HMDFS_PERM_OTHER: cred->fsuid = dir->i_uid; + cred->fsgid = dir->i_gid; if (is_pkg_auth(hii->perm)) perm = AUTH_PKG | HMDFS_DIR_PKG_SUB | level; else @@ -127,7 +129,6 @@ const struct cred *hmdfs_override_dir_fsids(struct inode *dir, break; } - cred->fsgid = MEDIA_RW_GID; *_perm = perm; old_cred = override_creds(cred); @@ -312,27 +313,6 @@ static __u16 __inherit_perm_file(struct inode *parent) return perm; } -static void fixup_ownership(struct inode *child, struct dentry *lower_dentry, - uid_t uid) -{ - int err; - struct iattr newattrs; - - newattrs.ia_valid = ATTR_UID | ATTR_FORCE; - newattrs.ia_uid = KUIDT_INIT(uid); - if (!S_ISDIR(d_inode(lower_dentry)->i_mode)) - newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV; - - inode_lock(d_inode(lower_dentry)); - err = notify_change(lower_dentry, &newattrs, NULL); - inode_unlock(d_inode(lower_dentry)); - - if (!err) - child->i_uid = KUIDT_INIT(uid); - else - hmdfs_err("update PKG uid failed, err = %d", err); -} - static void fixup_ownership_user_group(struct inode *child, struct dentry *lower_dentry, uid_t uid, gid_t gid) { @@ -371,7 +351,7 @@ __u16 hmdfs_perm_inherit(struct inode *parent_inode, struct inode *child) void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, struct dentry *lower_dentry, const char *name) { - uid_t appid; + int bid; struct hmdfs_inode_info *info = hmdfs_i(child); if (info->perm == HMDFS_ALL_MASK) @@ -379,25 +359,22 @@ void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, switch (info->perm & HMDFS_DIR_TYPE_MASK) { case HMDFS_DIR_PKG: - appid = 0; - if (appid != child->i_uid.val) - fixup_ownership(child, lower_dentry, appid); + bid = get_bid(name); + if (bid != child->i_uid.val || bid != child->i_gid.val) + fixup_ownership_user_group(child, lower_dentry, bid, bid); break; case HMDFS_DIR_DATA: case HMDFS_FILE_PKG_SUB: case HMDFS_DIR_DEFAULT: case HMDFS_FILE_DEFAULT: + case HMDFS_DIR_PUBLIC: if (parent_inode->i_uid.val != child->i_uid.val || parent_inode->i_gid.val != child->i_gid.val) fixup_ownership_user_group(child, lower_dentry, parent_inode->i_uid.val, parent_inode->i_gid.val); break; - case HMDFS_DIR_PUBLIC: - fixup_ownership(child, lower_dentry, (uid_t)AID_MEDIA_RW); - - break; default: break; } @@ -416,7 +393,8 @@ void check_and_fixup_ownership_remote(struct inode *dir, switch (level) { case HMDFS_PERM_MNT: /* system : media_rw */ - dinode->i_uid = SYSTEM_UID; + dinode->i_uid = USER_DATA_RW_UID; + dinode->i_gid = USER_DATA_RW_GID; perm = (hii->perm & HMDFS_DIR_TYPE_MASK) | level; break; case HMDFS_PERM_DFS: @@ -426,17 +404,12 @@ void check_and_fixup_ownership_remote(struct inode *dir, * other : media_rw : media_rw **/ if (!strcmp(dentry->d_name.name, PKG_ROOT_NAME)) { - // "data" - dinode->i_uid = SYSTEM_UID; perm = HMDFS_DIR_DATA | level; - } else if (!strcmp(dentry->d_name.name, SYSTEM_NAME)) { - // "system" - dinode->i_uid = SYSTEM_UID; - perm = AUTH_SYSTEM | HMDFS_DIR_SYSTEM | level; } else { - dinode->i_uid = MEDIA_RW_UID; perm = HMDFS_DIR_PUBLIC | level; } + dinode->i_uid = USER_DATA_RW_UID; + dinode->i_gid = 
USER_DATA_RW_GID; break; case HMDFS_PERM_PKG: if (is_data_dir(hii->perm)) { @@ -447,20 +420,24 @@ void check_and_fixup_ownership_remote(struct inode *dir, * local uninstall. * Set appid + media_rw for local install. */ - uid_t app_id = 0; - - if (app_id != 0) - dinode->i_uid = KUIDT_INIT(app_id); - else + int bid = get_bid(dentry->d_name.name); + if (bid != 0) { + dinode->i_uid = KUIDT_INIT(bid); + dinode->i_gid = KGIDT_INIT(bid); + } else { dinode->i_uid = ROOT_UID; + dinode->i_gid = ROOT_GID; + } perm = AUTH_PKG | HMDFS_DIR_PKG | level; } else { dinode->i_uid = dir->i_uid; + dinode->i_gid = dir->i_gid; perm = (hii->perm & AUTH_MASK) | HMDFS_DIR_DEFAULT | level; } break; case HMDFS_PERM_OTHER: dinode->i_uid = dir->i_uid; + dinode->i_gid = dir->i_gid; if (is_pkg_auth(hii->perm)) perm = AUTH_PKG | HMDFS_DIR_PKG_SUB | level; else @@ -472,7 +449,6 @@ void check_and_fixup_ownership_remote(struct inode *dir, break; } - dinode->i_gid = MEDIA_RW_GID; dinfo->perm = perm; } @@ -481,6 +457,6 @@ void hmdfs_root_inode_perm_init(struct inode *root_inode) struct hmdfs_inode_info *hii = hmdfs_i(root_inode); hii->perm = HMDFS_DIR_ROOT | HMDFS_PERM_MNT; - set_inode_uid(root_inode, SYSTEM_UID); - set_inode_gid(root_inode, MEDIA_RW_GID); + set_inode_uid(root_inode, USER_DATA_RW_UID); + set_inode_gid(root_inode, USER_DATA_RW_GID); } diff --git a/fs/hmdfs/authority/authentication.h b/fs/hmdfs/authority/authentication.h index e8b7bed53fb9..26838e2e8128 100644 --- a/fs/hmdfs/authority/authentication.h +++ b/fs/hmdfs/authority/authentication.h @@ -23,24 +23,22 @@ struct cache_fs_override { #ifdef CONFIG_HMDFS_FS_PERMISSION -#define AID_ROOT 0 -#define AID_SYSTEM 1000 -#define AID_SDCARD_RW 1015 -#define AID_MEDIA_RW 1023 -#define AID_EVERYBODY 9997 +#define OID_ROOT 0 +#define OID_SYSTEM 1000 +#define OID_USER_DATA_RW 1008 /* copied from sdcardfs/multiuser.h */ -#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */ +#define BASE_USER_RANGE 200000 /* offset for uid ranges for each user */ #define HMDFS_PERM_XATTR "user.hmdfs.perm" -#define ROOT_UID KUIDT_INIT(AID_ROOT) -#define SYSTEM_UID KUIDT_INIT(AID_SYSTEM) -#define MEDIA_RW_UID KUIDT_INIT(AID_MEDIA_RW) +#define ROOT_UID KUIDT_INIT(OID_ROOT) +#define SYSTEM_UID KUIDT_INIT(OID_SYSTEM) +#define USER_DATA_RW_UID KUIDT_INIT(OID_USER_DATA_RW) -#define SYSTEM_GID KGIDT_INIT((gid_t) AID_SYSTEM) -#define MEDIA_RW_GID KGIDT_INIT(AID_MEDIA_RW) -#define SDCARD_RW_GID KGIDT_INIT(AID_SDCARD_RW) +#define ROOT_GID KGIDT_INIT(OID_ROOT) +#define SYSTEM_GID KGIDT_INIT(OID_SYSTEM) +#define USER_DATA_RW_GID KGIDT_INIT(OID_USER_DATA_RW) #define PKG_ROOT_NAME "data" #define SYSTEM_NAME "system" @@ -89,7 +87,7 @@ static inline bool is_perm_other(__u16 perm) static inline void hmdfs_check_cred(const struct cred *cred) { - if (cred->fsuid.val != AID_SYSTEM || cred->fsgid.val != AID_SYSTEM) + if (cred->fsuid.val != OID_SYSTEM || cred->fsgid.val != OID_SYSTEM) hmdfs_warning("uid is %u, gid is %u", cred->fsuid.val, cred->fsgid.val); } @@ -176,6 +174,13 @@ static inline bool is_system_auth(__u16 perm) #define HMDFS_ALL_MASK (HMDFS_MOUNT_POINT_MASK | AUTH_MASK | HMDFS_DIR_TYPE_MASK | HMDFS_PERM_MASK) +static inline kuid_t get_bid_from_uid(kuid_t uid) +{ + kuid_t bid; + + bid.val = uid.val % BASE_USER_RANGE; + return bid; +} static inline void set_inode_gid(struct inode *inode, kgid_t gid) { @@ -250,6 +255,9 @@ int hmdfs_override_dir_id_fs(struct cache_fs_override *or, void hmdfs_revert_dir_id_fs(struct cache_fs_override *or); void check_and_fixup_ownership_remote(struct 
inode *dir, struct dentry *dentry); +extern int get_bid(const char *bname); +extern int __init hmdfs_init_configfs(void); +extern void hmdfs_exit_configfs(void); #else @@ -323,6 +331,10 @@ void hmdfs_check_cred(const struct cred *cred) { } +static inline int get_bid(const char *bname) { return 0; } +static inline int __init hmdfs_init_configfs(void) { return 0; } +static inline void hmdfs_exit_configfs(void) {} + #endif /* CONFIG_HMDFS_FS_PERMISSION */ #endif diff --git a/fs/hmdfs/authority/config.c b/fs/hmdfs/authority/config.c new file mode 100644 index 000000000000..2daadd40704e --- /dev/null +++ b/fs/hmdfs/authority/config.c @@ -0,0 +1,377 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/hmdfs/comm/authority/config.c + * + * Copyright (c) 2020-2021 Huawei Device Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include "hmdfs.h" + +#define UID_ATTR_TYPE 0 +#define GID_ATTR_TYPE 1 + +static struct kmem_cache *hmdfs_bid_entry_cachep; + +struct hmdfs_bid_entry { + struct hlist_node node; + struct qstr str; + int id; +}; + +struct hmdfs_config_bitem { + struct config_item item; + struct qstr str; +}; + +static unsigned int make_hash(const char *name, unsigned int len) +{ + unsigned long hash; + + hash = init_name_hash(0); + while (len--) + hash = partial_name_hash(tolower(*name++), hash); + + return end_name_hash(hash); +} + +static struct qstr make_qstr(const char *name) +{ + struct qstr str; + str.name = name; + str.len = strlen(name); + str.hash = make_hash(str.name, str.len); + + return str; +} + +static struct hmdfs_bid_entry *alloc_bid_entry(const char *name, int id) +{ + struct hmdfs_bid_entry *bid_entry; + char *bid_entry_name; + + bid_entry = kmem_cache_alloc(hmdfs_bid_entry_cachep, GFP_KERNEL); + if (!bid_entry) { + bid_entry = ERR_PTR(-ENOMEM); + goto out; + } + + bid_entry_name = kstrdup(name, GFP_KERNEL); + if (!bid_entry_name) { + kmem_cache_free(hmdfs_bid_entry_cachep, bid_entry); + bid_entry = ERR_PTR(-ENOMEM); + goto out; + } + + INIT_HLIST_NODE(&bid_entry->node); + bid_entry->str = make_qstr(bid_entry_name); + bid_entry->id = id; +out: + return bid_entry; +} + +static void free_bid_entry(struct hmdfs_bid_entry *bid_entry) +{ + if (bid_entry == NULL) + return; + + kfree(bid_entry->str.name); + kmem_cache_free(hmdfs_bid_entry_cachep, bid_entry); +} + +static struct hmdfs_config_bitem *alloc_bitem(const char *name) +{ + struct hmdfs_config_bitem *bitem; + char *bitem_name; + + bitem = kzalloc(sizeof(*bitem), GFP_KERNEL); + if (!bitem) { + bitem = ERR_PTR(-ENOMEM); + goto out; + } + + bitem_name = kstrdup(name, GFP_KERNEL); + if (!bitem_name) { + kfree(bitem); + bitem = ERR_PTR(-ENOMEM); + goto out; + } + + bitem->str = make_qstr(bitem_name); +out: + return bitem; +} + +static void free_bitem(struct hmdfs_config_bitem *bitem) +{ + if (bitem == NULL) + return; + + kfree(bitem->str.name); + kfree(bitem); +} + +#define HMDFS_BUNDLE_ATTRIBUTE(_attr_) \ + \ +static DEFINE_HASHTABLE(hmdfs_##_attr_##_hash_table, 4); \ + \ +static DEFINE_MUTEX(hmdfs_##_attr_##_hash_mutex); \ + \ +static int query_##_attr_##_hash_entry(struct qstr *str) \ +{ \ + int id = 0; \ + struct hmdfs_bid_entry *bid_entry; \ + struct hlist_node *hash_node; \ + \ + mutex_lock(&hmdfs_##_attr_##_hash_mutex); \ + hash_for_each_possible_safe(hmdfs_##_attr_##_hash_table, \ + bid_entry, hash_node, node, str->hash) { \ + if (qstr_case_eq(str, &bid_entry->str)) { \ + id = bid_entry->id; \ + break; \ + } \ + } \ + mutex_unlock(&hmdfs_##_attr_##_hash_mutex); \ + \ + return id; \ +} \ + \ +static int 
insert_##_attr_##_hash_entry(struct qstr *str, int id) \ +{ \ + int err = 0; \ + struct hmdfs_bid_entry *bid_entry; \ + struct hlist_node *hash_node; \ + \ + hmdfs_info("insert name = %s", str->name); \ + \ + mutex_lock(&hmdfs_##_attr_##_hash_mutex); \ + hash_for_each_possible_safe(hmdfs_##_attr_##_hash_table, \ + bid_entry, hash_node, node, str->hash) { \ + if (qstr_case_eq(str, &bid_entry->str)) { \ + bid_entry->id = id; \ + mutex_unlock(&hmdfs_##_attr_##_hash_mutex); \ + goto out; \ + } \ + } \ + mutex_unlock(&hmdfs_##_attr_##_hash_mutex); \ + \ + bid_entry = alloc_bid_entry(str->name, id); \ + if (IS_ERR(bid_entry)) { \ + err = PTR_ERR(bid_entry); \ + goto out; \ + } \ + \ + hash_add_rcu(hmdfs_##_attr_##_hash_table, &bid_entry->node, \ + bid_entry->str.hash); \ +out: \ + return err; \ +} \ + \ +static void remove_##_attr_##_hash_entry(struct qstr *str) \ +{ \ + struct hmdfs_bid_entry *bid_entry; \ + struct hlist_node *hash_node; \ + \ + hmdfs_info("remove name = %s", str->name); \ + \ + mutex_lock(&hmdfs_##_attr_##_hash_mutex); \ + hash_for_each_possible_safe(hmdfs_##_attr_##_hash_table, \ + bid_entry, hash_node, node, str->hash) { \ + if (qstr_case_eq(str, &bid_entry->str)) { \ + hash_del_rcu(&bid_entry->node); \ + free_bid_entry(bid_entry); \ + break; \ + } \ + } \ + mutex_unlock(&hmdfs_##_attr_##_hash_mutex); \ +} \ + \ +static void clear_##_attr_##_hash_entry(void) \ +{ \ + int index; \ + struct hmdfs_bid_entry *bid_entry; \ + struct hlist_node *hash_node; \ + \ + hmdfs_info("clear bid entry"); \ + \ + mutex_lock(&hmdfs_##_attr_##_hash_mutex); \ + hash_for_each_safe(hmdfs_##_attr_##_hash_table, index, \ + hash_node, bid_entry, node) { \ + hash_del_rcu(&bid_entry->node); \ + kfree(bid_entry->str.name); \ + kmem_cache_free(hmdfs_bid_entry_cachep, bid_entry); \ + } \ + mutex_unlock(&hmdfs_##_attr_##_hash_mutex); \ +} \ + \ +static int hmdfs_##_attr_##_get(const char *bname) \ +{ \ + struct qstr str; \ + \ + str = make_qstr(bname); \ + return query_##_attr_##_hash_entry(&str); \ +} \ + \ +static ssize_t hmdfs_##_attr_##_show(struct config_item *item, \ + char *page) \ +{ \ + int id; \ + struct hmdfs_config_bitem *bitem; \ + \ + hmdfs_info("show bundle id"); \ + \ + bitem = container_of(item, struct hmdfs_config_bitem, item); \ + id = query_##_attr_##_hash_entry(&bitem->str); \ + \ + return scnprintf(page, PAGE_SIZE, "%u\n", id); \ +} \ + \ +static ssize_t hmdfs_##_attr_##_store(struct config_item *item, \ + const char *page, size_t count) \ +{ \ + int id; \ + int err; \ + size_t size; \ + struct hmdfs_config_bitem *bitem; \ + \ + hmdfs_info("store bundle id"); \ + \ + bitem = container_of(item, struct hmdfs_config_bitem, item); \ + \ + if (kstrtouint(page, 10, &id)) { \ + size = -EINVAL; \ + goto out; \ + } \ + \ + err = insert_##_attr_##_hash_entry(&bitem->str, id); \ + if (err) { \ + size = err; \ + goto out; \ + } \ + \ + size = count; \ +out: \ + return size; \ +} \ + \ +static struct configfs_attribute hmdfs_##_attr_##_attr = { \ + .ca_name = __stringify(_attr_), \ + .ca_mode = S_IRUGO | S_IWUGO, \ + .ca_owner = THIS_MODULE, \ + .show = hmdfs_##_attr_##_show, \ + .store = hmdfs_##_attr_##_store, \ +}; + +HMDFS_BUNDLE_ATTRIBUTE(bid) + +static struct configfs_attribute *hmdfs_battrs[] = { + &hmdfs_bid_attr, + NULL, +}; + +static void hmdfs_config_bitem_release(struct config_item *item) +{ + struct hmdfs_config_bitem *bitem; + + hmdfs_info("release bundle item"); + + bitem = container_of(item, struct hmdfs_config_bitem, item); + remove_bid_hash_entry(&bitem->str); + 
remove_bid_hash_entry(&bitem->str); + free_bitem(bitem); +} + +static struct configfs_item_operations hmdfs_config_bitem_ops = { + .release = hmdfs_config_bitem_release, +}; + +static struct config_item_type hmdfs_config_bitem_type = { + .ct_item_ops = &hmdfs_config_bitem_ops, + .ct_attrs = hmdfs_battrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *hmdfs_make_bitem(struct config_group *group, + const char *name) +{ + struct config_item *item; + struct hmdfs_config_bitem *bitem; + + hmdfs_info("make bundle item = %s", name); + + bitem = alloc_bitem(name); + if (IS_ERR(bitem)) { + item = ERR_PTR(-ENOMEM); + goto out; + } + + config_item_init_type_name(&bitem->item, name, + &hmdfs_config_bitem_type); + item = &bitem->item; +out: + return item; +} + +static struct configfs_group_operations hmdfs_group_ops = { + .make_item = hmdfs_make_bitem, +}; + +static struct config_item_type hmdfs_group_type = { + .ct_group_ops = &hmdfs_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem hmdfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "hmdfs", + .ci_type = &hmdfs_group_type, + }, + }, +}; + +int get_bid(const char *bname) +{ + return hmdfs_bid_get(bname); +} + +int __init hmdfs_init_configfs(void) +{ + int err; + struct configfs_subsystem *subsys; + + hmdfs_info("init configfs"); + + hmdfs_bid_entry_cachep = kmem_cache_create("hmdfs_bid_entry_cachep", + sizeof(struct hmdfs_bid_entry), 0, 0, NULL); + if (!hmdfs_bid_entry_cachep) { + hmdfs_err("failed to create bid entry cachep"); + err = -ENOMEM; + goto out; + } + + subsys = &hmdfs_subsystem; + config_group_init(&subsys->su_group); + mutex_init(&subsys->su_mutex); + + err = configfs_register_subsystem(subsys); + if (err) + hmdfs_err("failed to register subsystem"); + +out: + return err; +} + +void hmdfs_exit_configfs(void) +{ + hmdfs_info("hmdfs exit configfs"); + + configfs_unregister_subsystem(&hmdfs_subsystem); + clear_bid_hash_entry(); + + kmem_cache_destroy(hmdfs_bid_entry_cachep); +} \ No newline at end of file diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index d34b765ab65d..7afab9d98ada 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -872,6 +872,8 @@ static int hmdfs_getattr_local(const struct path *path, struct kstat *stat, hmdfs_get_lower_path(path->dentry, &lower_path); ret = vfs_getattr(&lower_path, stat, request_mask, flags); stat->ino = d_inode(path->dentry)->i_ino; + stat->uid = d_inode(path->dentry)->i_uid; + stat->gid = d_inode(path->dentry)->i_gid; hmdfs_put_lower_path(&lower_path); return ret; @@ -892,10 +894,12 @@ int hmdfs_permission(struct inode *inode, int mask) } else if (in_group_p(inode->i_gid)) { mode >>= 3; } else if (is_pkg_auth(hii->perm)) { - if (uid_eq(cur_uid, inode->i_uid)) + kuid_t bid = get_bid_from_uid(cur_uid); + + if (uid_eq(bid, inode->i_uid)) return 0; } else if (is_system_auth(hii->perm)) { - if (in_group_p(MEDIA_RW_GID)) + if (in_group_p(USER_DATA_RW_GID)) return 0; } diff --git a/fs/hmdfs/main.c b/fs/hmdfs/main.c index c9b28e8cb9f1..0456a247caf6 100644 --- a/fs/hmdfs/main.c +++ b/fs/hmdfs/main.c @@ -1034,6 +1034,11 @@ static int __init hmdfs_init(void) hmdfs_err("hmdfs register failed!"); goto out_err; } + + err = hmdfs_init_configfs(); + if (err) + goto out_err; + err = hmdfs_sysfs_init(); if (err) goto out_err; @@ -1043,6 +1048,7 @@ static int __init hmdfs_init(void) return 0; out_err: hmdfs_sysfs_exit(); + hmdfs_exit_configfs(); unregister_filesystem(&hmdfs_fs_type); hmdfs_destroy_caches(); hmdfs_err("hmdfs init 
failed!"); @@ -1053,6 +1059,7 @@ static void __exit hmdfs_exit(void) { hmdfs_destroy_debugfs_root(); hmdfs_sysfs_exit(); + hmdfs_exit_configfs(); unregister_filesystem(&hmdfs_fs_type); ida_destroy(&hmdfs_sb_seq); hmdfs_destroy_caches(); -- Gitee From 1d2bdbd805260438f1e3449ff5e0807713594f34 Mon Sep 17 00:00:00 2001 From: roger Date: Mon, 7 Feb 2022 16:40:25 +0800 Subject: [PATCH 009/113] blackbox: Introduce a fault log collecting framework for registered modules of chips. ohos inclusion category: feature issue:I4Q6AR CVE: NA ------------------------------- The blackbox is a fault log collecting framework for registered modules of chips. When a fault occurs, bbox will invoke the registered function to save the log and reset the module. There are some APIs for modules to register to blackbox. bbox_register_module_ops is called by the module to register itself including the callback functions dumping logs and resetting it when a fault occurs. The registered modules call bbox_notify_error to report an error. When blackbox get the error, it will save the fault logs and restore the module. Signed-off-by: roger --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/blackbox/Kconfig | 102 ++++ drivers/staging/blackbox/Makefile | 5 + drivers/staging/blackbox/blackbox_common.c | 190 ++++++ drivers/staging/blackbox/blackbox_core.c | 603 ++++++++++++++++++++ drivers/staging/blackbox/blackbox_storage.c | 248 ++++++++ include/linux/blackbox.h | 68 +++ include/linux/blackbox_common.h | 37 ++ include/linux/blackbox_storage.h | 20 + 10 files changed, 1276 insertions(+) create mode 100644 drivers/staging/blackbox/Kconfig create mode 100644 drivers/staging/blackbox/Makefile create mode 100644 drivers/staging/blackbox/blackbox_common.c create mode 100644 drivers/staging/blackbox/blackbox_core.c create mode 100644 drivers/staging/blackbox/blackbox_storage.c create mode 100644 include/linux/blackbox.h create mode 100644 include/linux/blackbox_common.h create mode 100644 include/linux/blackbox_storage.h diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index e7cd80bb8761..d47ee7199bfc 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -124,4 +124,6 @@ source "drivers/staging/hievent/Kconfig" source "drivers/staging/hungtask/Kconfig" +source "drivers/staging/blackbox/Kconfig" + endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index dfa144064b94..b0fe6f912da6 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -52,3 +52,4 @@ obj-y += hikey9xx/ obj-$(CONFIG_HILOG) += hilog/ obj-$(CONFIG_HIEVENT) += hievent/ obj-$(CONFIG_DFX_HUNGTASK) += hungtask/ +obj-$(CONFIG_BLACKBOX) += blackbox/ diff --git a/drivers/staging/blackbox/Kconfig b/drivers/staging/blackbox/Kconfig new file mode 100644 index 000000000000..a3c94f927227 --- /dev/null +++ b/drivers/staging/blackbox/Kconfig @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0 +menu "Blackbox Options" + +config BLACKBOX + bool "Support for blackbox" + depends on !ARCH_HAS_SYSCALL_WRAPPER + select STORAGE if BLACKBOX_STORAGE_MATERIAL + default y + help + The blackbox is a fault log collecting framework for registered modules + of chips. When a fault occurs, blackbox will invoke the registered + function to save the log and reset the module. 
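For a client of this framework, the flow described in the commit message looks roughly like the sketch below. The dump/reset callback shapes follow invoke_module_ops() in blackbox_core.c; the exact prototypes of bbox_register_module_ops() and bbox_notify_error() live in include/linux/blackbox.h, which is not quoted in this excerpt, so their return types and argument lists are assumptions.

/* Illustrative sketch only: a chip module registering with blackbox.
 * struct module_ops field names follow the calls in blackbox_core.c;
 * the registration prototype is assumed, not copied from blackbox.h.
 */
#include <linux/blackbox.h>
#include <linux/module.h>

static void demo_dump(const char *log_dir, struct error_info *info)
{
	/* save module-specific logs under log_dir for this error */
}

static void demo_reset(struct error_info *info)
{
	/* bring the module back to a sane state after logging */
}

static struct module_ops demo_bbox_ops = {
	.module	= "DEMO",
	.dump	= demo_dump,
	.reset	= demo_reset,
};

static int __init demo_bbox_init(void)
{
	/* assumed to return 0 on success, as most registration helpers do */
	return bbox_register_module_ops(&demo_bbox_ops);
}
module_init(demo_bbox_init);
MODULE_LICENSE("GPL");

/* In the module's fault path, bbox_notify_error() is then used to report the
 * event so that blackbox saves the fault logs and resets the module; its
 * argument list is defined in blackbox.h and is not reproduced here.
 */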
+ +config BLACKBOX_LOG_ROOT_PATH + string "root path of the blackbox log" + depends on BLACKBOX + help + define the root path of the blackbox log + +config BLACKBOX_LOG_PART_REPRESENTATIVE + string "representative of the blackbox log part" + depends on BLACKBOX + help + define the representative of the blackbox log part + +config BLACKBOX_STORAGE_BY_MEMORY + tristate "blackbox fault log storage by memory directly" + depends on BLACKBOX + select STORAGE_BY_MEMORY + help + This option enables saving fault logs with memory by blackbox when a + panic occurs. It depends on supporting warm reset and disabling erase + ddr when warm reset. + +config BLACKBOX_STORAGE_BY_PSTORE_BLK + tristate "blackbox fault log storage by pstore blk" + depends on BLACKBOX + depends on PSTORE_BLK + depends on PSTORE_BLACKBOX + select STORAGE_BY_PSTORE_BLK + help + This option enables saving fault logs with pstore blk by blackbox when a + panic occurs. It depends on supporting pstore blk. Especially, flash + driver's panic_write implementation is needed. Othersize, if a panic + happen, then fault log can not be saved. + +config BLACKBOX_STORAGE_BY_PSTORE_RAM + tristate "blackbox fault log storage by pstore ram" + depends on BLACKBOX + depends on PSTORE_RAM + depends on PSTORE_BLACKBOX + select STORAGE_BY_PSTORE_RAM + help + This option enables saving fault logs with pstore ram by blackbox when a + panic occurs. It depends on supporting pstore ram. + +config BLACKBOX_STORAGE_BY_RAW_PARTITION + tristate "blackbox fault log storage by RAW partition" + depends on BLACKBOX + select STORAGE_BY_RAW_PARTITION + help + This option enables saving fault logs with RAW partition by blackbox when a + panic occurs. It depends on reserving partition for blackbox. + +config BLACKBOX_STORAGE_MATERIAL + def_bool y + depends on BLACKBOX + depends on BLACKBOX_STORAGE_BY_MEMORY || BLACKBOX_STORAGE_BY_PSTORE_BLK || \ + BLACKBOX_STORAGE_BY_PSTORE_RAM || BLACKBOX_STORAGE_BY_RAW_PARTITION + +choice + prompt "Default storage material for fault log when a panic occurs." + depends on BLACKBOX_STORAGE_MATERIAL + help + This option choose the default fault log material for blackbox when a + panic occurs. + + The default materail is ram directly. It's easy, but not work offen. 
+ + config DEF_BLACKBOX_STORAGE_BY_MEMORY + bool "memory" if BLACKBOX_STORAGE_BY_MEMORY + + config DEF_BLACKBOX_STORAGE_BY_PSTORE_BLK + bool "pstore_blk" if BLACKBOX_STORAGE_BY_PSTORE_BLK + + config DEF_BLACKBOX_STORAGE_BY_PSTORE_RAM + bool "pstore_ram" if BLACKBOX_STORAGE_BY_PSTORE_RAM + + config DEF_BLACKBOX_STORAGE_BY_RAW_PARTITION + bool "raw_partition" if BLACKBOX_STORAGE_BY_RAW_PARTITION + +endchoice + +config DEF_BLACKBOX_STORAGE + string + depends on BLACKBOX_STORAGE_MATERIAL + default "memory" if DEF_BLACKBOX_STORAGE_BY_MEMORY + default "pstore_blk" if DEF_BLACKBOX_STORAGE_BY_PSTORE_BLK + default "pstore_ram" if DEF_BLACKBOX_STORAGE_BY_PSTORE_RAM + default "raw_partition" if DEF_BLACKBOX_STORAGE_BY_RAW_PARTITION + +endmenu diff --git a/drivers/staging/blackbox/Makefile b/drivers/staging/blackbox/Makefile new file mode 100644 index 000000000000..9befa81a176e --- /dev/null +++ b/drivers/staging/blackbox/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_BLACKBOX) += blackbox_core.o \ + blackbox_storage.o \ + blackbox_common.o diff --git a/drivers/staging/blackbox/blackbox_common.c b/drivers/staging/blackbox/blackbox_common.c new file mode 100644 index 000000000000..e9329c175a6c --- /dev/null +++ b/drivers/staging/blackbox/blackbox_common.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void sys_reset(void) +{ + bbox_print_info("reset the system now!\n"); + emergency_restart(); + bbox_print_info("reset the system failed!\n"); +} + +void change_own_mode(char *path, int uid, int gid, int mode) +{ + mm_segment_t old_fs; + int ret = -1; + + if (unlikely(!path || uid == -1 || gid == -1)) { + bbox_print_err("path or uid or gid error.\n"); + return; + } + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_chown(path, uid, gid); + if (ret != 0) { + bbox_print_err("sys_chown [%s] failed, ret: %d\n", path, ret); + goto __out; + } + + ret = sys_chmod(path, mode); + if (ret != 0) { + bbox_print_err("sys_chmod [%s] failed, ret: %d\n", path, ret); + goto __out; + } + +__out: + set_fs(old_fs); +} + +int full_write_file(const char *pfile_path, char *buf, + size_t buf_size, bool is_append) +{ + mm_segment_t old_fs; + long total_to_write = (long)buf_size; + long total_write = 0; + long write_this_time; + char *ptemp = buf; + int fd = -1; + + if (unlikely(!pfile_path || !buf)) { + bbox_print_err("fd or buf is NULL!\n"); + return -EINVAL; + } + + old_fs = get_fs(); + set_fs(KERNEL_DS); + fd = sys_open(pfile_path, O_CREAT | O_RDWR | + (is_append ? O_APPEND : O_TRUNC), 0); + if (fd < 0) { + bbox_print_err("Create file [%s] failed! ret: %d\n", pfile_path, fd); + goto __out; + } + + while (total_to_write > 0) { + write_this_time = ksys_write(fd, ptemp, total_to_write); + if (write_this_time < 0) { + bbox_print_err("%s\n", "Failed to write file!\n"); + break; + } + ptemp += write_this_time; + total_to_write -= write_this_time; + total_write += write_this_time; + } + +__out: + if (fd >= 0) { + ksys_sync(); + ksys_close(fd); + } + set_fs(old_fs); + + return total_write == (long)buf_size ? 
0 : -1; +} + +static int create_new_dir(char *path) +{ + int ret; + mm_segment_t old_fs; + + if (unlikely(!path)) { + bbox_print_err("path is NULL!\n"); + return -EINVAL; + } + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_access(path, 0); + if (ret != 0) { + ret = sys_mkdir(path, BBOX_DIR_LIMIT); + if (ret < 0) { + bbox_print_err("Create dir [%s] failed! ret: %d\n", path, ret); + set_fs(old_fs); + return -EINVAL; + } + change_own_mode(path, AID_ROOT, AID_SYSTEM, BBOX_DIR_LIMIT); + } + set_fs(old_fs); + + return 0; +} + +int create_log_dir(const char *path) +{ + char *cur_path = NULL; + int index = 0; + + if (unlikely(!path)) { + bbox_print_err("path is NULL!\n"); + return -EINVAL; + } + + if (*path != '/') + return -EINVAL; + cur_path = vmalloc(PATH_MAX_LEN + 1); + if (!cur_path) { + bbox_print_err("vmalloc failed!\n"); + return -ENOMEM; + } + memset(cur_path, 0, PATH_MAX_LEN + 1); + cur_path[index++] = *path++; + while (*path != '\0') { + if (*path == '/') + create_new_dir(cur_path); + cur_path[index] = *path; + path++; + index++; + } + create_new_dir(cur_path); + vfree(cur_path); + + return 0; +} + +void get_timestamp(char *buf, size_t buf_size) +{ + struct rtc_time tm; + struct timespec64 tv; + + if (unlikely(!buf || buf_size == 0)) { + bbox_print_err("buf: %p, buf_size: %u\n", buf, (unsigned int)buf_size); + return; + } + + memset(buf, 0, buf_size); + memset(&tm, 0, sizeof(tm)); + + memset(&tv, 0, sizeof(tv)); + ktime_get_real_ts64(&tv); + tv.tv_sec -= (long)sys_tz.tz_minuteswest * SECONDS_PER_MINUTE; + rtc_time64_to_tm(tv.tv_sec, &tm); + + (void)scnprintf(buf, buf_size, TIMESTAMP_FORMAT, + tm.tm_year + YEAR_BASE, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, get_ticks()); + buf[buf_size - 1] = '\0'; +} + +unsigned long long get_ticks(void) +{ + /* use only one int value to save time: */ + + struct timespec64 uptime; + + ktime_get_ts64(&uptime); + + ktime_get_boottime_ts64(&uptime); + + return (u64)uptime.tv_sec; +} diff --git a/drivers/staging/blackbox/blackbox_core.c b/drivers/staging/blackbox/blackbox_core.c new file mode 100644 index 000000000000..95ff9dc0231a --- /dev/null +++ b/drivers/staging/blackbox/blackbox_core.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---- local macroes ---- */ +/* bbox/BBOX - blackbox */ +#define HISTORY_LOG_NAME "history.log" +#define LOG_PART_WAIT_TIME 1000 /* unit: ms */ +#define HISTORY_LOG_MAX_LEN 1024 +#define TOP_CATEGORY_SYSTEM_RESET "System Reset" +#define TOP_CATEGORY_FREEZE "System Freeze" +#define TOP_CATEGORY_SYSTEM_POWEROFF "POWEROFF" +#define TOP_CATEGORY_SUBSYSTEM_CRASH "Subsystem Crash" +#define CATEGORY_SYSTEM_REBOOT "SYSREBOOT" +#define CATEGORY_SYSTEM_POWEROFF "POWEROFF" +#define CATEGORY_SYSTEM_PANIC "PANIC" +#define CATEGORY_SYSTEM_OOPS "OOPS" +#define CATEGORY_SYSTEM_CUSTOM "CUSTOM" +#define CATEGORY_SYSTEM_WATCHDOG "HWWATCHDOG" +#define CATEGORY_SYSTEM_HUNGTASK "HUNGTASK" +#define CATEGORY_SUBSYSTEM_CUSTOM "CUSTOM" + +#ifndef CONFIG_BLACKBOX_LOG_ROOT_PATH +#error no blackbox log root path +#endif +#ifndef CONFIG_BLACKBOX_LOG_PART_REPRESENTATIVE +#error no representative of the blackbox log part +#endif + +/* ---- local prototypes ---- */ +struct bbox_ops { + struct list_head list; + struct module_ops ops; +}; + +struct error_info_to_category { + const char *module; + struct { + const char *event; + const char *category; + const char *top_category; + } map; +}; + +/* ---- local variables ---- */ +static LIST_HEAD(ops_list); +static DEFINE_SPINLOCK(ops_list_lock); +static DEFINE_SEMAPHORE(temp_error_info_sem); +static struct error_info_to_category error_info_categorys[] = { + { + MODULE_SYSTEM, + {EVENT_SYSREBOOT, CATEGORY_SYSTEM_REBOOT, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_LONGPRESS, CATEGORY_SYSTEM_REBOOT, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_COMBINATIONKEY, CATEGORY_SYSTEM_REBOOT, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_SUBSYSREBOOT, CATEGORY_SYSTEM_REBOOT, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_POWEROFF, CATEGORY_SYSTEM_POWEROFF, TOP_CATEGORY_SYSTEM_POWEROFF} + }, + { + MODULE_SYSTEM, + {EVENT_PANIC, CATEGORY_SYSTEM_PANIC, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_OOPS, CATEGORY_SYSTEM_OOPS, TOP_CATEGORY_SYSTEM_RESET} + }, + { + MODULE_SYSTEM, + {EVENT_SYS_WATCHDOG, CATEGORY_SYSTEM_WATCHDOG, TOP_CATEGORY_FREEZE} + }, + { + MODULE_SYSTEM, + {EVENT_HUNGTASK, CATEGORY_SYSTEM_HUNGTASK, TOP_CATEGORY_FREEZE} + }, +}; + +struct error_info *temp_error_info; + +/* ---- local function prototypes ---- */ +static const char *get_top_category(const char *module, const char *event); +static const char *get_category(const char *module, const char *event); +static void format_log_dir(char *buf, size_t buf_size, + const char *log_root_dir, const char event[EVENT_MAX_LEN], + const char *timestamp); +static void save_history_log(const char *log_root_dir, + struct error_info *info, const char *timestamp, int need_sys_reset); +static void wait_for_log_part(void); +static void format_error_info(struct error_info *info, + const char event[EVENT_MAX_LEN], + const char module[MODULE_MAX_LEN], + const char error_desc[ERROR_DESC_MAX_LEN]); +static void save_last_log(void); +static int save_error_log(void *pparam); + +/* ---- global function prototypes ---- */ + +/* ---- function definitions ---- */ +static const char *get_top_category(const char *module, const char *event) +{ + int i; + int count = (int)ARRAY_SIZE(error_info_categorys); + + if (unlikely(!module || !event)) { + bbox_print_err("module: %p, event: %p\n", module, event); + return 
TOP_CATEGORY_SUBSYSTEM_CRASH; + } + + for (i = 0; i < count; i++) { + if (!strcmp(error_info_categorys[i].module, module) && + !strcmp(error_info_categorys[i].map.event, event)) { + return error_info_categorys[i].map.top_category; + } + } + if (!strcmp(module, MODULE_SYSTEM)) + return TOP_CATEGORY_SYSTEM_RESET; + + return TOP_CATEGORY_SUBSYSTEM_CRASH; +} + +static const char *get_category(const char *module, const char *event) +{ + int i; + int count = (int)ARRAY_SIZE(error_info_categorys); + + if (unlikely(!module || !event)) { + bbox_print_err("module: %p, event: %p\n", module, event); + return CATEGORY_SUBSYSTEM_CUSTOM; + } + + for (i = 0; i < count; i++) { + if (!strcmp(error_info_categorys[i].module, module) && + !strcmp(error_info_categorys[i].map.event, event)) { + return error_info_categorys[i].map.category; + } + } + if (!strcmp(module, MODULE_SYSTEM)) + return CATEGORY_SYSTEM_CUSTOM; + + return CATEGORY_SUBSYSTEM_CUSTOM; +} + +static void format_log_dir(char *buf, size_t buf_size, + const char *log_root_dir, const char event[EVENT_MAX_LEN], + const char *timestamp) +{ + if (unlikely(!buf || buf_size == 0 || !log_root_dir || + !event || !timestamp)) { + bbox_print_err("buf: %p, buf_size: %u, log_root_dir: %p, event: %p, timestamp: %p\n", + buf, (unsigned int)buf_size, log_root_dir, event, timestamp); + return; + } + + memset(buf, 0, buf_size); + scnprintf(buf, buf_size - 1, "%s/%s_%s", log_root_dir, event, timestamp); +} + +static void format_error_info(struct error_info *info, + const char event[EVENT_MAX_LEN], + const char module[MODULE_MAX_LEN], + const char error_desc[ERROR_DESC_MAX_LEN]) +{ + if (unlikely(!info || !event || !module || !error_desc)) { + bbox_print_err("info: %p, event: %p, module: %p, error_desc: %p\n", + info, event, module, error_desc); + return; + } + + memset(info, 0, sizeof(*info)); + strncpy(info->event, event, min(strlen(event), + sizeof(info->event) - 1)); + strncpy(info->module, module, min(strlen(module), + sizeof(info->module) - 1)); + get_timestamp(info->error_time, TIMESTAMP_MAX_LEN); + strncpy(info->error_desc, error_desc, min(strlen(error_desc), + sizeof(info->error_desc) - 1)); +} + +static void save_history_log(const char *log_root_dir, + struct error_info *info, const char *timestamp, int need_sys_reset) +{ + char history_log_path[PATH_MAX_LEN]; + char *buf; + + if (unlikely(!log_root_dir || !info || !timestamp)) { + bbox_print_err("log_root_dir: %p, info: %p, timestamp: %p\n", + log_root_dir, info, timestamp); + return; + } + + buf = vmalloc(HISTORY_LOG_MAX_LEN + 1); + if (!buf) + return; + + memset(buf, 0, HISTORY_LOG_MAX_LEN + 1); + scnprintf(buf, HISTORY_LOG_MAX_LEN, HISTORY_LOG_FORMAT, + get_top_category(info->module, info->event), info->module, + get_category(info->module, info->event), info->event, timestamp, + need_sys_reset ? 
"true" : "false", info->error_desc); + memset(history_log_path, 0, sizeof(history_log_path)); + scnprintf(history_log_path, sizeof(history_log_path) - 1, + "%s/%s", log_root_dir, HISTORY_LOG_NAME); + full_write_file(history_log_path, buf, strlen(buf), 1); + change_own_mode(history_log_path, AID_ROOT, AID_SYSTEM, BBOX_FILE_LIMIT); + vfree(buf); +} + +static bool is_log_part_mounted(void) +{ + int ret; + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_access(CONFIG_BLACKBOX_LOG_PART_REPRESENTATIVE, 0); + set_fs(old_fs); + + return ret == 0; +} + +static void wait_for_log_part(void) +{ + bbox_print_info("wait for log part [%s] begin!\n", + CONFIG_BLACKBOX_LOG_PART_REPRESENTATIVE); + while (!is_log_part_mounted()) + msleep(LOG_PART_WAIT_TIME); + + bbox_print_info("wait for log part [%s] end!\n", + CONFIG_BLACKBOX_LOG_PART_REPRESENTATIVE); +} + +static bool find_module_ops(struct error_info *info, struct bbox_ops **ops) +{ + struct list_head *cur = NULL; + struct list_head *next = NULL; + bool find_module = false; + + if (unlikely(!info || !ops)) { + bbox_print_err("info: %p, ops: %p!\n", info, ops); + return find_module; + } + + list_for_each_safe(cur, next, &ops_list) { + *ops = list_entry(cur, struct bbox_ops, list); + if (*ops && !strcmp((*ops)->ops.module, info->module)) { + find_module = true; + break; + } + } + if (!find_module) + bbox_print_err("[%s] hasn't been registered!\n", info->module); + + return find_module; +} + +static void invoke_module_ops(const char *log_dir, struct error_info *info, + struct bbox_ops *ops) +{ + if (unlikely(!info || !!ops)) { + bbox_print_err("info: %p, ops: %p!\n", info, ops); + return; + } + + if (ops->ops.dump && log_dir) { + bbox_print_info("[%s] starts dumping data!\n", ops->ops.module); + ops->ops.dump(log_dir, info); + bbox_print_info("[%s] ends dumping data!\n", ops->ops.module); + } + if (ops->ops.reset) { + bbox_print_info("[%s] starts resetting!\n", ops->ops.module); + ops->ops.reset(info); + bbox_print_info("[%s] ends resetting!\n", ops->ops.module); + } +} + +static void save_log_without_reset(struct error_info *info) +{ + unsigned long flags; + struct bbox_ops *ops = NULL; + char *log_dir = NULL; + char timestamp[TIMESTAMP_MAX_LEN]; + + if (unlikely(!info)) { + bbox_print_err("info: %p!\n", info); + return; + } + + /* get timestamp */ + get_timestamp(timestamp, sizeof(timestamp)); + + /* get bbox ops */ + spin_lock_irqsave(&ops_list_lock, flags); + if (!find_module_ops(info, &ops)) { + spin_unlock_irqrestore(&ops_list_lock, flags); + return; + } + spin_unlock_irqrestore(&ops_list_lock, flags); + create_log_dir(CONFIG_BLACKBOX_LOG_ROOT_PATH); + if (ops->ops.dump) { + /* create log root path */ + if (log_dir) { + format_log_dir(log_dir, PATH_MAX_LEN, + CONFIG_BLACKBOX_LOG_ROOT_PATH, info->event, timestamp); + create_log_dir(log_dir); + } else + bbox_print_err("vmalloc failed!\n"); + } + invoke_module_ops(log_dir, info, ops); + save_history_log(CONFIG_BLACKBOX_LOG_ROOT_PATH, info, timestamp, 0); + if (log_dir) + vfree(log_dir); +} + +static void save_log_with_reset(struct error_info *info) +{ + struct bbox_ops *ops = NULL; + + if (unlikely(!info)) { + bbox_print_err("info: %p!\n", info); + return; + } + + if (!find_module_ops(info, &ops)) + return; + + invoke_module_ops("", info, ops); + sys_reset(); +} + +static void save_temp_error_info(const char event[EVENT_MAX_LEN], + const char module[MODULE_MAX_LEN], + const char error_desc[ERROR_DESC_MAX_LEN]) +{ + if (unlikely(!event || !module || !error_desc)) { + 
bbox_print_err("event: %p, module: %p, error_desc: %p\n", + event, module, error_desc); + return; + } + + down(&temp_error_info_sem); + format_error_info(temp_error_info, event, module, error_desc); + up(&temp_error_info_sem); +} + +static void do_save_last_log(const struct bbox_ops *ops, const struct error_info *info) +{ + char *log_dir = NULL; + + if (unlikely(!ops || !info)) { + bbox_print_err("ops: %p, info: %p\n", + ops, info); + return; + } + + memset((void *)info, 0, sizeof(*info)); + if (ops->ops.get_last_log_info((struct error_info *)info) != 0) { + bbox_print_err("[%s] failed to get log info!\n", ops->ops.module); + return; + } + bbox_print_info("[%s] starts saving log!\n", ops->ops.module); + bbox_print_info("event: [%s] module: [%s], time is [%s]!\n", + info->event, info->module, info->error_time); + + log_dir = vmalloc(PATH_MAX_LEN); + if (!log_dir) + return; + + if (!strlen(info->error_time)) + get_timestamp((char *)info->error_time, TIMESTAMP_MAX_LEN); + + format_log_dir(log_dir, PATH_MAX_LEN, CONFIG_BLACKBOX_LOG_ROOT_PATH, + info->event, info->error_time); + create_log_dir(log_dir); + if (ops->ops.save_last_log(log_dir, (struct error_info *)info) == 0) + save_history_log(CONFIG_BLACKBOX_LOG_ROOT_PATH, + (struct error_info *)info, info->error_time, 1); + else + bbox_print_err("[%s] failed to save log!\n", ops->ops.module); + vfree(log_dir); +} + +static void save_last_log(void) +{ + unsigned long flags; + struct error_info *info = NULL; + struct list_head *cur = NULL; + struct list_head *next = NULL; + + info = vmalloc(sizeof(*info)); + if (!info) + return; + + spin_lock_irqsave(&ops_list_lock, flags); + list_for_each_safe(cur, next, &ops_list) { + struct bbox_ops *ops = list_entry(cur, struct bbox_ops, list); + + if (!ops) { + bbox_print_err("ops is NULL!\n"); + continue; + } + + if (ops->ops.get_last_log_info && + ops->ops.save_last_log) { + spin_unlock_irqrestore(&ops_list_lock, flags); + do_save_last_log(ops, info); + spin_lock_irqsave(&ops_list_lock, flags); + } else { + bbox_print_err("[%s] get_last_log_info: %p, %s: %p\n", + ops->ops.module, ops->ops.get_last_log_info, + __func__, ops->ops.save_last_log); + } + } + spin_unlock_irqrestore(&ops_list_lock, flags); + vfree(info); +} + +static void save_temp_error_log(void) +{ + down(&temp_error_info_sem); + if (!temp_error_info) { + bbox_print_err("temp_error_info: %p\n", temp_error_info); + up(&temp_error_info_sem); + return; + } + + if (strlen(temp_error_info->event) != 0) + save_log_without_reset(temp_error_info); + + vfree(temp_error_info); + temp_error_info = NULL; + up(&temp_error_info_sem); +} + +static int save_error_log(void *pparam) +{ + wait_for_log_part(); + save_last_log(); + save_temp_error_log(); + + return 0; +} + +#ifdef CONFIG_BLACKBOX_DEBUG +static void print_module_ops(void) +{ + struct bbox_ops *temp = NULL; + + bbox_print_info("The following modules have been registered!\n"); + list_for_each_entry(temp, &ops_list, list) { + bbox_print_info("module: %s, dump: %p, reset: %p, get_last_log_info: %p, + save_last_log: %p\n", + temp->ops.module, temp->ops.dump, temp->ops.reset, temp->ops.get_last_log_info, + temp->ops.save_last_log); + } +} +#endif + +int bbox_register_module_ops(struct module_ops *ops) +{ + struct bbox_ops *new_ops = NULL; + struct bbox_ops *temp = NULL; + struct list_head *cur = NULL; + struct list_head *next = NULL; + unsigned long flags; + + if (unlikely(!ops)) { + bbox_print_err("ops: %p\n", ops); + return -EINVAL; + } + + new_ops = vmalloc(sizeof(*new_ops)); + if (!new_ops) + return 
-ENOMEM; + memset(new_ops, 0, sizeof(*new_ops)); + memcpy(&new_ops->ops, ops, sizeof(*ops)); + spin_lock_irqsave(&ops_list_lock, flags); + if (list_empty(&ops_list)) + goto __out; + + list_for_each_safe(cur, next, &ops_list) { + temp = list_entry(cur, struct bbox_ops, list); + if (!strcmp(temp->ops.module, ops->module)) { + spin_unlock_irqrestore(&ops_list_lock, flags); + vfree(new_ops); + bbox_print_info("[%s] has been registered!\n", temp->ops.module); + return -ENODATA; + } + } + +__out: + bbox_print_info("[%s] is registered successfully!\n", ops->module); + list_add_tail(&new_ops->list, &ops_list); + spin_unlock_irqrestore(&ops_list_lock, flags); +#ifdef CONFIG_BLACKBOX_DEBUG + print_module_ops(); +#endif + + return 0; +} + +int bbox_notify_error(const char event[EVENT_MAX_LEN], const char module[MODULE_MAX_LEN], + const char error_desc[ERROR_DESC_MAX_LEN], int need_sys_reset) +{ + struct error_info *info = NULL; + + if (unlikely(!event || !module || !error_desc)) { + bbox_print_err("event: %p, module: %p, error_desc: %p\n", event, + module, error_desc); + return -EINVAL; + } + + info = vmalloc(sizeof(*info)); + if (!info) + return -ENOMEM; + + format_error_info(info, event, module, error_desc); + show_stack(current, NULL, KERN_DEFAULT); + if (!need_sys_reset) { + /* handle the error which do not need reset */ + if (!is_log_part_mounted()) + save_temp_error_info(event, module, error_desc); + else + save_log_without_reset(info); + } else { + /* handle the error which need reset */ + save_log_with_reset(info); + } + + vfree(info); + + return 0; +} + +static void __init select_storage_material(void) +{ + const struct reboot_crashlog_storage *tmp = NULL; + + if (!storage_material) + return; + + for (tmp = storage_lastwords; tmp->material; tmp++) { + if (!strcmp(storage_material, tmp->material)) { + storage_lastword = tmp; + return; + } + } +} + +static int __init blackbox_core_init(void) +{ + struct task_struct *tsk = NULL; + + select_storage_material(); + + temp_error_info = vmalloc(sizeof(*temp_error_info)); + if (!temp_error_info) + return -ENOMEM; + + memset(temp_error_info, 0, sizeof(*temp_error_info)); + + /* Create a kernel thread to save log */ + tsk = kthread_run(save_error_log, NULL, "save_error_log"); + if (IS_ERR(tsk)) { + vfree(temp_error_info); + temp_error_info = NULL; + bbox_print_err("kthread_run failed!\n"); + return -ESRCH; + } + + return 0; +} + +core_initcall(blackbox_core_init); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Blackbox core framework"); +MODULE_AUTHOR("OHOS"); diff --git a/drivers/staging/blackbox/blackbox_storage.c b/drivers/staging/blackbox/blackbox_storage.c new file mode 100644 index 000000000000..3e3634a6d667 --- /dev/null +++ b/drivers/staging/blackbox/blackbox_storage.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +char *storage_material = +#ifdef CONFIG_DEF_BLACKBOX_STORAGE + CONFIG_DEF_BLACKBOX_STORAGE; +#else + NULL; +#endif +const struct reboot_crashlog_storage *storage_lastword __ro_after_init; + +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_MEMORY) +static DEFINE_SEMAPHORE(kmsg_sem); +static char *lastlog; +unsigned int lastlog_len; +static int get_log_by_memory(void *in, unsigned int inlen) +{ + return 0; +} + +static int storage_log_by_memory(void *out, unsigned int outlen) +{ + if (unlikely(!out)) + return -EINVAL; + + /* Initialized from caller. 
*/ + lastlog = out; + lastlog_len = outlen; + return 0; +} + +/* Called after storage_log_by_memory successfully. */ +static void do_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + struct fault_log_info *pinfo; + + if (unlikely(!lastlog)) + return; + + /* get kernel log from kmsg dump module */ + if (down_trylock(&kmsg_sem) != 0) { + bbox_print_err("down_trylock failed!\n"); + return; + } + pinfo = (struct fault_log_info *)lastlog; + (void)kmsg_dump_get_buffer(dumper, true, lastlog + sizeof(*pinfo), + lastlog_len - sizeof(*pinfo), &pinfo->len); + up(&kmsg_sem); +} +#endif + +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_PSTORE_BLK) +#define LOG_FILE_WAIT_TIME 1000 /* unit: ms */ +#define RETRY_MAX_COUNT 10 +#define PSTORE_MOUNT_POINT "/sys/fs/pstore/" +#define FILE_LIMIT (0660) + +#if __BITS_PER_LONG == 64 +#define sys_lstat sys_newlstat +#else +#define sys_lstat sys_lstat64 +#endif + +struct sys_st { +#if __BITS_PER_LONG == 64 + struct stat __st; +#else + struct stat64 __st; +#endif +}; + +static bool is_pstore_part_ready(char *pstore_file) +{ + mm_segment_t old_fs; + int fd = -1; + void *buf = NULL; + char *full_path = NULL; + struct linux_dirent64 *dirp; + int num; + int ret = -1; + struct sys_st st; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + fd = sys_open(PSTORE_MOUNT_POINT, O_RDONLY, 0); + if (fd < 0) { + bbox_print_err("open dir [%s] failed!\n", PSTORE_MOUNT_POINT); + goto __out; + } + + buf = vmalloc(PATH_MAX_LEN); + if (!buf) + goto __out; + + full_path = vmalloc(PATH_MAX_LEN); + if (!full_path) + goto __out; + + dirp = buf; + + num = sys_getdents64(fd, dirp, PATH_MAX_LEN); + while (num > 0) { + while (num > 0) { + if ((strcmp(dirp->d_name, ".") == 0) || (strcmp(dirp->d_name, "..") == 0)) { + num -= dirp->d_reclen; + dirp = (void *)dirp + dirp->d_reclen; + continue; + } + + memset(full_path, 0, PATH_MAX_LEN); + snprintf(full_path, PATH_MAX_LEN - 1, "%s%s", PSTORE_MOUNT_POINT, dirp->d_name); + + memset((void *)&st, 0, sizeof(struct sys_st)); + + ret = sys_lstat(full_path, &st.__st); + if ((ret == 0) && (S_ISREG(st.__st.st_mode)) && + (strncmp(dirp->d_name, "blackbox", strlen("blackbox")) == 0)) { + if (strcmp(full_path, pstore_file) > 0) + strncpy(pstore_file, full_path, strlen(full_path)); + bbox_print_info("get pstore file name %s %s!\n", pstore_file, + ret ? "failed" : "successfully"); + } + + num -= dirp->d_reclen; + dirp = (void *)dirp + dirp->d_reclen; + } + + dirp = buf; + memset(buf, 0, PATH_MAX_LEN); + num = sys_getdents64(fd, dirp, PATH_MAX_LEN); + } + +__out: + if (fd >= 0) + sys_close(fd); + + set_fs(old_fs); + + vfree(buf); + vfree(full_path); + + return ret == 0; +} + +static int get_log_by_pstore_blk(void *in, unsigned int inlen) +{ + char pstore_file[PATH_MAX_LEN]; + void *pbuf = NULL; + void *pbuf_temp = NULL; + static int retry; + int need_read_size = 0; + int fd = -1; + int ret = 0; + + memset(pstore_file, 0, PATH_MAX_LEN); + while (!is_pstore_part_ready((char *)&pstore_file)) { + msleep(LOG_FILE_WAIT_TIME); + retry++; + if (retry >= RETRY_MAX_COUNT) + return -ENOENT; + } + + if (likely(in)) { + fd = sys_open(pstore_file, O_RDONLY, FILE_LIMIT); + if (fd < 0) { + bbox_print_err("%s():%d: open %s failed! 
[%d]\n", __func__, + __LINE__, pstore_file, fd); + return -EBADF; + } + memset(in, 0, inlen); + need_read_size = inlen; + pbuf = in; + + pbuf_temp = kzalloc(SZ_4K, GFP_KERNEL); + if (!pbuf_temp) + goto __out; + + while (need_read_size > 0) { + ret = sys_read(fd, pbuf_temp, SZ_4K); + if (ret < 0) { + bbox_print_err("%s():%d: read failed! [%d]\n", __func__, + __LINE__, ret); + goto __error; + } + + if (ret == 0) + break; + + memcpy((void *)pbuf, (const void *)pbuf_temp, ret); + pbuf += ret; + need_read_size -= ret; + } + kfree(pbuf_temp); + } + + sys_close(fd); + + return 0; + +__error: + kfree(pbuf_temp); +__out: + sys_close(fd); + return -EIO; +} +#endif + +const struct reboot_crashlog_storage storage_lastwords[] = { +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_MEMORY) + { + .get_log = get_log_by_memory, + .storage_log = storage_log_by_memory, + .blackbox_dump = do_kmsg_dump, + .material = "memory", + }, +#endif +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_PSTORE_BLK) + { + .get_log = get_log_by_pstore_blk, + .blackbox_dump = pstore_blackbox_dump, + .material = "pstore_blk", + }, +#endif +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_PSTORE_RAM) + { + .material = "pstore_ram", + }, +#endif +#if IS_ENABLED(CONFIG_DEF_BLACKBOX_STORAGE_BY_RAW_PARTITION) + { + .material = "raw_partition", + }, +#endif + { } +}; + diff --git a/include/linux/blackbox.h b/include/linux/blackbox.h new file mode 100644 index 000000000000..78e2599b8185 --- /dev/null +++ b/include/linux/blackbox.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#ifndef BLACKBOX_H +#define BLACKBOX_H + +#include +#include + +#define PATH_MAX_LEN 256 +#define EVENT_MAX_LEN 32 +#define MODULE_MAX_LEN 32 +#define TIMESTAMP_MAX_LEN 24 +#define ERROR_DESC_MAX_LEN 512 +#define LOG_FLAG "VALIDLOG" + +/* module type */ +#define MODULE_SYSTEM "SYSTEM" + +/* fault event type */ +#define EVENT_SYSREBOOT "SYSREBOOT" +#define EVENT_LONGPRESS "LONGPRESS" +#define EVENT_COMBINATIONKEY "COMBINATIONKEY" +#define EVENT_SUBSYSREBOOT "SUBSYSREBOOT" +#define EVENT_POWEROFF "POWEROFF" +#define EVENT_PANIC "PANIC" +#define EVENT_OOPS "OOPS" +#define EVENT_SYS_WATCHDOG "SYSWATCHDOG" +#define EVENT_HUNGTASK "HUNGTASK" +#define EVENT_BOOTFAIL "BOOTFAIL" + +#define bbox_print_err(format, ...) \ + pr_err("bbox: func: %s, line: %d, err: " \ + format, __func__, __LINE__, ##__VA_ARGS__) +#define bbox_print_info(format, ...) 
\ + pr_err("bbox: info: " format, ##__VA_ARGS__) + +struct error_info { + char event[EVENT_MAX_LEN]; + char module[MODULE_MAX_LEN]; + char error_time[TIMESTAMP_MAX_LEN]; + char error_desc[ERROR_DESC_MAX_LEN]; +}; + +struct fault_log_info { + char flag[8]; /* 8 is the length of the flag */ + int len; /* length of the kernel fault log */ + struct error_info info; +}; + +struct module_ops { + char module[MODULE_MAX_LEN]; + void (*dump)(const char *log_dir, struct error_info *info); + void (*reset)(struct error_info *info); + int (*get_last_log_info)(struct error_info *info); + int (*save_last_log)(const char *log_dir, struct error_info *info); +}; + +void get_timestamp(char *buf, size_t buf_size); +int bbox_register_module_ops(struct module_ops *ops); +int bbox_notify_error(const char event[EVENT_MAX_LEN], + const char module[MODULE_MAX_LEN], + const char error_desc[ERROR_DESC_MAX_LEN], + int need_sys_reset); + +#endif /* BLACKBOX_H */ diff --git a/include/linux/blackbox_common.h b/include/linux/blackbox_common.h new file mode 100644 index 000000000000..11e348e279b2 --- /dev/null +++ b/include/linux/blackbox_common.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#ifndef BLACKBOX_COMMON_H +#define BLACKBOX_COMMON_H + +#include + +/* bbox/BBOX - blackbox */ +#define YEAR_BASE 1900 +#define SECONDS_PER_MINUTE 60 +#define AID_ROOT 0 +#define AID_SYSTEM 1000 +#define BBOX_DIR_LIMIT 0775 +#define BBOX_FILE_LIMIT 0664 +#define PATH_MAX_LEN 256 + +/* + * format: + * [topCategoryName],module[moduleName],category[categoryName],\ + * event[eventName],time[seconds from 1970-01-01 00:00:00 UTC-tick],\ + * sysreboot[true|false],errordesc[errorDescription]\r\n + */ +#define HISTORY_LOG_FORMAT "[%s],module[%s],category[%s],event[%s],"\ + "time[%s],sysreboot[%s],errdesc[%s]\r\n" +#define TIMESTAMP_FORMAT "%04d%02d%02d%02d%02d%02d_%08llu" + +void sys_reset(void); +void change_own_mode(char *path, int uid, int gid, int mode); +int full_write_file(const char *pfile_path, char *buf, + size_t buf_size, bool read_file); +int create_log_dir(const char *path); +unsigned long long get_ticks(void); + +#endif /* BLACKBOX_COMMON_H */ diff --git a/include/linux/blackbox_storage.h b/include/linux/blackbox_storage.h new file mode 100644 index 000000000000..ee4e699b07f7 --- /dev/null +++ b/include/linux/blackbox_storage.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#ifndef BLACKBOX_STORAGE_H +#define BLACKBOX_STORAGE_H + +struct reboot_crashlog_storage { + int (*storage_log)(void *out, unsigned int outlen); + int (*get_log)(void *in, unsigned int inlen); + void (*blackbox_dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + const char *material; +}; + +extern char *storage_material; +extern const struct reboot_crashlog_storage *storage_lastword; +extern const struct reboot_crashlog_storage storage_lastwords[]; + +#endif /* BLACKBOX_STORAGE_H */ -- Gitee From d33625d2b42cc3883d5174779fccbe4e3d73a492 Mon Sep 17 00:00:00 2001 From: roger Date: Thu, 10 Feb 2022 00:47:47 +0800 Subject: [PATCH 010/113] pstore: Introduce BlackBox to pstore ohos inclusion category: feature issue:I4Q6AR CVE: NA ------------------------------- Store the customised kernel fault log for BlackBox when oops or panic happened. 
Signed-off-by: roger --- fs/pstore/Kconfig | 33 +++++++++++ fs/pstore/blk.c | 10 ++++ fs/pstore/platform.c | 112 ++++++++++++++++++++++++++++++++++++ fs/pstore/zone.c | 42 +++++++++++++- include/linux/pstore.h | 8 +++ include/linux/pstore_blk.h | 2 + include/linux/pstore_zone.h | 2 + 7 files changed, 208 insertions(+), 1 deletion(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8efe60487b48..25bd47bd03f3 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -138,6 +138,18 @@ config PSTORE_FTRACE If unsure, say N. +config PSTORE_BLACKBOX + bool "Store customised fault log" + depends on PSTORE + depends on BLACKBOX + help + Enable storing the customised fault log for BlackBox. + + With the option enabled, pstore will store the customised kernel + fault log for BlackBox when oops or panic happened. + + If unsure, say N. + config PSTORE_RAM tristate "Log panic/oops to a RAM buffer" depends on PSTORE @@ -263,3 +275,24 @@ config PSTORE_BLK_FTRACE_SIZE NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_BLACKBOX_SIZE + int "Size in Kbytes of fault log for BlackBox to store" + depends on PSTORE_BLK + depends on PSTORE_BLACKBOX + default 64 + help + This just sets size of fault log (blackbox_size) for pstore/blk. + The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLACKBOX_STACK_SIZE + int "Default stack size for BlackBox" if EXPERT + depends on PSTORE + depends on PSTORE_BLACKBOX + default 1024 + help + Defines default size of pstore stack size for blackbox. + Can be enlarged if needed. not recommended to shrink it. diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..6a768a5dae91 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -51,6 +51,14 @@ static long ftrace_size = -1; module_param(ftrace_size, long, 0400); MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); +#if IS_ENABLED(CONFIG_PSTORE_BLACKBOX) +static long blackbox_size = CONFIG_PSTORE_BLK_BLACKBOX_SIZE; +#else +static long blackbox_size = -1; +#endif +module_param(blackbox_size, long, 0400); +MODULE_PARM_DESC(blackbox_size, "blackbox size in kbytes"); + static bool best_effort; module_param(best_effort, bool, 0400); MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. 
do not require storage driver pstore support, default: off)"); @@ -144,6 +152,7 @@ static int __register_pstore_device(struct pstore_device_info *dev) verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); + verify_size(blackbox_size, 4096, dev->flags & PSTORE_FLAGS_BLACKBOX); #undef verify_size pstore_zone_info->total_size = dev->total_size; @@ -476,6 +485,7 @@ int pstore_blk_get_config(struct pstore_blk_config *info) info->pmsg_size = check_size(pmsg_size, 4096); info->ftrace_size = check_size(ftrace_size, 4096); info->console_size = check_size(console_size, 4096); + info->blackbox_size = check_size(blackbox_size, 4096); return 0; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b1ebf7b61732..f9ab44bf0de6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -16,6 +16,10 @@ #include #include #include +#ifdef CONFIG_PSTORE_BLACKBOX +#include +#include +#endif #if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) #include #endif @@ -58,6 +62,7 @@ static const char * const pstore_type_names[] = { "powerpc-common", "pmsg", "powerpc-opal", + "blackbox", }; static int pstore_new_entry; @@ -378,6 +383,113 @@ void pstore_record_init(struct pstore_record *record, record->time = ns_to_timespec64(ktime_get_real_fast_ns()); } +/* + * Store the customised fault log + */ +#ifdef CONFIG_PSTORE_BLACKBOX +#define PSTORE_FLAG "PSTORE" +#define CALLSTACK_MAX_ENTRIES 20 +static void dump_stacktrace(char *pbuf, size_t buf_size, bool is_panic) +{ + int i; + size_t stack_len = 0; + size_t com_len = 0; + unsigned long entries[CALLSTACK_MAX_ENTRIES]; + unsigned int nr_entries; + char tmp_buf[ERROR_DESC_MAX_LEN]; + bool find_panic = false; + + if (unlikely(!pbuf || !buf_size)) + return; + + memset(pbuf, 0, buf_size); + memset(tmp_buf, 0, sizeof(tmp_buf)); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + com_len = scnprintf(pbuf, buf_size, "Comm:%s,CPU:%d,Stack:", + current->comm, raw_smp_processor_id()); + for (i = 0; i < nr_entries; i++) { + if (stack_len >= sizeof(tmp_buf)) { + tmp_buf[sizeof(tmp_buf) - 1] = '\0'; + break; + } + stack_len += scnprintf(tmp_buf + stack_len, sizeof(tmp_buf) - stack_len, + "%pS-", (void *)entries[i]); + if (!find_panic && is_panic) { + if (strncmp(tmp_buf, "panic", strlen("panic")) == 0) + find_panic = true; + else + (void)memset(tmp_buf, 0, sizeof(tmp_buf)); + } + } + if (com_len >= buf_size) + return; + stack_len = min(buf_size - com_len, strlen(tmp_buf)); + memcpy(pbuf + com_len, tmp_buf, stack_len); + *(pbuf + buf_size - 1) = '\0'; +} + +void pstore_blackbox_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ +#if defined(CONFIG_PSTORE_BLK) + const char *why; + int ret; + + why = kmsg_dump_reason_str(reason); + + if (down_trylock(&psinfo->buf_lock)) { + /* Failed to acquire lock: give up if we cannot wait. */ + if (pstore_cannot_wait(reason)) { + pr_err("dump skipped in %s path: may corrupt error record\n", + in_nmi() ? 
"NMI" : why); + return; + } + if (down_interruptible(&psinfo->buf_lock)) { + pr_err("could not grab semaphore?!\n"); + return; + } + } + + char *dst; + size_t dst_size; + struct pstore_record record; + struct fault_log_info *pfault_log_info = (struct fault_log_info *)psinfo->buf; + + memset(pfault_log_info, 0, sizeof(*pfault_log_info)); + + pstore_record_init(&record, psinfo); + + record.type = PSTORE_TYPE_BLACKBOX; + record.reason = reason; + + memcpy(pfault_log_info->flag, LOG_FLAG, strlen(LOG_FLAG)); + strncpy(pfault_log_info->info.event, why, + min(strlen(why), sizeof(pfault_log_info->info.event) - 1)); + strncpy(pfault_log_info->info.module, PSTORE_FLAG, + min(strlen(PSTORE_FLAG), sizeof(pfault_log_info->info.module) - 1)); + get_timestamp(pfault_log_info->info.error_time, TIMESTAMP_MAX_LEN); + dump_stacktrace(pfault_log_info->info.error_desc, sizeof(pfault_log_info->info.error_desc), false); + + record.buf = psinfo->buf; + + dst = psinfo->buf; + dst_size = psinfo->bufsize; + + dst_size -= sizeof(struct fault_log_info); + + (void)kmsg_dump_get_buffer(dumper, true, dst + sizeof(struct fault_log_info), + dst_size, &(pfault_log_info->len)); + + record.size = sizeof(struct fault_log_info) + pfault_log_info->len; + ret = psinfo->write(&record); + + up(&psinfo->buf_lock); + +#endif +} +EXPORT_SYMBOL_GPL(pstore_blackbox_dump); +#endif + /* * callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the * end of the buffer. diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 3ce89216670c..4a175f2245f3 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -93,6 +93,7 @@ struct pstore_zone { * @ppsz: pmsg storage zone * @cpsz: console storage zone * @fpszs: ftrace storage zones + * @bpsz: blackbox storage zone * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes @@ -100,6 +101,7 @@ struct pstore_zone { * @console_read_cnt: counter of total read console zone * @ftrace_max_cnt: max count of @fpszs * @ftrace_read_cnt: counter of max read ftrace zone + * @blackbox_read_cnt: counter of total read blackbox zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -113,6 +115,7 @@ struct psz_context { struct pstore_zone *ppsz; struct pstore_zone *cpsz; struct pstore_zone **fpszs; + struct pstore_zone *bpsz; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; @@ -120,6 +123,7 @@ struct psz_context { unsigned int console_read_cnt; unsigned int ftrace_max_cnt; unsigned int ftrace_read_cnt; + unsigned int blackbox_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. 
@@ -325,6 +329,8 @@ static void psz_flush_all_dirty_zones(struct work_struct *work) ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); if (cxt->fpszs) ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); + if (cxt->bpsz) + ret |= psz_flush_dirty_zone(cxt->bpsz); if (ret && cxt->pstore_zone_info) schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); } @@ -617,6 +623,10 @@ static inline int psz_recovery(struct psz_context *cxt) if (ret) goto out; + ret = psz_recover_zone(cxt, cxt->bpsz); + if (ret) + goto out; + ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt); out: @@ -637,6 +647,7 @@ static int psz_pstore_open(struct pstore_info *psi) cxt->pmsg_read_cnt = 0; cxt->console_read_cnt = 0; cxt->ftrace_read_cnt = 0; + cxt->blackbox_read_cnt = 0; return 0; } @@ -713,6 +724,8 @@ static int psz_pstore_erase(struct pstore_record *record) if (record->id >= cxt->ftrace_max_cnt) return -EINVAL; return psz_record_erase(cxt, cxt->fpszs[record->id]); + case PSTORE_TYPE_BLACKBOX: + return psz_record_erase(cxt, cxt->bpsz); default: return -EINVAL; } } @@ -898,6 +911,8 @@ static int notrace psz_pstore_write(struct pstore_record *record) return -ENOSPC; return psz_record_write(cxt->fpszs[zonenum], record); } + case PSTORE_TYPE_BLACKBOX: + return psz_record_write(cxt->bpsz, record); default: return -EINVAL; } @@ -935,6 +950,13 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->blackbox_read_cnt == 0) { + cxt->blackbox_read_cnt++; + zone = cxt->bpsz; + if (psz_old_ok(zone)) + return zone; + } + return NULL; } @@ -1082,6 +1104,7 @@ static ssize_t psz_pstore_read(struct pstore_record *record) break; case PSTORE_TYPE_CONSOLE: case PSTORE_TYPE_PMSG: + case PSTORE_TYPE_BLACKBOX: readop = psz_record_read; break; default: @@ -1145,6 +1168,8 @@ static void psz_free_all_zones(struct psz_context *cxt) psz_free_zone(&cxt->cpsz); if (cxt->fpszs) psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); + if (cxt->bpsz) + psz_free_zone(&cxt->bpsz); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -1266,6 +1291,15 @@ static int psz_alloc_zones(struct psz_context *cxt) goto free_out; } + off_size += info->blackbox_size; + cxt->bpsz = psz_init_zone(PSTORE_TYPE_BLACKBOX, &off, + info->blackbox_size); + if (IS_ERR(cxt->bpsz)) { + err = PTR_ERR(cxt->bpsz); + cxt->bpsz = NULL; + goto free_out; + } + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); @@ -1301,7 +1335,7 @@ int register_pstore_zone(struct pstore_zone_info *info) } if (!info->kmsg_size && !info->pmsg_size && !info->console_size && - !info->ftrace_size) { + !info->ftrace_size && !info->blackbox_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -1326,6 +1360,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(pmsg_size, SECTOR_SIZE); check_size(console_size, SECTOR_SIZE); check_size(ftrace_size, SECTOR_SIZE); + check_size(blackbox_size, SECTOR_SIZE); #undef check_size @@ -1354,6 +1389,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); pr_debug("\tconsole size : %ld Bytes\n", info->console_size); pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size); + pr_debug("\tblackbox size : %ld Bytes\n", info->blackbox_size); err = psz_alloc_zones(cxt); if (err) { @@ -1395,6 +1431,10 @@ int register_pstore_zone(struct pstore_zone_info *info) cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; 
pr_cont(" ftrace"); } + if (info->blackbox_size) { + cxt->pstore.flags |= PSTORE_FLAGS_BLACKBOX; + pr_cont(" blackbox"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index eb93a54cff31..87f26a7c4e7e 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,8 @@ enum pstore_type_id { PSTORE_TYPE_PMSG = 7, PSTORE_TYPE_PPC_OPAL = 8, + PSTORE_TYPE_BLACKBOX = 9, + /* End of the list */ PSTORE_TYPE_MAX }; @@ -202,6 +204,7 @@ struct pstore_info { #define PSTORE_FLAGS_CONSOLE BIT(1) #define PSTORE_FLAGS_FTRACE BIT(2) #define PSTORE_FLAGS_PMSG BIT(3) +#define PSTORE_FLAGS_BLACKBOX BIT(4) extern int pstore_register(struct pstore_info *); extern void pstore_unregister(struct pstore_info *); @@ -282,4 +285,9 @@ pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val) } #endif +#ifdef CONFIG_PSTORE_BLACKBOX +extern void pstore_blackbox_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); +#endif + #endif /*_LINUX_PSTORE_H*/ diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 61e914522b01..399f10524356 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -96,6 +96,7 @@ void unregister_pstore_device(struct pstore_device_info *dev); * @pmsg_size: Total size of the pmsg storage area * @console_size: Total size of the console storage area * @ftrace_size: Total size for ftrace logging data (for all CPUs) + * @blackbox_size: Total size of the blackbox storage area */ struct pstore_blk_config { char device[80]; @@ -104,6 +105,7 @@ struct pstore_blk_config { unsigned long pmsg_size; unsigned long console_size; unsigned long ftrace_size; + unsigned long blackbox_size; }; /** diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 1e35eaa33e5e..1de82e4aa809 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -21,6 +21,7 @@ typedef ssize_t (*pstore_zone_erase_op)(size_t, loff_t); * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. * @console_size:The size of console zone which is the same as @kmsg_size. * @ftrace_size:The size of ftrace zone which is the same as @kmsg_size. + * @blackbox_size:The size of blackbox zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others @@ -48,6 +49,7 @@ struct pstore_zone_info { unsigned long pmsg_size; unsigned long console_size; unsigned long ftrace_size; + unsigned long blackbox_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_erase_op erase; -- Gitee From 56827c3630c7f0e913fdac1811c0de167c95f268 Mon Sep 17 00:00:00 2001 From: CY Fan Date: Wed, 9 Feb 2022 17:06:20 +0800 Subject: [PATCH 011/113] hyperhold: fix undefined reference to `__aeabi_uldivmod` error ohos inclusion category: bugfix issue: #I4T0KA CVE: NA ----------------- This patch fixes undefined reference to `__aeabi_uldivmod` error when make allmodconfig on arm32. In a 32-bit system, if a or b is a 64-bit value, you cannot simply use a/b, but need to use a special division function, such as do_div(a, b), div_u64(a, b) and so on. 
Signed-off-by: CY Fan --- drivers/hyperhold/hp_core.c | 6 +++--- drivers/hyperhold/hp_space.c | 2 +- include/linux/zswapd.h | 4 ++++ mm/memcg_control.c | 4 ++-- mm/zswapd.c | 22 +++++++++++----------- 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c index 86a9e4704f2e..0d80b88452d0 100644 --- a/drivers/hyperhold/hp_core.c +++ b/drivers/hyperhold/hp_core.c @@ -287,7 +287,7 @@ int hyperhold_addr_extent(u64 addr) if (!CHECK_INITED) return -EINVAL; - eid = addr / hyperhold.spc.ext_size; + eid = div_u64(addr, hyperhold.spc.ext_size); spc = space_of(eid); if (!CHECK(spc, "invalid eid %u!\n", eid)) return -EINVAL; @@ -302,7 +302,7 @@ int hyperhold_addr_offset(u64 addr) if (!CHECK_INITED) return -EINVAL; - return addr % hyperhold.spc.ext_size; + return do_div(addr, hyperhold.spc.ext_size); } EXPORT_SYMBOL(hyperhold_addr_offset); @@ -578,7 +578,7 @@ static int hpio_submit(struct hpio *hpio) bio_set_dev(bio, dev->bdev); ext_size = space_of(hpio->eid)->ext_size; - sec = (u64)hpio->eid * ext_size / dev->sec_size; + sec = div_u64((u64)hpio->eid * ext_size, dev->sec_size); bio->bi_iter.bi_sector = sec; for (i = 0; i < hpio->nr_page; i++) { if (!hpio->pages[i]) diff --git a/drivers/hyperhold/hp_space.c b/drivers/hyperhold/hp_space.c index 95d42d064290..cb3d3439c5a6 100644 --- a/drivers/hyperhold/hp_space.c +++ b/drivers/hyperhold/hp_space.c @@ -41,7 +41,7 @@ bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size) return false; } spc->ext_size = ext_size; - spc->nr_ext = dev_size / ext_size; + spc->nr_ext = div_u64(dev_size, ext_size); atomic_set(&spc->last_alloc_bit, 0); atomic_set(&spc->nr_alloced, 0); init_waitqueue_head(&spc->empty_wq); diff --git a/include/linux/zswapd.h b/include/linux/zswapd.h index 44cd060b12e4..f549137f71b0 100644 --- a/include/linux/zswapd.h +++ b/include/linux/zswapd.h @@ -93,6 +93,10 @@ static struct group_swap_device *register_group_swap(struct group_swap_ops *ops, static void unregister_group_swap(struct group_swap_device *gsdev) { } + +static void memcg_eswap_info_show(struct seq_file *m) +{ +} #endif #endif /* _LINUX_ZSWAPD_H */ diff --git a/mm/memcg_control.c b/mm/memcg_control.c index d56a2ba665b6..985fcaa66943 100644 --- a/mm/memcg_control.c +++ b/mm/memcg_control.c @@ -339,11 +339,11 @@ static u64 memcg_ub_ufs2zram_ratio_read(struct cgroup_subsys_state *css, struct static int memcg_force_swapin_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - unsigned long size; + u64 size; const unsigned int ratio = 100; size = memcg_data_size(memcg, SWAP_SIZE); - size = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size / ratio; + size = div_u64(atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size, ratio); swapin_memcg(memcg, size); diff --git a/mm/zswapd.c b/mm/zswapd.c index 577d97974229..36e8ffd42b73 100644 --- a/mm/zswapd.c +++ b/mm/zswapd.c @@ -250,7 +250,7 @@ static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) { const unsigned int percent_constant = 100; unsigned long long anon_pagefault; - unsigned long anon_total; + unsigned long long anon_total; unsigned long long ratio; struct mem_cgroup_per_node *mz = NULL; struct lruvec *lruvec = NULL; @@ -274,8 +274,8 @@ static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) + memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE); - ratio = (anon_pagefault 
- memcg->memcg_reclaimed.reclaimed_pagefault) * - percent_constant / (anon_total + 1); + ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) * + percent_constant, (anon_total + 1)); if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold)) return true; @@ -294,8 +294,8 @@ static bool get_area_anon_refault_status(void) if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time) return false; - ratio = (anon_pagefault - last_anon_pagefault) * percent_constant / - (jiffies_to_msecs(time - last_snapshot_time) + 1); + ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant, + (jiffies_to_msecs(time - last_snapshot_time) + 1)); anon_refault_ratio = ratio; if (ratio > get_area_anon_refault_threshold()) @@ -396,7 +396,7 @@ int get_zram_current_watermark(void) /* after_comp to before_comp */ diff_buffers *= get_compress_ratio(); /* page to ratio */ - diff_buffers = diff_buffers * percent_constant / nr_total; + diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total); return min(zram_wm_ratio, zram_wm_ratio - diff_buffers); } @@ -410,7 +410,7 @@ bool zram_watermark_ok(void) ratio = get_zram_current_watermark(); nr_zram_used = get_zram_used_pages(); - nr_wm = totalram_pages() * ratio / percent_constant; + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); if (nr_zram_used > nr_wm) return true; @@ -592,8 +592,8 @@ static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) nr_zram = memcg_data_size(memcg, CACHE_PAGE); nr_eswap = memcg_data_size(memcg, SWAP_PAGE); - zram_ratio = (nr_zram + nr_eswap) * percent_constant / - (nr_inactive + nr_active + nr_zram + nr_eswap + 1); + zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant, + (nr_inactive + nr_active + nr_zram + nr_eswap + 1)); if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) { count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP); continue; @@ -637,7 +637,7 @@ static u64 __calc_nr_to_reclaim(void) reclaim_size = min(reclaim_size, max_reclaim_size); /* MB to pages */ - return reclaim_size * SZ_1M / PAGE_SIZE; + return div_u64(reclaim_size * SZ_1M, PAGE_SIZE); } static void zswapd_shrink_node(pg_data_t *pgdat) @@ -706,7 +706,7 @@ u64 zram_watermark_diff(void) ratio = get_zram_current_watermark(); nr_zram_used = get_zram_used_pages(); - nr_wm = totalram_pages() * ratio / percent_constant; + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); if (nr_zram_used > nr_wm) return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM; -- Gitee From 0face466fbb67405a152fadbb58574bb9519d77b Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Tue, 8 Feb 2022 16:21:59 +0800 Subject: [PATCH 012/113] hyperhold: add encryption feature for hyperhold data ohos inclusion category: feature issue: #I4SW43 CVE: NA ------------------------------------------------ encrypting hyperhold data using inline encryption if /proc/sys/kernel/hyperhold/soft_crypt is 0, using soft encryption otherwise. default soft_crypt = 1. 
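
For context, a condensed and purely illustrative sketch (the helper below is hypothetical, not code from this patch): after crypto_init() runs, exactly one of dev->ctfm (software skcipher) or dev->blk_key (inline-encryption key) is set, matching the soft_crypt sysctl, and the write path picks the corresponding primitive much like hpio_submit() does.

/*
 * Hypothetical condensation of the path selection added by this patch.
 * Assumes it is built next to the hyperhold sources for hp_device.h.
 */
#include <linux/bio.h>
#include <linux/errno.h>
#include "hp_device.h"

static int hp_encrypt_for_write(struct hp_device *dev, struct bio *bio,
				struct page *dst, struct page *src)
{
	if (dev->ctfm)		/* soft_crypt = 1: per-page skcipher */
		return soft_crypt_page(dev->ctfm, dst, src, HP_DEV_ENCRYPT);

	if (dev->blk_key) {	/* soft_crypt = 0: inline crypto on the bio */
		inline_crypt_bio(dev->blk_key, bio);
		return 0;	/* pages are queued unmodified */
	}

	return -EINVAL;		/* crypto_init() has not been called */
}
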
Signed-off-by: Chengke Wang --- drivers/hyperhold/hp_core.c | 166 +++++++++++++++++++++++++++++++--- drivers/hyperhold/hp_device.c | 153 ++++++++++++++++++++++++++++++- drivers/hyperhold/hp_device.h | 15 +++ drivers/hyperhold/hp_iotab.h | 1 + 4 files changed, 322 insertions(+), 13 deletions(-) diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c index 0d80b88452d0..a55ee05f3fe6 100644 --- a/drivers/hyperhold/hp_core.c +++ b/drivers/hyperhold/hp_core.c @@ -16,11 +16,7 @@ #include "hp_space.h" #include "hp_iotab.h" -#ifdef CONFIG_HYPERHOLD_DEBUG -#define HP_DFLT_DEVICE "/dev/loop6" -#else #define HP_DFLT_DEVICE "/dev/by-name/hyperhold" -#endif #define HP_DFLT_EXT_SIZE (1 << 15) #define HP_DEV_NAME_LEN 256 #define HP_STATE_LEN 10 @@ -38,6 +34,7 @@ struct hyperhold { char device_name[HP_DEV_NAME_LEN]; u32 extent_size; + u32 enable_soft_crypt; struct hp_device dev; struct hp_space spc; @@ -79,6 +76,7 @@ void hyperhold_disable(bool force) if (hyperhold.write_wq) destroy_workqueue(hyperhold.write_wq); deinit_space(&hyperhold.spc); + crypto_deinit(&hyperhold.dev); unbind_bdev(&hyperhold.dev); out: if (hyperhold.inited) @@ -101,6 +99,8 @@ void hyperhold_enable(void) goto unlock; if (!bind_bdev(&hyperhold.dev, hyperhold.device_name)) goto err; + if (!crypto_init(&hyperhold.dev, hyperhold.enable_soft_crypt)) + goto err; if (!init_space(&hyperhold.spc, hyperhold.dev.dev_size, hyperhold.extent_size)) goto err; hyperhold.read_wq = alloc_workqueue("hyperhold_read", WQ_HIGHPRI | WQ_UNBOUND, 0); @@ -117,6 +117,7 @@ void hyperhold_enable(void) if (hyperhold.write_wq) destroy_workqueue(hyperhold.write_wq); deinit_space(&hyperhold.spc); + crypto_deinit(&hyperhold.dev); unbind_bdev(&hyperhold.dev); enable = false; unlock: @@ -132,8 +133,8 @@ void hyperhold_enable(void) } EXPORT_SYMBOL(hyperhold_enable); -static int hyperhold_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int enable_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { if (write) { if (!strcmp(buffer, "enable\n")) @@ -162,26 +163,93 @@ static int hyperhold_sysctl_handler(struct ctl_table *table, int write, return 0; } +static int device_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_dostring(table, write, buffer, lenp, ppos); + if (write && !ret) { + hyperhold.enable_soft_crypt = 1; + pr_info("device changed, default enable soft crypt.\n"); + } +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + +static int extent_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_douintvec(table, write, buffer, lenp, ppos); +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + +static int crypto_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + 
static struct ctl_table_header *hp_sysctl_header; static struct ctl_table hp_table[] = { { .procname = "enable", .mode = 0644, - .proc_handler = hyperhold_sysctl_handler, + .proc_handler = enable_sysctl_handler, }, { .procname = "device", .data = &hyperhold.device_name, .maxlen = sizeof(hyperhold.device_name), .mode = 0644, - .proc_handler = proc_dostring, + .proc_handler = device_sysctl_handler, }, { .procname = "extent_size", .data = &hyperhold.extent_size, .maxlen = sizeof(hyperhold.extent_size), .mode = 0644, - .proc_handler = proc_douintvec, + .proc_handler = extent_sysctl_handler, + }, + { + .procname = "soft_crypt", + .data = &hyperhold.enable_soft_crypt, + .maxlen = sizeof(hyperhold.enable_soft_crypt), + .mode = 0644, + .proc_handler = crypto_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, {} }; @@ -204,13 +272,14 @@ static struct ctl_table hp_sys_table[] = { bool is_hyperhold_enable(void) { - return CHECK_ENABLE; + return hyperhold.enable; } static int __init hyperhold_init(void) { strcpy(hyperhold.device_name, HP_DFLT_DEVICE); hyperhold.extent_size = HP_DFLT_EXT_SIZE; + hyperhold.enable_soft_crypt = 1; mutex_init(&hyperhold.init_lock); hp_sysctl_header = register_sysctl_table(hp_sys_table); if (!hp_sysctl_header) { @@ -536,10 +605,74 @@ void *hyperhold_io_private(struct hpio *hpio) } EXPORT_SYMBOL(hyperhold_io_private); +static struct page *get_encrypted_page(struct hp_device *dev, struct page *page, unsigned int op) +{ + struct page *encrypted_page = NULL; + + if (!dev->ctfm) { + encrypted_page = page; + get_page(encrypted_page); + goto out; + } + + encrypted_page = alloc_page(GFP_NOIO); + if (!encrypted_page) { + pr_err("alloc encrypted page failed!\n"); + goto out; + } + encrypted_page->index = page->index; + + /* just alloc a new page for read */ + if (!op_is_write(op)) + goto out; + + /* encrypt page for write */ + if (soft_crypt_page(dev->ctfm, encrypted_page, page, HP_DEV_ENCRYPT)) { + put_page(encrypted_page); + encrypted_page = NULL; + } +out: + return encrypted_page; +} + +static void put_encrypted_pages(struct bio *bio) +{ + struct bio_vec *bv = NULL; + struct bvec_iter_all iter; + + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); +} + static void hp_endio_work(struct work_struct *work) { struct hpio *hpio = container_of(work, struct hpio, endio_work); + struct hp_device *dev = NULL; + struct bio_vec *bv = NULL; + struct bvec_iter_all iter; + struct page *page = NULL; + u32 ext_size; + sector_t sec; + int i; + if (op_is_write(hpio->op)) + goto endio; + ext_size = space_of(hpio->eid)->ext_size; + dev = device_of(hpio->eid); + sec = hpio->eid * ext_size / dev->sec_size; + i = 0; + bio_for_each_segment_all(bv, hpio->bio, iter) { + page = bv->bv_page; + BUG_ON(i >= hpio->nr_page); + BUG_ON(!hpio->pages[i]); + if (dev->ctfm) + BUG_ON(soft_crypt_page(dev->ctfm, hpio->pages[i], page, HP_DEV_DECRYPT)); + sec += PAGE_SIZE / dev->sec_size; + i++; + } +endio: + put_encrypted_pages(hpio->bio); + bio_put(hpio->bio); if (hpio->endio) hpio->endio(hpio); } @@ -554,7 +687,6 @@ static void hpio_endio(struct bio *bio) hpio_set_state(hpio, bio->bi_status ? HPIO_FAIL : HPIO_DONE); wq = op_is_write(hpio->op) ? 
hyperhold.write_wq : hyperhold.read_wq; queue_work(wq, &hpio->endio_work); - bio_put(bio); atomic64_sub(sizeof(struct bio), &mem_used); } @@ -562,6 +694,7 @@ static int hpio_submit(struct hpio *hpio) { struct hp_device *dev = NULL; struct bio *bio = NULL; + struct page *page = NULL; u32 ext_size; sector_t sec; int i; @@ -584,18 +717,27 @@ static int hpio_submit(struct hpio *hpio) if (!hpio->pages[i]) break; hpio->pages[i]->index = sec; - if (!bio_add_page(bio, hpio->pages[i], PAGE_SIZE, 0)) + page = get_encrypted_page(dev, hpio->pages[i], hpio->op); + if (!page) goto err; + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + put_page(page); + goto err; + } sec += PAGE_SIZE / dev->sec_size; } + if (dev->blk_key) + inline_crypt_bio(dev->blk_key, bio); bio->bi_private = hpio; bio->bi_end_io = hpio_endio; + hpio->bio = bio; submit_bio(bio); pr_info("submit hpio %p for eid %u.\n", hpio, hpio->eid); return 0; err: + put_encrypted_pages(bio); bio_put(bio); atomic64_sub(sizeof(struct bio), &mem_used); return -EIO; diff --git a/drivers/hyperhold/hp_device.c b/drivers/hyperhold/hp_device.c index 0fd81be5ffa8..3eec00dca88b 100644 --- a/drivers/hyperhold/hp_device.c +++ b/drivers/hyperhold/hp_device.c @@ -7,10 +7,21 @@ #define pr_fmt(fmt) "[HYPERHOLD]" fmt -#include +#include +#include #include "hp_device.h" +#define HP_CIPHER_MODE BLK_ENCRYPTION_MODE_AES_256_XTS +#define HP_CIPHER_NAME "xts(aes)" +#define HP_KEY_SIZE (64) +#define HP_IV_SIZE (16) + +union hp_iv { + __le64 index; + __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; +}; + void unbind_bdev(struct hp_device *dev) { int ret; @@ -76,3 +87,143 @@ bool bind_bdev(struct hp_device *dev, const char *name) return false; } + +int soft_crypt_page(struct crypto_skcipher *ctfm, struct page *dst_page, + struct page *src_page, unsigned int op) +{ + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist dst, src; + int ret = 0; + union hp_iv iv; + + memset(&iv, 0, sizeof(union hp_iv)); + iv.index = cpu_to_le64(src_page->index); + + req = skcipher_request_alloc(ctfm, GFP_NOIO); + if (!req) { + pr_err("alloc skcipher request failed!\n"); + return -ENOMEM; + } + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + sg_init_table(&dst, 1); + sg_set_page(&dst, dst_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &iv); + if (op == HP_DEV_ENCRYPT) + ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + else if (op == HP_DEV_DECRYPT) + ret = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + else + BUG(); + + if (ret) + pr_err("%scrypt failed!\n", op == HP_DEV_ENCRYPT ? 
"en" : "de"); + + return ret; +} + +static struct crypto_skcipher *soft_crypto_init(const u8 *key) +{ + char *cipher = HP_CIPHER_NAME; + u32 key_len = HP_KEY_SIZE; + struct crypto_skcipher *ctfm = NULL; + int ret; + + ctfm = crypto_alloc_skcipher(cipher, 0, 0); + if (IS_ERR(ctfm)) { + pr_err("alloc ctfm failed, ret = %ld!\n", PTR_ERR(ctfm)); + ctfm = NULL; + goto err; + } + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + ret = crypto_skcipher_setkey(ctfm, key, key_len); + if (ret) { + pr_err("ctfm setkey failed, ret = %d!\n", ret); + goto err; + } + + return ctfm; +err: + if (ctfm) + crypto_free_skcipher(ctfm); + + return NULL; +} + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio) +{ + union hp_iv iv; + + memset(&iv, 0, sizeof(union hp_iv)); + iv.index = cpu_to_le64(bio->bi_iter.bi_sector); + + bio_crypt_set_ctx(bio, blk_key, iv.dun, GFP_NOIO); +} + +static struct blk_crypto_key *inline_crypto_init(const u8 *key) +{ + struct blk_crypto_key *blk_key = NULL; + u32 dun_bytes = HP_IV_SIZE - sizeof(__le64); + int ret; + + blk_key = kzalloc(sizeof(struct blk_crypto_key), GFP_KERNEL); + if (!blk_key) { + pr_err("blk key alloc failed!\n"); + goto err; + } + ret = blk_crypto_init_key(blk_key, key, HP_CIPHER_MODE, dun_bytes, PAGE_SIZE); + if (ret) { + pr_err("blk key init failed, ret = %d!\n", ret); + goto err; + } + + return blk_key; +err: + if (blk_key) + kfree_sensitive(blk_key); + + return NULL; +} +#else +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio) {} +static struct blk_crypto_key *inline_crypto_init(const u8 *key) +{ + return NULL; +} +#endif + +bool crypto_init(struct hp_device *dev, bool soft) +{ + u8 key[HP_KEY_SIZE]; + bool ret = false; + + get_random_bytes(key, HP_KEY_SIZE); + if (soft) { + dev->ctfm = soft_crypto_init(key); + ret = dev->ctfm; + } else { + dev->blk_key = inline_crypto_init(key); + ret = dev->blk_key; + } + memzero_explicit(key, HP_KEY_SIZE); + + return ret; +} + +void crypto_deinit(struct hp_device *dev) +{ + if (dev->ctfm) { + crypto_free_skcipher(dev->ctfm); + dev->ctfm = NULL; + } + if (dev->blk_key) { + kfree_sensitive(dev->blk_key); + dev->blk_key = NULL; + } +} diff --git a/drivers/hyperhold/hp_device.h b/drivers/hyperhold/hp_device.h index 52d5de370fda..06f007891481 100644 --- a/drivers/hyperhold/hp_device.h +++ b/drivers/hyperhold/hp_device.h @@ -9,6 +9,13 @@ #define _HP_DEVICE_H_ #include +#include +#include + +enum { + HP_DEV_ENCRYPT, + HP_DEV_DECRYPT, +}; struct hp_device { struct file *filp; @@ -16,8 +23,16 @@ struct hp_device { u32 old_block_size; u64 dev_size; u32 sec_size; + + struct crypto_skcipher *ctfm; + struct blk_crypto_key *blk_key; }; void unbind_bdev(struct hp_device *dev); bool bind_bdev(struct hp_device *dev, const char *name); +bool crypto_init(struct hp_device *dev, bool soft); +void crypto_deinit(struct hp_device *dev); +int soft_crypt_page(struct crypto_skcipher *ctfm, + struct page *dst_page, struct page *src_page, unsigned int op); +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio); #endif diff --git a/drivers/hyperhold/hp_iotab.h b/drivers/hyperhold/hp_iotab.h index a2f03620af13..b3785f7aaad9 100644 --- a/drivers/hyperhold/hp_iotab.h +++ b/drivers/hyperhold/hp_iotab.h @@ -39,6 +39,7 @@ struct hpio { hp_endio endio; struct work_struct endio_work; + struct bio *bio; struct list_head list; }; -- Gitee From 3ce87eb4e8519336a69d181e2780ab67ad1d0e4a Mon Sep 17 00:00:00 2001 From: 
Satya Durga Srinivasu Prabhala Date: Sun, 30 Jan 2022 10:28:03 +0800 Subject: [PATCH 013/113] sched: Support cpu isolation of core_ctl codeaurora inclusion category: feature issue: #I4SRVK CVE: NA Tested-by: Hu Zhaodong Signed-off-by: Tang Yizhou ------------------------------------------- Support cpu isolation of core_ctl based on Code Aurora's latest msm-4.14 source code. The following files under /sys/devices/system/cpu/cpuX/core_ctl/ are supported: enable, min_cpus, max_cpus, need_cpus, active_cpus, and global_state. Signed-off-by: Viresh Kumar Signed-off-by: Santosh Shukla Signed-off-by: Syed Rameez Mustafa Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala --- drivers/base/cpu.c | 38 ++ include/linux/cpuhotplug.h | 3 + include/linux/cpumask.h | 35 + include/linux/hrtimer.h | 9 + include/linux/nmi.h | 21 + include/linux/sched.h | 35 + include/linux/sched/core_ctl.h | 14 + include/linux/sched/isolation.h | 19 + include/linux/sched/stat.h | 9 + include/linux/stop_machine.h | 11 + include/linux/timer.h | 7 + include/trace/events/sched.h | 160 +++++ init/Kconfig | 26 + kernel/cpu.c | 17 + kernel/irq/cpuhotplug.c | 49 ++ kernel/irq/proc.c | 6 + kernel/sched/Makefile | 2 + kernel/sched/core.c | 439 ++++++++++++- kernel/sched/core_ctl.c | 1061 +++++++++++++++++++++++++++++++ kernel/sched/core_ctl.h | 19 + kernel/sched/cpupri.c | 3 + kernel/sched/fair.c | 130 +++- kernel/sched/rt.c | 11 +- kernel/sched/sched.h | 40 ++ kernel/sched/sched_avg.c | 186 ++++++ kernel/sched/topology.c | 11 +- kernel/sched/walt.c | 4 + kernel/smp.c | 3 +- kernel/stop_machine.c | 4 + kernel/time/hrtimer.c | 141 +++- kernel/time/timer.c | 78 +++ kernel/watchdog.c | 34 +- mm/vmstat.c | 5 +- 33 files changed, 2585 insertions(+), 45 deletions(-) create mode 100644 include/linux/sched/core_ctl.h create mode 100644 kernel/sched/core_ctl.c create mode 100644 kernel/sched/core_ctl.h create mode 100644 kernel/sched/sched_avg.c diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 8f1d6569564c..025a6c9dd622 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,9 +180,38 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_CPU_ISOLATION_OPT +static ssize_t isolate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = sysfs_emit(buf, "%d\n", isolated); + + return rc; +} + +static DEVICE_ATTR_RO(isolate); + +static struct attribute *cpu_isolated_attrs[] = { + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; +#endif + static const struct attribute_group *common_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_isolated_attr_group, #endif NULL }; @@ -190,6 +219,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = { static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_isolated_attr_group, #endif NULL }; @@ -220,6 +252,9 @@ static struct cpu_attr cpu_attrs[] = { _CPU_ATTR(online, &__cpu_online_mask), _CPU_ATTR(possible, &__cpu_possible_mask), _CPU_ATTR(present, &__cpu_present_mask), +#ifdef CONFIG_CPU_ISOLATION_OPT + _CPU_ATTR(core_ctl_isolated, &__cpu_isolated_mask), +#endif }; /* @@ -465,6 +500,9 @@ static struct 
attribute *cpu_root_attrs[] = { &cpu_attrs[0].attr.attr, &cpu_attrs[1].attr.attr, &cpu_attrs[2].attr.attr, +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_attrs[3].attr.attr, +#endif &dev_attr_kernel_max.attr, &dev_attr_offline.attr, &dev_attr_isolated.attr, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8fb893ed205e..dcd53762c467 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -71,6 +71,9 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, +#ifdef CONFIG_SCHED_CORE_CTRL + CPUHP_CORE_CTL_ISOLATION_DEAD, +#endif CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39..eb5acbe17a56 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -55,6 +55,7 @@ extern unsigned int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * @@ -96,6 +97,11 @@ extern struct cpumask __cpu_active_mask; #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +#ifdef CONFIG_CPU_ISOLATION_OPT +extern struct cpumask __cpu_isolated_mask; +#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask) +#endif + extern atomic_t __num_online_cpus; #if NR_CPUS > 1 @@ -129,6 +135,22 @@ static inline unsigned int num_online_cpus(void) #define cpu_active(cpu) ((cpu) == 0) #endif +#if defined(CONFIG_CPU_ISOLATION_OPT) && NR_CPUS > 1 +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) +#define num_online_uniso_cpus() \ +({ \ + cpumask_t mask; \ + \ + cpumask_andnot(&mask, cpu_online_mask, cpu_isolated_mask); \ + cpumask_weight(&mask); \ +}) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) +#else /* !CONFIG_CPU_ISOLATION_OPT || NR_CPUS == 1 */ +#define num_isolated_cpus() 0U +#define num_online_uniso_cpus() num_online_cpus() +#define cpu_isolated(cpu) 0U +#endif + extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) @@ -811,6 +833,9 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#ifdef CONFIG_CPU_ISOLATION_OPT +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) +#endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); @@ -851,6 +876,16 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline void +set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, &__cpu_isolated_mask); + else + cpumask_clear_cpu(cpu, &__cpu_isolated_mask); +} +#endif /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7f1b8549ebce..f1c6982ef650 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -74,6 +74,7 @@ enum hrtimer_restart { * * 0x00 
inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -93,6 +94,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure @@ -366,6 +369,12 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) #endif /* Exported timer functions: */ +#ifdef CONFIG_CPU_ISOLATION_OPT +/* To be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); +#else +static inline void hrtimer_quiesce_cpu(void *cpup) { } +#endif /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 750c7f395ca9..6848b270f366 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,6 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +#ifdef CONFIG_CPU_ISOLATION_OPT +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); +extern bool watchdog_configured(unsigned int cpu); +#endif void lockup_detector_soft_poweroff(void); void lockup_detector_cleanup(void); bool is_hardlockup(void); @@ -37,6 +42,22 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} +static inline bool watchdog_configured(unsigned int cpu) +{ + /* + * Pretend the watchdog is always configured. + * We will be waiting for the watchdog to be enabled in core isolation + */ + return true; +} +#endif #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/include/linux/sched.h b/include/linux/sched.h index e4b281653f7c..44d5d8ed532a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -222,6 +222,41 @@ enum task_event { IRQ_UPDATE = 5, }; +#ifdef CONFIG_CPU_ISOLATION_OPT +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h new file mode 100644 index 000000000000..ca321b7b0b08 --- /dev/null +++ b/include/linux/sched/core_ctl.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019-2020, The Linux Foundation. All rights reserved. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTRL +extern void core_ctl_check(u64 wallclock); +#else +static inline void core_ctl_check(u64 wallclock) { } +#endif +#endif diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70..22420b45b393 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -28,10 +28,25 @@ extern void __init housekeeping_init(void); #else +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline int housekeeping_any_cpu(enum hk_flags flags) +{ + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; +} +#else static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } +#endif static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { @@ -54,7 +69,11 @@ static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif +#ifdef CONFIG_CPU_ISOLATION_OPT + return !cpu_isolated(cpu); +#else return true; +#endif } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..ca8b0d1ccf94 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -21,6 +21,15 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_get_cpu_util(int cpu); +#else +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +#endif + static inline int sched_info_on(void) { #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 63ea9aff368f..57908e344979 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -32,6 +32,9 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); +#ifdef CONFIG_CPU_ISOLATION_OPT +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +#endif void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -80,6 +83,14 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + #endif /* CONFIG_SMP */ /* diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41..f80f416bed14 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -180,6 +180,13 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from cpusets, only */ +#ifdef CONFIG_CPU_ISOLATION_OPT +extern void timer_quiesce_cpu(void *cpup); +#else +static inline void timer_quiesce_cpu(void *cpup) { } +#endif + extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 
c96a4337afe6..27b6ed3c9e58 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include +#include #include #include @@ -600,6 +601,165 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); +#ifdef CONFIG_SCHED_CORE_CTRL +TRACE_EVENT(core_ctl_eval_need, + + TP_PROTO(unsigned int cpu, unsigned int old_need, + unsigned int new_need, unsigned int updated), + TP_ARGS(cpu, old_need, new_need, updated), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, old_need) + __field(u32, new_need) + __field(u32, updated) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->old_need = old_need; + __entry->new_need = new_need; + __entry->updated = updated; + ), + TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu, + __entry->old_need, __entry->new_need, __entry->updated) +); + +TRACE_EVENT(core_ctl_set_busy, + + TP_PROTO(unsigned int cpu, unsigned int busy, + unsigned int old_is_busy, unsigned int is_busy, int high_irqload), + TP_ARGS(cpu, busy, old_is_busy, is_busy, high_irqload), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, busy) + __field(u32, old_is_busy) + __field(u32, is_busy) + __field(bool, high_irqload) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->busy = busy; + __entry->old_is_busy = old_is_busy; + __entry->is_busy = is_busy; + __entry->high_irqload = high_irqload; + ), + TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d", + __entry->cpu, __entry->busy, __entry->old_is_busy, + __entry->is_busy, __entry->high_irqload) +); + +TRACE_EVENT(core_ctl_set_boost, + + TP_PROTO(u32 refcount, s32 ret), + TP_ARGS(refcount, ret), + TP_STRUCT__entry( + __field(u32, refcount) + __field(s32, ret) + ), + TP_fast_assign( + __entry->refcount = refcount; + __entry->ret = ret; + ), + TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret) +); + +TRACE_EVENT(core_ctl_update_nr_need, + + TP_PROTO(int cpu, int nr_need, int prev_misfit_need, + int nrrun, int max_nr, int nr_prev_assist), + + TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr_need) + __field(int, prev_misfit_need) + __field(int, nrrun) + __field(int, max_nr) + __field(int, nr_prev_assist) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_need = nr_need; + __entry->prev_misfit_need = prev_misfit_need; + __entry->nrrun = nrrun; + __entry->max_nr = max_nr; + __entry->nr_prev_assist = nr_prev_assist; + ), + + TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d nr_prev_assist=%d", + __entry->cpu, __entry->nr_need, __entry->prev_misfit_need, + __entry->nrrun, __entry->max_nr, __entry->nr_prev_assist) +); +#endif + +#ifdef CONFIG_SCHED_RUNNING_AVG +/* + * Tracepoint for sched_get_nr_running_avg + */ +TRACE_EVENT(sched_get_nr_running_avg, + + TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max), + + TP_ARGS(cpu, nr, nr_misfit, nr_max), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr) + __field(int, nr_misfit) + __field(int, nr_max) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr = nr; + __entry->nr_misfit = nr_misfit; + __entry->nr_max = nr_max; + ), + + TP_printk("cpu=%d nr=%d nr_misfit=%d nr_max=%d", + __entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max) +); +#endif + +#ifdef CONFIG_CPU_ISOLATION_OPT +/* + * sched_isolate - called when cores are isolated/unisolated + * + * @acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of 
cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); +#endif + /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. diff --git a/init/Kconfig b/init/Kconfig index 8b20edacf921..1512479e7782 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -658,6 +658,32 @@ config CPU_ISOLATION Say Y if unsure. +config SCHED_RUNNING_AVG + bool "per-rq and per-cluster running average statistics" + default n + +config CPU_ISOLATION_OPT + bool "CPU isolation optimization" + depends on SMP + default n + help + This option enables cpu isolation optimization, which allows + to isolate cpu dynamically. The isolated cpu will be unavailable + to scheduler and load balancer, and all its non-pinned timers, + IRQs and tasks will be migrated to other cpus, only pinned + kthread and IRQS are still allowed to run, this achieves + similar effect as hotplug but at lower latency cost. + +config SCHED_CORE_CTRL + bool "Core control" + depends on CPU_ISOLATION_OPT + select SCHED_RUNNING_AVG + default n + help + This option enables the core control functionality in + the scheduler. Core control automatically isolate and + unisolate cores based on cpu load and utilization. 
+ source "kernel/rcu/Kconfig" config BUILD_BIN2C diff --git a/kernel/cpu.c b/kernel/cpu.c index 67c22941b5f2..d33629370cf1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1052,6 +1052,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; +#endif + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -2495,6 +2500,11 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +#ifdef CONFIG_CPU_ISOLATION_OPT +struct cpumask __cpu_isolated_mask __read_mostly; +EXPORT_SYMBOL(__cpu_isolated_mask); +#endif + atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); @@ -2513,6 +2523,13 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +#ifdef CONFIG_CPU_ISOLATION_OPT +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(&__cpu_isolated_mask, src); +} +#endif + void set_cpu_online(unsigned int cpu, bool online) { /* diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359..47e017cf33ce 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "internals.h" @@ -58,6 +59,9 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity; bool brokeaff = false; int err; +#ifdef CONFIG_CPU_ISOLATION_OPT + struct cpumask available_cpus; +#endif /* * IRQ chip might be already torn down, but the irq descriptor is @@ -110,7 +114,17 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; +#endif + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { +#ifdef CONFIG_CPU_ISOLATION_OPT + const struct cpumask *default_affinity; +#endif + /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. @@ -120,7 +134,38 @@ static bool migrate_one_irq(struct irq_desc *desc) irq_shutdown_and_deactivate(desc); return false; } + +#ifdef CONFIG_CPU_ISOLATION_OPT + default_affinity = desc->affinity_hint ? : irq_default_affinity; + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_intersects(&available_cpus, default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + default_affinity); + else if (cpumask_empty(&available_cpus)) + affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. 
+ */ + affinity = cpumask_of(cpumask_any(affinity)); +#else affinity = cpu_online_mask; +#endif brokeaff = true; } /* @@ -129,7 +174,11 @@ static bool migrate_one_irq(struct irq_desc *desc) * mask and therefore might keep/reassign the irq to the outgoing * CPU. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + err = irq_set_affinity_locked(d, affinity, false); +#else err = irq_do_set_affinity(d, affinity, false); +#endif if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc..5613e4a3f2e2 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,6 +154,12 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } +#endif /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 75ab238bde9d..0e3173ee99fb 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,5 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o +obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aed3b931e670..33e19cbd4eee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -1893,6 +1895,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq_flags rf; struct rq *rq; int ret = 0; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t allowed_mask; +#endif rq = task_rq_lock(p, &rf); update_rq_clock(rq); @@ -1916,6 +1921,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask); + + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_valid_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + } +#else /* * Picking a ~random cpu helps in cases where we are changing affinity * for groups of tasks (ie. cpuset), so that load balancing is not @@ -1926,6 +1945,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ret = -EINVAL; goto out; } +#endif do_set_cpus_allowed(p, new_mask); @@ -1940,8 +1960,13 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) + goto out; +#else if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; +#endif if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; @@ -2293,12 +2318,19 @@ EXPORT_SYMBOL_GPL(kick_process); * select_task_rq() below may allow selection of !active CPUs in order * to satisfy the above rules. 
*/ +#ifdef CONFIG_CPU_ISOLATION_OPT +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) +#else static int select_fallback_rq(int cpu, struct task_struct *p) +#endif { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; +#ifdef CONFIG_CPU_ISOLATION_OPT + int isolated_candidate = -1; +#endif /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -2312,6 +2344,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } @@ -2322,7 +2356,18 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; +#endif goto out; } @@ -2341,6 +2386,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: +#ifdef CONFIG_CPU_ISOLATION_OPT + allow_iso = true; + state = bug; + break; +#else + /* fall through; */ +#endif + + case bug: BUG(); break; } @@ -2368,6 +2422,10 @@ static int select_fallback_rq(int cpu, struct task_struct *p) static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { +#ifdef CONFIG_CPU_ISOLATION_OPT + bool allow_isolated = (p->flags & PF_KTHREAD); +#endif + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -2385,8 +2443,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (unlikely(!is_cpu_allowed(p, cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); +#else if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); +#endif return cpu; } @@ -3939,7 +4003,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5936,6 +6000,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; +#ifdef CONFIG_CPU_ISOLATION_OPT + int dest_cpu; + cpumask_t allowed_mask; +#endif rcu_read_lock(); @@ -5997,20 +6065,30 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { +#endif + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } +#ifdef CONFIG_CPU_ISOLATION_OPT + } else { + retval = -EINVAL; } +#endif + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -6074,6 +6152,16 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + +#ifdef CONFIG_CPU_ISOLATION_OPT + /* The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); +#endif + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -6761,20 +6849,77 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) BUG(); } +#ifdef CONFIG_CPU_ISOLATION_OPT +/* + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +static void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +static void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} + +#else + +static void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ +} + +static void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ +} + +#endif /* CONFIG_CPU_ISOLATION_OPT */ + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. 
*/ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; struct rq_flags orf = *rf; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); +#else + cpumask_copy(&avail_cpus, cpu_online_mask); +#endif /* * Fudge the rq selection such that the below task selection loop @@ -6797,13 +6942,20 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread: + * remaining thread. */ if (rq->nr_running == 1) break; next = __pick_migrate_task(rq); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_mask)) { + detach_one_task_core(next, rq, &tasks); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -6816,31 +6968,278 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interferred since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); +#else dest_cpu = select_fallback_rq(dead_rq->cpu, next); +#endif rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { rq_unlock(rq, rf); rq = dead_rq; *rf = orf; rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks_core(&tasks, rq); } + +#ifdef CONFIG_CPU_ISOLATION_OPT +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + watchdog_disable(cpu); + + local_irq_disable(); + + irq_migrate_all_off_this_cpu(); + + flush_smp_call_function_from_idle(); + + /* Update our root-domain */ + rq_lock(rq, &rf); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU. 
+ */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, &rf, false); + + if (rq->rd) + set_rq_online(rq); + rq_unlock(rq, &rf); + + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq; + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || + !cpu_online(cpu) || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + rq = cpu_rq(cpu); + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding. 
+ */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + watchdog_disable(cpu); + irq_lock_sparse(); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + u64 start_time = 0; + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) + || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -7028,7 +7427,7 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); + migrate_tasks(rq, &rf, true); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..eef1d6921178 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014-2021, The Linux Foundation. All rights reserved. 
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sched.h" +#include "walt.h" + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool enable; + int nrrun; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->max_cpus); +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, 
char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += sysfs_emit_at(buf, count, + "CPU%u\n", cpu); + count += sysfs_emit_at(buf, count, + "\tCPU: %u\n", c->cpu); + count += sysfs_emit_at(buf, count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += sysfs_emit_at(buf, count, + "\tBusy%%: %u\n", c->busy); + count += sysfs_emit_at(buf, count, + "\tIs busy: %u\n", c->is_busy); + count += sysfs_emit_at(buf, count, + "\tNot preferred: %u\n", + c->not_preferred); + count += sysfs_emit_at(buf, count, + "\tNr running: %u\n", cluster->nrrun); + count += sysfs_emit_at(buf, count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += sysfs_emit_at(buf, count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += sysfs_emit_at(buf, count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += sysfs_emit_at(buf, count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. 
+ * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because, min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that, the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on smaller capacity cluster which + * needs to be migrated to higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max cpacity, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. + */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of number of CPUs in this cluster that should + * assist its previous cluster to makeup for + * insufficient CPUs there. + * + * For example: + * On tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but, for intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs, but, there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist, while there are isolated cpus + * in this cluster. 
+ */ + if (prev_cluster->nr_isolated_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * unisolate as many cores as the previous cluster + * needs assistance with. + */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + int high_irqload = sched_cpu_high_irqload(c->cpu); + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + high_irqload) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy, high_irqload); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. + */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + 
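+	/*
+	 * Requeue at the tail of the cluster LRU: the candidate scans in
+	 * try_to_isolate() and __try_to_unisolate() start from the head,
+	 * so recently adjusted CPUs are considered last.
+	 */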
list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't isolate busy CPUs. */ + if (c->is_busy) + continue; + + /* + * We isolate only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for isolation. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). 
+ */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int isolation_cpuhp_state(unsigned int cpu, bool online) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup = false, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return 0; + + if (online) { + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + } else { + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. 
*/ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return 0; +} + +static int core_ctl_isolation_online_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, true); +} + +static int core_ctl_isolation_dead_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, false); +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int __init core_ctl_init(void) +{ + struct sched_cluster *cluster; + int ret; + + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "core_ctl/isolation:online", + core_ctl_isolation_online_cpu, NULL); + + cpuhp_setup_state_nocalls(CPUHP_CORE_CTL_ISOLATION_DEAD, + "core_ctl/isolation:dead", + NULL, core_ctl_isolation_dead_cpu); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 
100644 index 000000000000..0be55ac6a526 --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019-2020, The Linux Foundation. All rights reserved. + */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTRL +void core_ctl_check(u64 wallclock); +int core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +#endif +#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0797..9d286972ed7a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -79,6 +79,9 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(lowest_mask, lowest_mask, cpu_isolated_mask); +#endif /* * We have to ensure that we have at least one bit * still set in the array, since the map could have diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f30bd5d6d655..42d51caa611c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5978,6 +5978,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (cpu_isolated(i)) + continue; + if (sched_idle_cpu(i)) return i; @@ -6138,6 +6141,9 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int return -1; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(cpus, cpus, cpu_isolated_mask); +#endif for_each_cpu_wrap(core, cpus, target) { bool idle = true; @@ -6176,6 +6182,8 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t if (!cpumask_test_cpu(cpu, p->cpus_ptr) || !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -6240,6 +6248,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6270,6 +6280,9 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; if (fits_capacity(task_util, cpu_cap)) @@ -6311,15 +6324,15 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + !cpu_isolated(target) && asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + ((available_idle_cpu(prev) || sched_idle_cpu(prev)) && + !cpu_isolated(target) && asym_fits_capacity(task_util, prev))) return prev; /* @@ -8301,6 +8314,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) for_each_cpu(cpu, sched_group_span(sdg)) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + capacity += cpu_cap; 
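+		/*
+		 * Isolated CPUs are skipped above, so they contribute neither
+		 * to the group's capacity sum nor to the min/max tracking.
+		 */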
min_capacity = min(cpu_cap, min_capacity); max_capacity = max(cpu_cap, max_capacity); @@ -8314,10 +8330,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { struct sched_group_capacity *sgc = group->sgc; - - capacity += sgc->capacity; - min_capacity = min(sgc->min_capacity, min_capacity); - max_capacity = max(sgc->max_capacity, max_capacity); + __maybe_unused cpumask_t *cpus = + sched_group_span(group); + + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, + min_capacity); + max_capacity = max(sgc->max_capacity, + max_capacity); + } group = group->next; } while (group != child->groups); } @@ -8525,6 +8547,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + if (cpu_isolated(i)) + continue; + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; @@ -8566,6 +8591,15 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_type = group_has_spare; + sgs->group_weight = group->group_weight; + return; + } + /* Check if dst CPU is idle and preferred to this group */ if (env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && @@ -8911,6 +8945,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .avg_load = UINT_MAX, .group_type = group_overloaded, }; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t allowed_cpus; + + cpumask_andnot(&allowed_cpus, p->cpus_ptr, cpu_isolated_mask); +#endif imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; @@ -8919,8 +8958,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (!cpumask_intersects(sched_group_span(group), + &allowed_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue; local_group = cpumask_test_cpu(this_cpu, @@ -9485,6 +9529,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + if (cpu_isolated(i)) + continue; + capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; @@ -9628,6 +9675,17 @@ static int need_active_balance(struct lb_env *env) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +#ifdef CONFIG_CPU_ISOLATION_OPT +int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_span(sg), group_balance_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} +#endif + static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9651,7 +9709,7 @@ static int should_we_balance(struct lb_env *env) /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) || cpu_isolated(cpu)) continue; /* Are we the first idle CPU? */ @@ -9659,7 +9717,7 @@ static int should_we_balance(struct lb_env *env) } /* Are we the first CPU of this group ? */ - return group_balance_cpu(sg) == env->dst_cpu; + return group_balance_cpu_not_isolated(sg) == env->dst_cpu; } /* @@ -9861,7 +9919,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ->active_balance_work. 
Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -10075,7 +10134,17 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + unsigned int available_cpus; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t avail_mask; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); +#else + available_cpus = num_online_cpus(); +#endif + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -10205,6 +10274,9 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + if (cpu_isolated(ilb)) + continue; + if (idle_cpu(ilb)) return ilb; } @@ -10259,6 +10331,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + cpumask_t cpumask; if (unlikely(rq->idle_balance)) return; @@ -10273,8 +10346,15 @@ static void nohz_balancer_kick(struct rq *rq) * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); + if (cpumask_empty(&cpumask)) + return; +#else + cpumask_copy(&cpumask, nohz.idle_cpus_mask); if (likely(!atomic_read(&nohz.nr_cpus))) return; +#endif if (READ_ONCE(nohz.has_blocked) && time_after(now, READ_ONCE(nohz.next_blocked))) @@ -10310,7 +10390,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. */ - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + for_each_cpu_and(i, sched_domain_span(sd), &cpumask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10488,6 +10568,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int balance_cpu; int ret = false; struct rq *rq; + cpumask_t cpus; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10507,7 +10588,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); +#else + cpumask_copy(&cpus, nohz.idle_cpus_mask); +#endif + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -10658,6 +10745,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we @@ -10771,6 +10861,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + /* + * Since core isolation doesn't update nohz.idle_cpus_mask, there + * is a possibility this nohz kicked cpu could be isolated. Hence + * return if the cpu is isolated. 
+ */ + if (cpu_isolated(this_rq->cpu)) + return; + /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are @@ -10792,8 +10890,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5938cf2e421b..6c1475950441 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -2279,7 +2283,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; rt_queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6fd06c16ee24..22ff400d5b08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -140,6 +140,10 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +#ifdef CONFIG_SMP +extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd); +#endif + extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution @@ -2957,6 +2961,11 @@ static inline unsigned long cpu_util_freq_walt(int cpu) return (util >= capacity) ? 
capacity : util; } + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} #else /* CONFIG_SCHED_WALT */ static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) { } @@ -2972,4 +2981,35 @@ static inline int is_reserved(int cpu) } static inline void clear_reserved(int cpu) { } + +static inline bool hmp_capable(void) +{ + return false; +} #endif /* CONFIG_SCHED_WALT */ + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; +#ifdef CONFIG_SCHED_RUNNING_AVG +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); +#else +static inline void sched_get_nr_running_avg(struct sched_avg_stats *stats) { } +#endif + +#ifdef CONFIG_CPU_ISOLATION_OPT +extern int group_balance_cpu_not_isolated(struct sched_group *sg); +#else +static inline int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + return group_balance_cpu(sg); +} +#endif /* CONFIG_CPU_ISOLATION_OPT */ + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks); +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 000000000000..d74579a1553d --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved. + */ +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "sched.h" +#include "walt.h" +#include + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define NR_THRESHOLD_PCT 15 + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. 
+ */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +/* + * Returns the CPU utilization % in the last window. + * + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + + raw_spin_lock_irqsave(&rq->lock, flags); + + util = rq->cfs.avg.util_avg; + capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = rq->prev_runnable_sum; + util = div64_u64(util, + sched_ravg_window >> SCHED_CAPACITY_SHIFT); + } +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 004e9505f7ad..b30b62f0d683 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1220,16 +1220,25 @@ build_sched_groups(struct sched_domain *sd, int cpu) * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity. 
*/ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t avail_mask; +#endif WARN_ON(!sg); do { int cpu, max_cpu = -1; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&avail_mask, sched_group_span(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); +#else sg->group_weight = cpumask_weight(sched_group_span(sg)); +#endif if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 753b852ab340..30db3d617914 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -23,6 +23,7 @@ #include #include "sched.h" #include "walt.h" +#include "core_ctl.h" #define CREATE_TRACE_POINTS #include #undef CREATE_TRACE_POINTS @@ -1681,6 +1682,9 @@ void walt_irq_work(struct irq_work *irq_work) for_each_cpu(cpu, cpu_possible_mask) raw_spin_unlock(&cpu_rq(cpu)->lock); + + if (!is_migration) + core_ctl_check(this_rq()->window_start); } static void walt_init_once(void) diff --git a/kernel/smp.c b/kernel/smp.c index f73a597c8e4c..92742aa1e348 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -957,7 +957,8 @@ void wake_up_all_idle_cpus(void) if (cpu == smp_processor_id()) continue; - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 890b79cf0e7c..3e67402079bb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,11 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ +#ifdef CONFIG_CPU_ISOLATION_OPT +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +#else static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +#endif { int ret; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ef90718c114..b8835ac5e31f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -984,7 +984,11 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ +#ifdef CONFIG_CPU_ISOLATION_OPT + WRITE_ONCE(timer->state, (timer->state | HRTIMER_STATE_ENQUEUED)); +#else WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); +#endif return timerqueue_add(&base->active, &timer->node); } @@ -1007,7 +1011,15 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ +#ifdef CONFIG_CPU_ISOLATION_OPT + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. 
+ */ + WRITE_ONCE(timer->state, newstate | (timer->state & HRTIMER_STATE_PINNED)); +#else WRITE_ONCE(timer->state, newstate); +#endif if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1061,6 +1073,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); +#ifdef CONFIG_CPU_ISOLATION_OPT + /* Make sure PINNED flag is cleared after removing hrtimer */ + timer->state &= ~HRTIMER_STATE_PINNED; +#endif return 1; } return 0; @@ -1153,6 +1169,12 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, new_base = base; } +#ifdef CONFIG_CPU_ISOLATION_OPT + timer->state &= ~HRTIMER_STATE_PINNED; + if (mode & HRTIMER_MODE_PINNED) + timer->state |= HRTIMER_STATE_PINNED; +#endif + first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; @@ -1507,9 +1529,13 @@ bool hrtimer_active(const struct hrtimer *timer) do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - +#ifdef CONFIG_CPU_ISOLATION_OPT + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || base->running == timer) +#else if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) +#endif return true; } while (read_seqcount_retry(&base->seq, seq) || @@ -2082,6 +2108,117 @@ int hrtimers_prepare_cpu(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_CPU_ISOLATION_OPT +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, + bool remove_pinned) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); + + timerqueue_init_head(&pinned); + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + if (is_hotplug) + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as ENQUEUED not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. + */ + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base, HRTIMER_MODE_ABS); + } +} + +static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned) +{ + struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; + int i; + + local_irq_save(flags); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = this_cpu_ptr(&hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], remove_pinned); + } + + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_restore(flags); +} + +int hrtimers_dead_cpu(unsigned int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); + __migrate_hrtimers(scpu, true); + local_bh_enable(); + return 0; +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); +} + +#else + static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -2157,6 +2294,8 @@ int hrtimers_dead_cpu(unsigned int scpu) return 0; } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a3ec21be3b14..926d0900fa36 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1921,6 +1921,65 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU + +#ifdef CONFIG_CPU_ISOLATION_OPT +static void migrate_timer_list(struct timer_base *new_base, + struct hlist_head *head, bool remove_pinned) +{ + struct timer_list *timer; + int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; + + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + internal_add_timer(new_base, timer); + } +} + +static void __migrate_timers(unsigned int cpu, bool remove_pinned) +{ + struct timer_base *old_base; + struct timer_base *new_base; + unsigned long flags; + int b, i; + + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock_irqsave(&new_base->lock, flags); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. 
+ */ + forward_timer_base(new_base); + + if (!cpu_online(cpu)) + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i, + remove_pinned); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irqrestore(&new_base->lock, flags); + put_cpu_ptr(&timer_bases); + } +} + +#else + static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; @@ -1934,6 +1993,8 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; @@ -1949,6 +2010,21 @@ int timers_prepare_cpu(unsigned int cpu) return 0; } +#ifdef CONFIG_CPU_ISOLATION_OPT +int timers_dead_cpu(unsigned int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, true); + return 0; +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(unsigned int *)cpup, false); +} + +#else + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; @@ -1985,6 +2061,8 @@ int timers_dead_cpu(unsigned int cpu) return 0; } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01bf977090dc..9d3ca28c6f8d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -171,6 +172,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); @@ -428,16 +430,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_enable(unsigned int cpu) +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); + unsigned int *enabled = this_cpu_ptr(&watchdog_en); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); + if (*enabled) + return; + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -452,11 +458,24 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); + + /* + * Need to ensure above operations are observed by other CPUs before + * indicating that timer is enabled. This is to synchronize core + * isolation and hotplug. Core isolation will wait for this flag to be + * set. 
+ */ + mb(); + *enabled = 1; } -static void watchdog_disable(unsigned int cpu) +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = per_cpu_ptr(&watchdog_en, cpu); + + if (!*enabled) + return; WARN_ON_ONCE(cpu != smp_processor_id()); @@ -468,6 +487,17 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); + + /* + * No need for barrier here since disabling the watchdog is + * synchronized with hotplug lock + */ + *enabled = 0; +} + +bool watchdog_configured(unsigned int cpu) +{ + return *per_cpu_ptr(&watchdog_en, cpu); } static int softlockup_stop_fn(void *data) diff --git a/mm/vmstat.c b/mm/vmstat.c index a03aa6b3e4dc..ec58ac28b4f7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1881,7 +1881,7 @@ int vmstat_refresh(struct ctl_table *table, int write, static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1973,7 +1973,8 @@ static void vmstat_shepherd(struct work_struct *w) for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!delayed_work_pending(dw) && need_update(cpu) && + !cpu_isolated(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); -- Gitee From 448b6875649927b25f86933f953e1f8986db5cf2 Mon Sep 17 00:00:00 2001 From: waterwin Date: Sat, 12 Feb 2022 14:57:21 +0800 Subject: [PATCH 014/113] hmdfs: multi user support ohos inclusion category: feature issue: #I4TF6Y CVE: NA ---------------------------------------------- hmdfs support multi user mount, with different user id, hap access file with different uid. Signed-off-by: qianjiaxing --- fs/hmdfs/authority/authentication.c | 12 ++++++++---- fs/hmdfs/authority/authentication.h | 14 +++++--------- fs/hmdfs/hmdfs.h | 3 +++ fs/hmdfs/inode_local.c | 8 -------- fs/hmdfs/main.c | 1 + fs/hmdfs/super.c | 18 ++++++++++++++++-- 6 files changed, 33 insertions(+), 23 deletions(-) diff --git a/fs/hmdfs/authority/authentication.c b/fs/hmdfs/authority/authentication.c index 85ac3c96c5b1..d56ac1490bb3 100644 --- a/fs/hmdfs/authority/authentication.c +++ b/fs/hmdfs/authority/authentication.c @@ -99,7 +99,8 @@ const struct cred *hmdfs_override_dir_fsids(struct inode *dir, * local uninstall. * Set appid + media_rw for local install. */ - int bid = get_bid(dentry->d_name.name); + int bid = get_bundle_uid(hmdfs_sb(dentry->d_sb), + dentry->d_name.name); if (bid != 0) { cred->fsuid = KUIDT_INIT(bid); @@ -359,13 +360,15 @@ void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, switch (info->perm & HMDFS_DIR_TYPE_MASK) { case HMDFS_DIR_PKG: - bid = get_bid(name); + bid = get_bundle_uid(hmdfs_sb(parent_inode->i_sb), name); if (bid != child->i_uid.val || bid != child->i_gid.val) - fixup_ownership_user_group(child, lower_dentry, bid, bid); + fixup_ownership_user_group(child, lower_dentry, bid, + bid); break; case HMDFS_DIR_DATA: case HMDFS_FILE_PKG_SUB: + case HMDFS_DIR_PKG_SUB: case HMDFS_DIR_DEFAULT: case HMDFS_FILE_DEFAULT: case HMDFS_DIR_PUBLIC: @@ -420,7 +423,8 @@ void check_and_fixup_ownership_remote(struct inode *dir, * local uninstall. * Set appid + media_rw for local install. 
*/ - int bid = get_bid(dentry->d_name.name); + int bid = get_bundle_uid(hmdfs_sb(dentry->d_sb), + dentry->d_name.name); if (bid != 0) { dinode->i_uid = KUIDT_INIT(bid); dinode->i_gid = KGIDT_INIT(bid); diff --git a/fs/hmdfs/authority/authentication.h b/fs/hmdfs/authority/authentication.h index 26838e2e8128..af6eec9a4897 100644 --- a/fs/hmdfs/authority/authentication.h +++ b/fs/hmdfs/authority/authentication.h @@ -174,14 +174,6 @@ static inline bool is_system_auth(__u16 perm) #define HMDFS_ALL_MASK (HMDFS_MOUNT_POINT_MASK | AUTH_MASK | HMDFS_DIR_TYPE_MASK | HMDFS_PERM_MASK) -static inline kuid_t get_bid_from_uid(kuid_t uid) -{ - kuid_t bid; - - bid.val = uid.val % BASE_USER_RANGE; - return bid; -} - static inline void set_inode_gid(struct inode *inode, kgid_t gid) { inode->i_gid = gid; @@ -259,6 +251,11 @@ extern int get_bid(const char *bname); extern int __init hmdfs_init_configfs(void); extern void hmdfs_exit_configfs(void); +static inline int get_bundle_uid(struct hmdfs_sb_info *sbi, const char *bname) +{ + return sbi->user_id * BASE_USER_RANGE + get_bid(bname); +} + #else static inline @@ -331,7 +328,6 @@ void hmdfs_check_cred(const struct cred *cred) { } -static inline int get_bid(const char *bname) { return 0; } static inline int __init hmdfs_init_configfs(void) { return 0; } static inline void hmdfs_exit_configfs(void) {} diff --git a/fs/hmdfs/hmdfs.h b/fs/hmdfs/hmdfs.h index d0a24db08f62..9b5d456e1217 100644 --- a/fs/hmdfs/hmdfs.h +++ b/fs/hmdfs/hmdfs.h @@ -190,6 +190,9 @@ struct hmdfs_sb_info { wait_queue_head_t async_readdir_wq; /* don't allow async readdir */ bool async_readdir_prohibit; + + /* multi user */ + unsigned int user_id; }; static inline struct hmdfs_sb_info *hmdfs_sb(struct super_block *sb) diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index 7afab9d98ada..c302d320de48 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -893,14 +893,6 @@ int hmdfs_permission(struct inode *inode, int mask) mode >>= 6; } else if (in_group_p(inode->i_gid)) { mode >>= 3; - } else if (is_pkg_auth(hii->perm)) { - kuid_t bid = get_bid_from_uid(cur_uid); - - if (uid_eq(bid, inode->i_uid)) - return 0; - } else if (is_system_auth(hii->perm)) { - if (in_group_p(USER_DATA_RW_GID)) - return 0; } if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) diff --git a/fs/hmdfs/main.c b/fs/hmdfs/main.c index 0456a247caf6..efc952a36afd 100644 --- a/fs/hmdfs/main.c +++ b/fs/hmdfs/main.c @@ -373,6 +373,7 @@ static int hmdfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",merge_disable"); seq_printf(m, ",ra_pages=%lu", root->d_sb->s_bdi->ra_pages); + seq_printf(m, ",user_id=%u", sbi->user_id); if (sbi->cache_dir) seq_printf(m, ",cache_dir=%s", sbi->cache_dir); diff --git a/fs/hmdfs/super.c b/fs/hmdfs/super.c index 92012f80ab37..52cc857f5e45 100644 --- a/fs/hmdfs/super.c +++ b/fs/hmdfs/super.c @@ -20,6 +20,7 @@ enum { OPT_VIEW_TYPE, OPT_NO_OFFLINE_STASH, OPT_NO_DENTRY_CACHE, + OPT_USER_ID, OPT_ERR, }; @@ -31,6 +32,7 @@ static match_table_t hmdfs_tokens = { { OPT_VIEW_TYPE, "merge" }, { OPT_NO_OFFLINE_STASH, "no_offline_stash" }, { OPT_NO_DENTRY_CACHE, "no_dentry_cache" }, + { OPT_USER_ID, "user_id=%s"}, { OPT_ERR, NULL }, }; @@ -74,6 +76,7 @@ int hmdfs_parse_options(struct hmdfs_sb_info *sbi, const char *data) char *options_src = NULL; substring_t args[MAX_OPT_ARGS]; unsigned long value = DEAULT_RA_PAGES; + unsigned int user_id = 0; struct super_block *sb = sbi->sb; int err = 0; @@ -100,10 +103,10 @@ int hmdfs_parse_options(struct hmdfs_sb_info *sbi, 
const char *data) name = match_strdup(&args[0]); if (name) { err = kstrtoul(name, 10, &value); - if (err) - goto out; kfree(name); name = NULL; + if (err) + goto out; } break; case OPT_LOCAL_DST: @@ -128,6 +131,17 @@ int hmdfs_parse_options(struct hmdfs_sb_info *sbi, const char *data) case OPT_NO_DENTRY_CACHE: sbi->s_dentry_cache = false; break; + case OPT_USER_ID: + name = match_strdup(&args[0]); + if (name) { + err = kstrtouint(name, 10, &user_id); + kfree(name); + name = NULL; + if (err) + goto out; + sbi->user_id = user_id; + } + break; default: err = -EINVAL; goto out; -- Gitee From 2c41e0a9122c4ef7d69f80227bb80ffd64b030b4 Mon Sep 17 00:00:00 2001 From: waterwin Date: Thu, 10 Feb 2022 19:48:40 +0800 Subject: [PATCH 015/113] hmdfs: Indroduce device security level check to hmdfs ohos inclusion category: feature issue: #I4T7IZ CVE: NA ---------------------------------------------- hmdfs data access cross device by network, device security level is bigger than data security level. Signed-off-by: qianjiaxing --- fs/hmdfs/comm/connection.c | 11 +++--- fs/hmdfs/comm/connection.h | 4 +- fs/hmdfs/comm/device_node.c | 29 ++++++++++++++- fs/hmdfs/comm/device_node.h | 8 ++++ fs/hmdfs/hmdfs_server.c | 73 +++++++++++++++++++++++++++++++++++++ fs/hmdfs/hmdfs_server.h | 8 ++++ 6 files changed, 125 insertions(+), 8 deletions(-) diff --git a/fs/hmdfs/comm/connection.c b/fs/hmdfs/comm/connection.c index 51e6f829eb34..5a2e5b0f5b02 100644 --- a/fs/hmdfs/comm/connection.c +++ b/fs/hmdfs/comm/connection.c @@ -1092,9 +1092,8 @@ static struct hmdfs_peer *add_peer_unsafe(struct hmdfs_sb_info *sbi, return peer2add; } -static struct hmdfs_peer * -alloc_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, - const struct connection_operations *conn_operations) +static struct hmdfs_peer *alloc_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, + const struct connection_operations *conn_operations, uint32_t devsl) { struct hmdfs_peer *node = kzalloc(sizeof(*node), GFP_KERNEL); @@ -1178,6 +1177,7 @@ alloc_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, init_waitqueue_head(&node->rebuild_inode_status_wq); INIT_LIST_HEAD(&node->stashed_inode_list); node->need_rebuild_stash_list = false; + node->devsl = devsl; return node; @@ -1206,7 +1206,8 @@ alloc_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, return NULL; } -struct hmdfs_peer *hmdfs_get_peer(struct hmdfs_sb_info *sbi, uint8_t *cid) +struct hmdfs_peer *hmdfs_get_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, + uint32_t devsl) { struct hmdfs_peer *peer = NULL, *on_sbi_peer = NULL; const struct connection_operations *conn_opr_ptr = NULL; @@ -1225,7 +1226,7 @@ struct hmdfs_peer *hmdfs_get_peer(struct hmdfs_sb_info *sbi, uint8_t *cid) hmdfs_info("Fatal! 
Cannot get peer operation"); goto out; } - peer = alloc_peer(sbi, cid, conn_opr_ptr); + peer = alloc_peer(sbi, cid, conn_opr_ptr, devsl); if (unlikely(!peer)) { hmdfs_info("Failed to alloc a peer"); goto out; diff --git a/fs/hmdfs/comm/connection.h b/fs/hmdfs/comm/connection.h index 6f3ee1baddf2..2d80491b9201 100644 --- a/fs/hmdfs/comm/connection.h +++ b/fs/hmdfs/comm/connection.h @@ -213,6 +213,7 @@ struct hmdfs_peer { /* sysfs */ struct kobject kobj; struct completion kobj_unregister; + uint32_t devsl; }; #define HMDFS_DEVID_LOCAL 0 @@ -297,7 +298,8 @@ struct connection *get_conn_impl(struct hmdfs_peer *node, int connect_type); void set_conn_sock_quickack(struct hmdfs_peer *node); -struct hmdfs_peer *hmdfs_get_peer(struct hmdfs_sb_info *sbi, uint8_t *cid); +struct hmdfs_peer *hmdfs_get_peer(struct hmdfs_sb_info *sbi, uint8_t *cid, + uint32_t devsl); struct hmdfs_peer *hmdfs_lookup_from_devid(struct hmdfs_sb_info *sbi, uint64_t device_id); diff --git a/fs/hmdfs/comm/device_node.c b/fs/hmdfs/comm/device_node.c index 183f3b7172c5..c6bf56086400 100644 --- a/fs/hmdfs/comm/device_node.c +++ b/fs/hmdfs/comm/device_node.c @@ -48,7 +48,7 @@ static void ctrl_cmd_update_socket_handler(const char *buf, size_t len, } memcpy(&cmd, buf, sizeof(cmd)); - node = hmdfs_get_peer(sbi, cmd.cid); + node = hmdfs_get_peer(sbi, cmd.cid, cmd.devsl); if (unlikely(!node)) { hmdfs_err("failed to update ctrl node: cannot get peer"); goto out; @@ -72,6 +72,28 @@ static void ctrl_cmd_update_socket_handler(const char *buf, size_t len, peer_put(node); } +static void ctrl_cmd_update_devsl_handler(const char *buf, size_t len, + struct hmdfs_sb_info *sbi) +{ + struct update_devsl_param cmd; + struct hmdfs_peer *node = NULL; + + if (unlikely(!buf || len != sizeof(cmd))) { + hmdfs_err("Recved a invalid userbuf"); + return; + } + memcpy(&cmd, buf, sizeof(cmd)); + + node = hmdfs_lookup_from_cid(sbi, cmd.cid); + if (unlikely(!node)) { + hmdfs_err("failed to update devsl: cannot get peer"); + return; + } + hmdfs_info("Found peer: device_id = %llu", node->device_id); + node->devsl = cmd.devsl; + peer_put(node); +} + static inline void hmdfs_disconnect_node_marked(struct hmdfs_peer *conn) { hmdfs_start_process_offline(conn); @@ -152,6 +174,7 @@ typedef void (*ctrl_cmd_handler)(const char *buf, size_t len, static const ctrl_cmd_handler cmd_handler[CMD_CNT] = { [CMD_UPDATE_SOCKET] = ctrl_cmd_update_socket_handler, + [CMD_UPDATE_DEVSL] = ctrl_cmd_update_devsl_handler, [CMD_OFF_LINE] = ctrl_cmd_off_line_handler, [CMD_OFF_LINE_ALL] = ctrl_cmd_off_line_all_handler, }; @@ -179,8 +202,10 @@ static const char *cmd2str(int cmd) case 0: return "CMD_UPDATE_SOCKET"; case 1: - return "CMD_OFF_LINE"; + return "CMD_UPDATE_DEVSL"; case 2: + return "CMD_OFF_LINE"; + case 3: return "CMD_OFF_LINE_ALL"; default: return "illegal cmd"; diff --git a/fs/hmdfs/comm/device_node.h b/fs/hmdfs/comm/device_node.h index 3c99c7fb679f..fdc64d467287 100644 --- a/fs/hmdfs/comm/device_node.h +++ b/fs/hmdfs/comm/device_node.h @@ -13,6 +13,7 @@ enum CTRL_NODE_CMD { CMD_UPDATE_SOCKET = 0, + CMD_UPDATE_DEVSL, CMD_OFF_LINE, CMD_OFF_LINE_ALL, CMD_CNT, @@ -21,11 +22,18 @@ enum CTRL_NODE_CMD { struct update_socket_param { int32_t cmd; int32_t newfd; + uint32_t devsl; uint8_t status; uint8_t masterkey[HMDFS_KEY_SIZE]; uint8_t cid[HMDFS_CID_SIZE]; } __packed; +struct update_devsl_param { + int32_t cmd; + uint32_t devsl; + uint8_t cid[HMDFS_CID_SIZE]; +} __attribute__((packed)); + struct offline_param { int32_t cmd; uint8_t remote_cid[HMDFS_CID_SIZE]; diff --git 
a/fs/hmdfs/hmdfs_server.c b/fs/hmdfs/hmdfs_server.c index c50e9f9de842..4da2fc70c1a5 100644 --- a/fs/hmdfs/hmdfs_server.c +++ b/fs/hmdfs/hmdfs_server.c @@ -220,6 +220,74 @@ static int hmdfs_get_inode_by_name(struct hmdfs_peer *con, const char *filename, return 0; } +static const char *datasl_str[] = { + "s0", "s1", "s2", "s3", "s4" +}; + +static int parse_data_sec_level(const char *sl_value, size_t sl_value_len) +{ + int i; + + for (i = 0; i <= sizeof(datasl_str) / sizeof(datasl_str[0]); i++) { + if (!strncmp(sl_value, datasl_str[i], strlen(datasl_str[i]))) + return i + DATA_SEC_LEVEL0; + } + + return DATA_SEC_LEVEL3; +} + +static int check_sec_level(struct hmdfs_peer *node, const char *file_name) +{ + int err; + int ret = 0; + struct path root_path; + struct path file_path; + char *value; + size_t value_len = DATA_SEC_LEVEL_LENGTH; + + if (node->devsl <= 0) { + ret = -EACCES; + goto out_free; + } + + value = kzalloc(value_len, GFP_KERNEL); + if (!value) { + ret = -ENOMEM; + goto out_free; + } + + err = kern_path(node->sbi->local_dst, LOOKUP_DIRECTORY, &root_path); + if (err) { + hmdfs_err("get root path error"); + ret = err; + goto out_free; + } + + err = vfs_path_lookup(root_path.dentry, root_path.mnt, file_name, 0, + &file_path); + if (err) { + hmdfs_err("get file path error"); + ret = err; + goto out_err; + } + + err = vfs_getxattr(file_path.dentry, DATA_SEC_LEVEL_LABEL, value, + value_len); + if (err <= 0 && node->devsl >= DATA_SEC_LEVEL3) + goto out; + if (err > 0 && node->devsl >= parse_data_sec_level(value, err)) + goto out; + + ret = -EACCES; +out: + path_put(&file_path); +out_err: + path_put(&root_path); +out_free: + kfree(value); + return ret; +} + static struct file *hmdfs_open_file(struct hmdfs_peer *con, const char *filename, uint8_t file_type, int *file_id) @@ -232,6 +300,11 @@ static struct file *hmdfs_open_file(struct hmdfs_peer *con, return ERR_PTR(-EINVAL); } + if (check_sec_level(con, filename)) { + hmdfs_err("devsl permission denied"); + return ERR_PTR(-EACCES); + } + if (hm_islnk(file_type)) file = hmdfs_open_photokit_path(con->sbi, filename); else diff --git a/fs/hmdfs/hmdfs_server.h b/fs/hmdfs/hmdfs_server.h index 844f3a9ee82c..740d06f8b3a5 100644 --- a/fs/hmdfs/hmdfs_server.h +++ b/fs/hmdfs/hmdfs_server.h @@ -12,6 +12,14 @@ #include "comm/transport.h" #include "comm/socket_adapter.h" +#define DATA_SEC_LEVEL0 0 +#define DATA_SEC_LEVEL1 1 +#define DATA_SEC_LEVEL2 2 +#define DATA_SEC_LEVEL3 3 +#define DATA_SEC_LEVEL4 4 +#define DATA_SEC_LEVEL_LABEL "user.security" +#define DATA_SEC_LEVEL_LENGTH 10 + static inline void hmdfs_send_err_response(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, int err) { -- Gitee From f56d1502ed37769427931556d6149ad2c24ffb1d Mon Sep 17 00:00:00 2001 From: CY Fan Date: Mon, 14 Feb 2022 00:01:50 +0800 Subject: [PATCH 016/113] hyperhold: fix compiler warnings when opening zswapd config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ohos inclusion category: bugfix issue: #I4TFXN CVE: NA ----------------- This patch fixes the following comipler warnings when opening zswapd config: warning: ‘report_app_info_show’ defined but not used warning: suggest parentheses around assignment used as truth value warning: comparison of distinct pointer types lacks a cast Signed-off-by: CY Fan --- mm/zswapd.c | 16 ++++++++-------- mm/zswapd_control.c | 44 -------------------------------------------- 2 files changed, 8 insertions(+), 52 deletions(-) diff --git a/mm/zswapd.c b/mm/zswapd.c index 
36e8ffd42b73..b5fcb0d2aa08 100644 --- a/mm/zswapd.c +++ b/mm/zswapd.c @@ -127,7 +127,7 @@ static u64 swapout(u64 req_size) struct mem_cgroup *memcg = NULL; u64 write_size = 0; - while ((memcg = get_next_memcg(memcg))) { + while ((memcg = get_next_memcg(memcg)) != NULL) { write_size += swapout_memcg(memcg, req_size - write_size); if (write_size >= req_size) break; @@ -141,7 +141,7 @@ static unsigned long long get_zram_used_pages(void) struct mem_cgroup *memcg = NULL; unsigned long long zram_pages = 0; - while ((memcg = get_next_memcg(memcg))) + while ((memcg = get_next_memcg(memcg)) != NULL) zram_pages += memcg_data_size(memcg, CACHE_PAGE); return zram_pages; @@ -152,7 +152,7 @@ static unsigned long long get_eswap_used_pages(void) struct mem_cgroup *memcg = NULL; unsigned long long eswap_pages = 0; - while ((memcg = get_next_memcg(memcg))) + while ((memcg = get_next_memcg(memcg)) != NULL) eswap_pages += memcg_data_size(memcg, SWAP_PAGE); return eswap_pages; @@ -163,7 +163,7 @@ static unsigned long long get_zram_pagefault(void) struct mem_cgroup *memcg = NULL; unsigned long long cache_fault = 0; - while ((memcg = get_next_memcg(memcg))) + while ((memcg = get_next_memcg(memcg)) != NULL) cache_fault += memcg_data_size(memcg, CACHE_FAULT); return cache_fault; @@ -236,7 +236,7 @@ static void snapshot_anon_refaults(void) { struct mem_cgroup *memcg = NULL; - while (memcg = get_next_memcg(memcg)) + while ((memcg = get_next_memcg(memcg)) != NULL) memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT); last_anon_pagefault = get_zram_pagefault(); @@ -398,7 +398,7 @@ int get_zram_current_watermark(void) /* page to ratio */ diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total); - return min(zram_wm_ratio, zram_wm_ratio - diff_buffers); + return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers); } bool zram_watermark_ok(void) @@ -572,7 +572,7 @@ static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) struct mem_cgroup *memcg = NULL; unsigned long nr[NR_LRU_LISTS]; - while ((memcg = get_next_memcg(memcg))) { + while ((memcg = get_next_memcg(memcg)) != NULL) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio; @@ -634,7 +634,7 @@ static u64 __calc_nr_to_reclaim(void) reclaim_size = high_buffers - buffers; /* once max reclaim target is max_reclaim_size */ - reclaim_size = min(reclaim_size, max_reclaim_size); + reclaim_size = min(reclaim_size, (u64)max_reclaim_size); /* MB to pages */ return div_u64(reclaim_size * SZ_1M, PAGE_SIZE); diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c index 934eff21f09b..d7ea6a6fe2cb 100644 --- a/mm/zswapd_control.c +++ b/mm/zswapd_control.c @@ -517,50 +517,6 @@ static int memcg_active_app_info_list_show(struct seq_file *m, void *v) return 0; } -static int report_app_info_show(struct seq_file *m, void *v) -{ - struct mem_cgroup_per_node *mz = NULL; - struct mem_cgroup *memcg = NULL; - struct lruvec *lruvec = NULL; - unsigned long eswap_size; - unsigned long zram_size; - unsigned long anon_size; - - while ((memcg = get_next_memcg(memcg))) { - u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); - - mz = mem_cgroup_nodeinfo(memcg, 0); - if (!mz) { - get_next_memcg_break(memcg); - return 0; - } - - lruvec = &mz->lruvec; - if (!lruvec) { - get_next_memcg_break(memcg); - return 0; - } - - anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, - MAX_NR_ZONES) + lruvec_lru_size(lruvec, - LRU_INACTIVE_ANON, MAX_NR_ZONES); - eswap_size = 
memcg_data_size(memcg, SWAP_SIZE); - zram_size = memcg_data_size(memcg, CACHE_SIZE); - - if (anon_size + zram_size + eswap_size == 0) - continue; - - anon_size *= PAGE_SIZE / SZ_1K; - zram_size *= PAGE_SIZE / SZ_1K; - eswap_size *= PAGE_SIZE / SZ_1K; - - seq_printf(m, "%s, %llu, %lu, %lu, %lu\n", - strlen(memcg->name) ? memcg->name : "root", - score, anon_size, zram_size, eswap_size); - } - return 0; -} - #ifdef CONFIG_HYPERHOLD_DEBUG static int avail_buffers_params_show(struct seq_file *m, void *v) { -- Gitee From 9fe67d66c485bfd0bd7beca1234a02e9af50c12a Mon Sep 17 00:00:00 2001 From: roger Date: Sun, 13 Feb 2022 22:45:58 +0800 Subject: [PATCH 017/113] blackbox: Never storing fault logs on empty device. ohos inclusion category: bugfix issue: #I4Q6AR CVE: NA ------------------------------- Avoid saving fault logs without pstore_blk.blkdev's ready. Signed-off-by: roger --- drivers/staging/blackbox/Kconfig | 7 +++++++ fs/pstore/Kconfig | 2 +- fs/pstore/blk.c | 6 ++++++ fs/pstore/internal.h | 3 +++ fs/pstore/platform.c | 3 +++ 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/staging/blackbox/Kconfig b/drivers/staging/blackbox/Kconfig index a3c94f927227..cbfb275c52e0 100644 --- a/drivers/staging/blackbox/Kconfig +++ b/drivers/staging/blackbox/Kconfig @@ -32,6 +32,13 @@ config BLACKBOX_STORAGE_BY_MEMORY panic occurs. It depends on supporting warm reset and disabling erase ddr when warm reset. +config BLACKBOX_USE_PSTORE_BLK_DEBUG + bool "blackbox use pstore blk for debug" + depends on BLACKBOX + default n + help + If Y, this enables pstore blk for blackbox. + config BLACKBOX_STORAGE_BY_PSTORE_BLK tristate "blackbox fault log storage by pstore blk" depends on BLACKBOX diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 25bd47bd03f3..40ec4d3f45e7 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -177,7 +177,7 @@ config PSTORE_BLK tristate "Log panic/oops to a block device" depends on PSTORE depends on BLOCK - depends on BROKEN + depends on BLACKBOX_USE_PSTORE_BLK_DEBUG || BROKEN select PSTORE_ZONE default n help diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 6a768a5dae91..b7a290482840 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -53,6 +53,7 @@ MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); #if IS_ENABLED(CONFIG_PSTORE_BLACKBOX) static long blackbox_size = CONFIG_PSTORE_BLK_BLACKBOX_SIZE; +bool pstore_blk_ready; #else static long blackbox_size = -1; #endif @@ -501,6 +502,11 @@ static int __init pstore_blk_init(void) ret = __register_pstore_blk(&info); mutex_unlock(&pstore_blk_lock); +#if IS_ENABLED(CONFIG_PSTORE_BLACKBOX) + if (best_effort && blkdev[0]) + pstore_blk_ready = true; +#endif + return ret; } late_initcall(pstore_blk_init); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7fb219042f13..dec9d17a5d94 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -50,4 +50,7 @@ extern void pstore_record_init(struct pstore_record *record, int __init pstore_init_fs(void); void __exit pstore_exit_fs(void); +#ifdef CONFIG_PSTORE_BLACKBOX +extern bool pstore_blk_ready; /* flag which pstore_blk is ready */ +#endif #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f9ab44bf0de6..7cb499250c5a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -435,6 +435,9 @@ void pstore_blackbox_dump(struct kmsg_dumper *dumper, const char *why; int ret; + if (!pstore_blk_ready) + return; + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { -- Gitee From 
05c857b8990ed18f8175a3d6cf8179a7d51842cc Mon Sep 17 00:00:00 2001 From: Xi_Yuhao Date: Fri, 11 Feb 2022 12:14:49 +0800 Subject: [PATCH 018/113] binder:fix the error that oneway ipc fails to get accesstoken ohos inclusion category: bugfix issue: #I4T64S CVE: NA --------------------------------------------------------------- The binder driver does not save the ipc context of oneway call. Therefore, when the user mode attempts to obtain the token, the kernel finds that the ipc contet is empty and returns an EFAULT errcode. Signed-off-by: Xi_Yuhao --- drivers/android/binder.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3604f0df6896..c058fe7dcc26 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -555,6 +555,9 @@ struct binder_thread { struct binder_stats stats; atomic_t tmp_ref; bool is_dead; +#ifdef CONFIG_ACCESS_TOKENID + struct access_token tokens; +#endif /* CONFIG_ACCESS_TOKENID */ }; /** @@ -4554,6 +4557,12 @@ static int binder_thread_read(struct binder_proc *proc, if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; +#ifdef CONFIG_ACCESS_TOKENID + binder_inner_proc_lock(thread->proc); + thread->tokens.sender_tokenid = t->sender_tokenid; + thread->tokens.first_tokenid = t->first_tokenid; + binder_inner_proc_unlock(thread->proc); +#endif /* CONFIG_ACCESS_TOKENID */ if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; @@ -5165,14 +5174,8 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto err; } binder_inner_proc_lock(proc); - if (thread->transaction_stack == NULL) { - ret = -EFAULT; - binder_inner_proc_unlock(proc); - goto err; - } - token = thread->transaction_stack->sender_tokenid; - ftoken = thread->transaction_stack->first_tokenid; - + token = thread->tokens.sender_tokenid; + ftoken = thread->tokens.first_tokenid; binder_inner_proc_unlock(proc); if (put_user(token, &tokens->sender_tokenid)) { ret = -EINVAL; -- Gitee From 8ed51910870d8829a352afe36abfbaf74ab1b16e Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Wed, 9 Feb 2022 18:07:22 +0800 Subject: [PATCH 019/113] hievent: enable hievent driver ohos inclusion category: feature issue: #I4PJDP CVE: NA ----------------- hievent driver is used to store and upload kernel event. 
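For context, the reworked write path in hievent_write_iter() below accepts writev() calls with exactly two iovec segments: the check code first, then the raw event payload (the old interface took three segments, with the header split across two of them). The sketch that follows is an illustrative userspace writer, not part of this patch; the check-code value and the payload encoding are placeholders, since the real ones come from hievent userspace headers that this series does not show.

/*
 * Illustrative writer for /dev/bbox. Segment 0 carries the check code
 * (assumed 32-bit here, matching the driver's check_code variable) and
 * segment 1 carries the event payload; any other segment count is
 * rejected by the driver with -EINVAL.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

#define HIEVENT_CHECK_CODE 0x0	/* placeholder: must equal the driver's CHECK_CODE */

int main(void)
{
	int check = HIEVENT_CHECK_CODE;
	const char payload[] = "example kernel event";	/* placeholder payload */
	struct iovec iov[2] = {
		{ .iov_base = &check,          .iov_len = sizeof(check) },
		{ .iov_base = (void *)payload, .iov_len = sizeof(payload) },
	};
	int fd = open("/dev/bbox", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/bbox");
		return 1;
	}
	if (writev(fd, iov, 2) < 0)
		perror("writev");
	close(fd);
	return 0;
}

Note that the per-event pid, tid and timestamp fields are no longer filled in from the write path: hievent_head_init() now records only the payload length and the header size.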
Signed-off-by: zhizhimeimei6 --- drivers/staging/hievent/Kconfig | 13 +- drivers/staging/hievent/Makefile | 3 +- drivers/staging/hievent/hievent_driver.c | 161 +++++++++++------------ drivers/staging/hievent/hievent_driver.h | 2 +- drivers/staging/hievent/hiview_hievent.c | 24 ++-- drivers/staging/hievent/hiview_hievent.h | 2 +- 6 files changed, 104 insertions(+), 101 deletions(-) diff --git a/drivers/staging/hievent/Kconfig b/drivers/staging/hievent/Kconfig index 39da4c041ba9..07834c32ba12 100644 --- a/drivers/staging/hievent/Kconfig +++ b/drivers/staging/hievent/Kconfig @@ -1,4 +1,11 @@ config HIEVENT - tristate "Enable hievent" - help - hievent buffer manager + tristate "Enable hievent" + help + hievent buffer manager + +config BBOX_BUFFER_SIZE + int "bbox buffer size" + depends on HIEVENT + default 2048 + help + Define the default ring buffer size of BBOX \ No newline at end of file diff --git a/drivers/staging/hievent/Makefile b/drivers/staging/hievent/Makefile index 2f67bf647e86..3d3ff445f5c9 100644 --- a/drivers/staging/hievent/Makefile +++ b/drivers/staging/hievent/Makefile @@ -1,2 +1 @@ - -obj-$(CONFIG_HIEVENT) += hievent_driver.o +obj-$(CONFIG_HIEVENT) += hievent_driver.o \ No newline at end of file diff --git a/drivers/staging/hievent/hievent_driver.c b/drivers/staging/hievent/hievent_driver.c index 8331a9632e62..36b0a778e04f 100644 --- a/drivers/staging/hievent/hievent_driver.c +++ b/drivers/staging/hievent/hievent_driver.c @@ -8,13 +8,16 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* */ +#define pr_fmt(fmt) "hievent_driver " fmt + #include "hievent_driver.h" +#include #include #include #include @@ -31,32 +34,22 @@ #include #include -#ifndef HIEVENTDEV_MAJOR -#define HIEVENTDEV_MAJOR 241 -#endif - -#ifndef HIEVENT_NR_DEVS -#define HIEVENT_NR_DEVS 2 -#endif - -static int hievent_major = HIEVENTDEV_MAJOR; +static struct class *hievent_class; +static dev_t hievent_devno; -static struct cdev hievent_cdev; - -#define HIEVENT_BUFFER ((size_t)1024) -#define HIEVENT_DRIVER "/dev/hwlog_exception" +#define HIEVENT_BUFFER ((size_t)CONFIG_BBOX_BUFFER_SIZE) +#define HIEVENT_DRIVER "/dev/bbox" +#define HIEVENT_DEV_NAME "bbox" +#define HIEVENT_DEV_NR 1 struct hievent_entry { unsigned short len; unsigned short header_size; - int pid; - int tid; - int sec; - int nsec; char msg[0]; }; struct hievent_char_device { + struct cdev devm; int flag; struct mutex mtx; /* lock to protect read/write buffer */ unsigned char *buffer; @@ -159,21 +152,12 @@ static ssize_t hievent_read(struct file *file, char __user *user_buf, } hievent_buffer_dec(sizeof(header)); - retval = copy_to_user((unsigned char *)user_buf, - (unsigned char *)&header, - min(count, sizeof(header))); - if (retval < 0) { - retval = -EINVAL; - goto out; - } - retval = hievent_read_ring_buffer((unsigned char *)(user_buf + - sizeof(header)), header.len); + retval = hievent_read_ring_buffer((unsigned char __user *)(user_buf), header.len); if (retval < 0) { retval = -EINVAL; goto out; } - hievent_buffer_dec(header.len); retval = header.len + sizeof(header); @@ -210,26 +194,7 @@ static int hievent_write_ring_head_buffer(const unsigned char *buffer, static void hievent_head_init(struct hievent_entry * const header, size_t len) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0) -#define NANOSEC_PER_MIRCOSEC 1000 - struct timeval now = { 0 }; - - do_gettimeofday(&now); - - header->sec = now.tv_sec; - header->nsec = now.tv_usec * NANOSEC_PER_MIRCOSEC; -#else - struct timespec64 now = { 0 }; - - ktime_get_real_ts64(&now); - - header->sec = now.tv_sec; - header->nsec = now.tv_nsec; -#endif - header->len = (unsigned short)len; - header->pid = current->pid; - header->tid = 0; header->header_size = sizeof(struct hievent_entry); } @@ -265,7 +230,6 @@ int hievent_write_internal(const char *buffer, size_t buf_len) hievent_cover_old_log(buf_len); hievent_head_init(&header, buf_len); - retval = hievent_write_ring_head_buffer((unsigned char *)&header, sizeof(header)); if (retval) { @@ -293,15 +257,17 @@ int hievent_write_internal(const char *buffer, size_t buf_len) return retval; } -static unsigned int hievent_poll(struct file *filep, - struct poll_table_struct *fds) +static unsigned int hievent_poll(struct file *filep, poll_table *wait) { - (void)filep; - (void)fds; + unsigned int mask = 0; - wait_event_interruptible(hievent_dev.wq, (hievent_dev.size > 0)); + poll_wait(filep, &hievent_dev.wq, wait); + if (hievent_dev.size > 0) { + mask |= POLLIN | POLLRDNORM; + return mask; + } - return (POLLOUT | POLLWRNORM); + return 0; } static ssize_t hievent_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -310,14 +276,16 @@ static ssize_t hievent_write_iter(struct kiocb *iocb, struct iov_iter *from) unsigned char *temp_buffer = NULL; const struct iovec *iov = from->iov; int retval; - int buf_len; - + size_t buf_len; (void)iocb; - if (from->nr_segs != 3) { /* must contain 3 segments */ + + if (from->nr_segs != 2) { /* must contain 2 segments */ + pr_err("invalid nr_segs: %ld", from->nr_segs); retval = -EINVAL; goto out; } + /* seg 0 info is checkcode*/ 
retval = copy_from_user(&check_code, iov[0].iov_base, sizeof(check_code)); if (retval || check_code != CHECK_CODE) { @@ -325,8 +293,8 @@ static ssize_t hievent_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - /* seg 1 && 2 is head info */ - buf_len = iov[1].iov_len + iov[2].iov_len; + /* seg 1 info */ + buf_len = iov[1].iov_len; if (buf_len > HIEVENT_BUFFER - sizeof(struct hievent_entry)) { retval = -ENOMEM; goto out; @@ -344,20 +312,11 @@ static ssize_t hievent_write_iter(struct kiocb *iocb, struct iov_iter *from) goto free_mem; } - /* 1 2 head info */ - retval = copy_from_user(temp_buffer + iov[1].iov_len, iov[2].iov_base, - iov[2].iov_len); - if (retval) { - retval = -EIO; - goto free_mem; - } - retval = hievent_write_internal(temp_buffer, buf_len); - if (retval) { + if (retval < 0) { retval = -EIO; goto free_mem; } - retval = buf_len + iov[0].iov_len; free_mem: @@ -373,11 +332,11 @@ static const struct file_operations hievent_fops = { .write_iter = hievent_write_iter, /* write_iter */ }; -static void hievent_device_init(void) +static int hievent_device_init(void) { hievent_dev.buffer = kmalloc(HIEVENT_BUFFER, GFP_KERNEL); if (!hievent_dev.buffer) - return; + return -ENOMEM; init_waitqueue_head(&hievent_dev.wq); mutex_init(&hievent_dev.mtx); @@ -385,31 +344,69 @@ static void hievent_device_init(void) hievent_dev.head_offset = 0; hievent_dev.size = 0; hievent_dev.count = 0; + + return 0; } static int __init hieventdev_init(void) { int result; - dev_t devno = MKDEV(hievent_major, 0); + struct device *dev_ret = NULL; - result = register_chrdev_region(devno, 2, "hwlog_exception"); - if (result < 0) - return result; + result = alloc_chrdev_region(&hievent_devno, 0, HIEVENT_DEV_NR, HIEVENT_DEV_NAME); + if (result < 0) { + pr_err("register %s failed", HIEVENT_DRIVER); + return -ENODEV; + } + + cdev_init(&hievent_dev.devm, &hievent_fops); + hievent_dev.devm.owner = THIS_MODULE; + + result = cdev_add(&hievent_dev.devm, hievent_devno, HIEVENT_DEV_NR); + if (result < 0) { + pr_err("cdev_add failed"); + goto unreg_dev; + } + + result = hievent_device_init(); + if (result < 0) { + pr_err("hievent_device_init failed"); + goto del_dev; + } - cdev_init(&hievent_cdev, &hievent_fops); - hievent_cdev.owner = THIS_MODULE; - hievent_cdev.ops = &hievent_fops; + hievent_class = class_create(THIS_MODULE, HIEVENT_DEV_NAME); + if (IS_ERR(hievent_class)) { + pr_err("class_create failed"); + goto del_buffer; + } - cdev_add(&hievent_cdev, MKDEV(hievent_major, 0), HIEVENT_NR_DEVS); + dev_ret = device_create(hievent_class, 0, hievent_devno, 0, HIEVENT_DEV_NAME); + if (IS_ERR(dev_ret)) { + pr_err("device_create failed"); + goto del_class; + } - hievent_device_init(); return 0; + +del_class: + class_destroy(hievent_class); +del_buffer: + kfree(hievent_dev.buffer); +del_dev: + cdev_del(&hievent_dev.devm); +unreg_dev: + unregister_chrdev_region(hievent_devno, HIEVENT_DEV_NR); + + return -ENODEV; } static void __exit hievent_exit_module(void) { - cdev_del(&hievent_cdev); - unregister_chrdev_region(MKDEV(hievent_major, 0), HIEVENT_NR_DEVS); + device_destroy(hievent_class, hievent_devno); + class_destroy(hievent_class); + kfree(hievent_dev.buffer); + cdev_del(&hievent_dev.devm); + unregister_chrdev_region(hievent_devno, HIEVENT_DEV_NR); } static int __init hievent_init_module(void) diff --git a/drivers/staging/hievent/hievent_driver.h b/drivers/staging/hievent/hievent_driver.h index 56eb0c5ea316..5d52982b78f6 100644 --- a/drivers/staging/hievent/hievent_driver.h +++ 
b/drivers/staging/hievent/hievent_driver.h @@ -8,7 +8,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ diff --git a/drivers/staging/hievent/hiview_hievent.c b/drivers/staging/hievent/hiview_hievent.c index 552965dfacb4..c72e6f2bb401 100644 --- a/drivers/staging/hievent/hiview_hievent.c +++ b/drivers/staging/hievent/hiview_hievent.c @@ -8,7 +8,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ @@ -24,19 +24,19 @@ #define MAX_PATH_LEN 256 #define MAX_STR_LEN (10 * 1024) -/* 64K is max length of /dev/hwlog_exception */ -#define EVENT_INFO_BUF_LEN (64 * 1024) -#define EVENT_INFO_PACK_BUF_LEN (2 * 1024) +/* CONFIG_BBOX_BUFFER_SIZE is max length of /dev/bbox */ +#define EVENT_INFO_BUF_LEN ((size_t)CONFIG_BBOX_BUFFER_SIZE) +#define EVENT_INFO_PACK_BUF_LEN min((size_t)CONFIG_BBOX_BUFFER_SIZE, 2048) -#define BUF_POINTER_FORWARD \ +#define BUF_POINTER_FORWARD \ do { \ - if (tmplen < len) { \ - tmp += tmplen; \ - len -= tmplen; \ - } else { \ - tmp += len; \ - len = 0; \ - } \ + if (tmplen < len) { \ + tmp += tmplen; \ + len -= tmplen; \ + } else { \ + tmp += len; \ + len = 0; \ + } \ } while (0) struct hievent_payload { diff --git a/drivers/staging/hievent/hiview_hievent.h b/drivers/staging/hievent/hiview_hievent.h index 48d4ae32b536..358a3e8fed4e 100644 --- a/drivers/staging/hievent/hiview_hievent.h +++ b/drivers/staging/hievent/hiview_hievent.h @@ -8,7 +8,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ -- Gitee From 532b0cace208a1d6c4ddce5554497cea1dc1ec1a Mon Sep 17 00:00:00 2001 From: CY Fan Date: Mon, 14 Feb 2022 17:48:06 +0800 Subject: [PATCH 020/113] hyperhold: fix printf missing parameter problem ohos inclusion category: bugfix issue: #I4TMR7 CVE: NA ----------------- This patch fixes the printf missing parameter problem in move_obj_to_hpio() and move_obj_from_hpio(). 
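The root cause is a format string carrying one more conversion specifier than the argument list supplies, which is exactly what gcc's -Wformat reports at build time because printk() and the pr_*() wrappers are annotated with __printf(). Below is a hedged sketch of how a local logging helper would opt into the same checking; wbgrp_log() is an illustrative name, not something this patch adds.

/*
 * Sketch only: annotating a printf-style wrapper with __printf() so that
 * a mismatch such as six specifiers against five arguments is flagged at
 * compile time instead of printing an undefined value at runtime.
 */
#include <linux/kernel.h>
#include <linux/printk.h>

static __printf(1, 2) void wbgrp_log(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vprintk(fmt, args);	/* hand off to the regular kernel log */
	va_end(args);
}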
Signed-off-by: CY Fan --- drivers/block/zram/zram_group/group_writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c index f1b2550c94ff..9ccbadbf4020 100644 --- a/drivers/block/zram/zram_group/group_writeback.c +++ b/drivers/block/zram/zram_group/group_writeback.c @@ -139,7 +139,7 @@ static u32 move_obj_to_hpio(struct zram *zram, u32 index, u16 gid, wbgrp_obj_stats_inc(zram->zgrp, gid, eid, size); zgrp_obj_stats_dec(zram->zgrp, gid, size); pr_info("move obj %u of group %u to hpio %p of eid %u, size = %u, offset = %u\n", - index, gid, hpio, eid, size); + index, gid, hpio, eid, size, offset); unlock: zram_slot_unlock(zram, index); @@ -188,7 +188,7 @@ static void move_obj_from_hpio(struct zram *zram, int index, struct hpio *hpio) wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); zgrp_obj_stats_inc(zram->zgrp, gid, size); pr_info("move obj %u of group %u from hpio %p of eid %u, size = %u, offset = %u\n", - index, gid, hpio, eid, size); + index, gid, hpio, eid, size, offset); unlock: zram_slot_unlock(zram, index); } -- Gitee From 4dfc09877bb1a04df68f33cd22ace154965e3d47 Mon Sep 17 00:00:00 2001 From: wangke Date: Tue, 18 Jan 2022 21:32:47 +0800 Subject: [PATCH 021/113] hmdfs: Indroduce local file mapping to hmdfs file to hmdfs ohos inclusion category: feature issue: #I4T7LO CVE: NA ---------------------------------------------- mapping local file to hmdfs file for cross-device file access Signed-off-by: wangke --- fs/hmdfs/file_local.c | 221 +++++++++++++++++++++++++++++++++++ fs/hmdfs/file_merge.c | 46 ++++++++ fs/hmdfs/hmdfs.h | 51 ++++++++ fs/hmdfs/hmdfs_device_view.h | 11 +- fs/hmdfs/hmdfs_server.c | 97 ++++++++++++++- fs/hmdfs/hmdfs_trace.h | 2 + fs/hmdfs/inode_local.c | 124 +++++++++++++++++++- fs/hmdfs/inode_remote.c | 15 +++ fs/hmdfs/main.c | 9 ++ 9 files changed, 569 insertions(+), 7 deletions(-) diff --git a/fs/hmdfs/file_local.c b/fs/hmdfs/file_local.c index 893c6edbc93b..bef62b2c04f3 100644 --- a/fs/hmdfs/file_local.c +++ b/fs/hmdfs/file_local.c @@ -237,6 +237,217 @@ static int hmdfs_dir_release_local(struct inode *inode, struct file *file) return 0; } +static inline bool hmdfs_is_dst_path(struct path *src, struct path *dst) +{ + return (src->dentry == dst->dentry) && (src->mnt == dst->mnt); +} + +bool hmdfs_is_share_file(struct file *file) +{ + struct file *cur_file = file; + struct hmdfs_dentry_info *gdi; + struct hmdfs_file_info *gfi; + + while (cur_file->f_inode->i_sb->s_magic == HMDFS_SUPER_MAGIC) { + gdi = hmdfs_d(cur_file->f_path.dentry); + gfi = hmdfs_f(cur_file); + if (hm_isshare(gdi->file_type)) + return true; + if (gfi->lower_file) + cur_file = gfi->lower_file; + else + break; + } + + return false; +} + +bool hmdfs_is_share_item_still_valid(struct hmdfs_share_item *item) +{ + if (kref_read(&item->ref) == 1 && time_after(jiffies, item->timeout)) + return false; + + return true; +} + +inline void release_share_item(struct hmdfs_share_item *item) +{ + kfree(item->relative_path.name); + fput(item->file); + kfree(item); +} + +void hmdfs_remove_share_item(struct kref *ref) +{ + struct hmdfs_share_item *item = + container_of(ref, struct hmdfs_share_item, ref); + + list_del(&item->list); + release_share_item(item); +} + +struct hmdfs_share_item *hmdfs_lookup_share_item(struct hmdfs_share_table *st, + struct qstr *cur_relative_path) +{ + struct hmdfs_share_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, &st->item_list_head, list) { + if 
(hmdfs_is_share_item_still_valid(item)) { + if (qstr_eq(&item->relative_path, cur_relative_path)) + return item; + } else { + kref_put(&item->ref, hmdfs_remove_share_item); + st->item_cnt--; + } + } + + return NULL; +} + +inline void set_item_timeout(struct hmdfs_share_item *item) +{ + item->timeout = jiffies + HZ * HMDFS_SHARE_ITEM_TIMEOUT_S; +} + +static int hmdfs_insert_share_item(struct hmdfs_share_table *st, + struct qstr *relative_path, struct file *file, char *cid) +{ + struct hmdfs_share_item *new_item = NULL; + int ret = 0; + + if (st->item_cnt >= st->max_cnt) { + ret = -EMFILE; + goto out; + } + + new_item = kmalloc(sizeof(*new_item), GFP_KERNEL); + if (new_item) { + new_item->file = file; + get_file(file); + new_item->relative_path = *relative_path; + memcpy(new_item->cid, cid, HMDFS_CID_SIZE); + kref_init(&new_item->ref); + list_add_tail(&new_item->list, &st->item_list_head); + set_item_timeout(new_item); + st->item_cnt++; + } else { + ret = -ENOMEM; + } + +out: + return ret; +} + +static int hmdfs_update_share_item(struct hmdfs_share_item *item, + struct file *file, char *cid) +{ + /* if not the same file, we need to update struct file */ + if (!hmdfs_is_dst_path(&file->f_path, &item->file->f_path)) { + fput(item->file); + item->file = file; + get_file(file); + } + memcpy(item->cid, cid, HMDFS_CID_SIZE); + set_item_timeout(item); + + return 0; +} + +static int hmdfs_add_to_share_table(struct file *file, + struct hmdfs_sb_info *sbi, struct hmdfs_share_control *sc) +{ + struct fd src = fdget(sc->src_fd); + struct hmdfs_share_table *st = &sbi->share_table; + struct hmdfs_share_item *item; + struct dentry *dentry; + const char *dir_path, *cur_path; + struct qstr relative_path; + int err = 0; + + if (!src.file) + return -EBADF; + + if (!S_ISREG(src.file->f_inode->i_mode)) { + err = -EPERM; + goto err_out; + } + + if (hmdfs_is_share_file(src.file)) { + err = -EPERM; + goto err_out; + } + + dir_path = hmdfs_get_dentry_relative_path(file->f_path.dentry); + if (unlikely(!dir_path)) { + err = -ENOMEM; + goto err_out; + } + + dentry = src.file->f_path.dentry; + if (dentry->d_name.len > NAME_MAX) { + kfree(dir_path); + err = -ENAMETOOLONG; + goto err_out; + } + + cur_path = hmdfs_connect_path(dir_path, dentry->d_name.name); + if (unlikely(!cur_path)) { + kfree(dir_path); + err = -ENOMEM; + goto err_out; + } + relative_path.name = cur_path; + relative_path.len = strlen(cur_path); + + spin_lock(&sbi->share_table.item_list_lock); + item = hmdfs_lookup_share_item(st, &relative_path); + if (!item) + err = hmdfs_insert_share_item(st, &relative_path, + src.file, sc->cid); + else { + if (kref_read(&item->ref) != 1) + err = -EEXIST; + else + hmdfs_update_share_item(item, src.file, sc->cid); + } + spin_unlock(&sbi->share_table.item_list_lock); + + if (err < 0) + kfree(cur_path); + kfree(dir_path); + +err_out: + fdput(src); + return err; +} + +static int hmdfs_ioc_set_share_path(struct file *file, unsigned long arg) +{ + struct hmdfs_share_control sc; + struct super_block *sb = file->f_inode->i_sb; + struct hmdfs_sb_info *sbi = hmdfs_sb(sb); + int error; + + if (copy_from_user(&sc, (struct hmdfs_share_control __user *)arg, + sizeof(sc))) + return -EFAULT; + + error = hmdfs_add_to_share_table(file, sbi, &sc); + + return error; +} + +static long hmdfs_dir_ioctl_local(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case HMDFS_IOC_SET_SHARE_PATH: + return hmdfs_ioc_set_share_path(file, arg); + default: + return -ENOTTY; + } +} + const struct file_operations 
hmdfs_dir_ops_local = { .owner = THIS_MODULE, .iterate = hmdfs_iterate_local, @@ -244,3 +455,13 @@ const struct file_operations hmdfs_dir_ops_local = { .release = hmdfs_dir_release_local, .fsync = hmdfs_fsync_local, }; + +const struct file_operations hmdfs_dir_ops_share = { + .owner = THIS_MODULE, + .iterate = hmdfs_iterate_local, + .open = hmdfs_dir_open_local, + .release = hmdfs_dir_release_local, + .fsync = hmdfs_fsync_local, + .unlocked_ioctl = hmdfs_dir_ioctl_local, + .compat_ioctl = hmdfs_dir_ioctl_local, +}; diff --git a/fs/hmdfs/file_merge.c b/fs/hmdfs/file_merge.c index 2708f2ba24af..237bb9e806d9 100644 --- a/fs/hmdfs/file_merge.c +++ b/fs/hmdfs/file_merge.c @@ -454,11 +454,57 @@ int hmdfs_dir_release_merge(struct inode *inode, struct file *file) return 0; } +long hmdfs_dir_unlocked_ioctl_merge(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct hmdfs_file_info *fi_head = hmdfs_f(file); + struct hmdfs_file_info *fi_iter = NULL; + struct hmdfs_file_info *fi_temp = NULL; + struct file *lower_file = NULL; + int error = -ENOTTY; + + mutex_lock(&fi_head->comrade_list_lock); + list_for_each_entry_safe(fi_iter, fi_temp, &(fi_head->comrade_list), + comrade_list) { + if (fi_iter->device_id == 0) { + lower_file = fi_iter->lower_file; + error = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + break; + } + } + mutex_unlock(&fi_head->comrade_list_lock); + return error; +} + +long hmdfs_dir_compat_ioctl_merge(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct hmdfs_file_info *fi_head = hmdfs_f(file); + struct hmdfs_file_info *fi_iter = NULL; + struct hmdfs_file_info *fi_temp = NULL; + struct file *lower_file = NULL; + int error = -ENOTTY; + + mutex_lock(&fi_head->comrade_list_lock); + list_for_each_entry_safe(fi_iter, fi_temp, &(fi_head->comrade_list), + comrade_list) { + if (fi_iter->device_id == 0) { + lower_file = fi_iter->lower_file; + error = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + break; + } + } + mutex_unlock(&fi_head->comrade_list_lock); + return error; +} + const struct file_operations hmdfs_dir_fops_merge = { .owner = THIS_MODULE, .iterate = hmdfs_iterate_merge, .open = hmdfs_dir_open_merge, .release = hmdfs_dir_release_merge, + .unlocked_ioctl = hmdfs_dir_unlocked_ioctl_merge, + .compat_ioctl = hmdfs_dir_compat_ioctl_merge, }; int hmdfs_file_open_merge(struct inode *inode, struct file *file) diff --git a/fs/hmdfs/hmdfs.h b/fs/hmdfs/hmdfs.h index 9b5d456e1217..4228bb64c43e 100644 --- a/fs/hmdfs/hmdfs.h +++ b/fs/hmdfs/hmdfs.h @@ -56,6 +56,15 @@ #define HMDFS_READPAGES_NR_MAX 32 +#define HMDFS_SHARE_ITEM_TIMEOUT_S 60 +#define HMDFS_SHARE_ITEMS_MAX 4 + +#define HMDFS_IOC 0xf2 +#define HMDFS_IOC_SET_SHARE_PATH _IOW(HMDFS_IOC, 1, \ + struct hmdfs_share_control) + +#define HMDFS_CID_SIZE 64 + enum { HMDFS_FEATURE_READPAGES = 1ULL << 0, HMDFS_FEATURE_READPAGES_OPEN = 1ULL << 1, @@ -90,6 +99,22 @@ struct hmdfs_syncfs_info { spinlock_t list_lock; }; +struct hmdfs_share_item { + struct file *file; + struct qstr relative_path; + char cid[HMDFS_CID_SIZE]; + unsigned long timeout; + struct kref ref; + struct list_head list; +}; + +struct hmdfs_share_table { + struct list_head item_list_head; + spinlock_t item_list_lock; + int item_cnt; + int max_cnt; +}; + struct hmdfs_sb_info { /* list for all registered superblocks */ struct list_head list; @@ -178,6 +203,9 @@ struct hmdfs_sb_info { /* dentry cache */ bool s_dentry_cache; + /* share table */ + struct hmdfs_share_table share_table; + /* msgs that are waiting for remote */ struct 
list_head async_readdir_msg_list; /* protect async_readdir_msg_list */ @@ -195,6 +223,11 @@ struct hmdfs_sb_info { unsigned int user_id; }; +struct hmdfs_share_control { + __u32 src_fd; + char cid[HMDFS_CID_SIZE]; +}; + static inline struct hmdfs_sb_info *hmdfs_sb(struct super_block *sb) { return sb->s_fs_info; @@ -290,6 +323,24 @@ static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2) return q1->len == q2->len && str_n_case_eq(q1->name, q2->name, q2->len); } +static inline bool qstr_eq(const struct qstr *q1, const struct qstr *q2) +{ + return q1->len == q2->len && !strncmp(q1->name, q2->name, q2->len); +} + +bool hmdfs_is_share_file(struct file *file); + +bool hmdfs_is_share_item_still_valid(struct hmdfs_share_item *item); + +inline void release_share_item(struct hmdfs_share_item *item); + +void hmdfs_remove_share_item(struct kref *ref); + +struct hmdfs_share_item *hmdfs_lookup_share_item(struct hmdfs_share_table *st, + struct qstr *cur_relative_path); + +inline void set_item_timeout(struct hmdfs_share_item *item); + /***************************************************************************** * log print helpers *****************************************************************************/ diff --git a/fs/hmdfs/hmdfs_device_view.h b/fs/hmdfs/hmdfs_device_view.h index dcc49fb89597..fc77ef9ebcbd 100644 --- a/fs/hmdfs/hmdfs_device_view.h +++ b/fs/hmdfs/hmdfs_device_view.h @@ -29,6 +29,7 @@ enum HMDFS_FILE_TYPE { HM_REG = 0, HM_SYMLINK = 1, + HM_SHARE = 2, HM_MAX_FILE_TYPE = 0XFF }; @@ -87,7 +88,9 @@ extern const struct inode_operations hmdfs_file_iops_local; extern const struct file_operations hmdfs_file_fops_local; extern const struct inode_operations hmdfs_dir_inode_ops_local; extern const struct file_operations hmdfs_dir_ops_local; +extern const struct file_operations hmdfs_dir_ops_share; extern const struct inode_operations hmdfs_symlink_iops_local; +extern const struct inode_operations hmdfs_dir_inode_ops_share; /* remote device operation */ extern const struct inode_operations hmdfs_dev_file_iops_remote; @@ -147,6 +150,12 @@ static inline bool hm_islnk(uint8_t file_type) { return (file_type == HM_SYMLINK); } + +static inline bool hm_isshare(uint8_t file_type) +{ + return (file_type == HM_SHARE); +} + struct inode *fill_inode_remote(struct super_block *sb, struct hmdfs_peer *con, struct hmdfs_lookup_ret *lookup_result, struct inode *dir); @@ -155,7 +164,7 @@ struct hmdfs_lookup_ret *get_remote_inode_info(struct hmdfs_peer *con, unsigned int flags); void hmdfs_set_time(struct dentry *dentry, unsigned long time); struct inode *fill_inode_local(struct super_block *sb, - struct inode *lower_inode); + struct inode *lower_inode, const char *name); struct inode *fill_root_inode(struct super_block *sb, struct inode *lower_inode); struct inode *fill_device_inode(struct super_block *sb, diff --git a/fs/hmdfs/hmdfs_server.c b/fs/hmdfs/hmdfs_server.c index 4da2fc70c1a5..dd4508b80e8e 100644 --- a/fs/hmdfs/hmdfs_server.c +++ b/fs/hmdfs/hmdfs_server.c @@ -133,6 +133,28 @@ struct file *hmdfs_open_path(struct hmdfs_sb_info *sbi, const char *path) return file; } +inline bool is_dst_device(char *src_cid, char *dst_cid) +{ + return strncmp(src_cid, dst_cid, HMDFS_CID_SIZE) == 0 ? 
true : false; +} + +void hmdfs_clear_share_item_offline(struct hmdfs_peer *conn) +{ + struct hmdfs_sb_info *sbi = conn->sbi; + struct hmdfs_share_item *item, *tmp; + + spin_lock(&sbi->share_table.item_list_lock); + list_for_each_entry_safe(item, tmp, &sbi->share_table.item_list_head, + list) { + if (is_dst_device(item->cid, conn->cid)) { + list_del(&item->list); + release_share_item(item); + sbi->share_table.item_cnt--; + } + } + spin_unlock(&sbi->share_table.item_list_lock); +} + inline void hmdfs_close_path(struct file *file) { fput(file); @@ -163,6 +185,8 @@ void hmdfs_server_offline_notify(struct hmdfs_peer *conn, int evt, cond_resched(); } + hmdfs_clear_share_item_offline(conn); + /* Reinitialize idr */ next = idr_get_cursor(idr); idr_destroy(idr); @@ -288,11 +312,31 @@ static int check_sec_level(struct hmdfs_peer *node, const char *file_name) return ret; } +static int hmdfs_check_share_access_permission(struct hmdfs_sb_info *sbi, + const char *filename, char *cid, struct hmdfs_share_item **item) +{ + struct qstr candidate = QSTR_INIT(filename, strlen(filename)); + int ret = -ENOENT; + + spin_lock(&sbi->share_table.item_list_lock); + *item = hmdfs_lookup_share_item(&sbi->share_table, &candidate); + if (*item && is_dst_device((*item)->cid, cid)) { + spin_unlock(&sbi->share_table.item_list_lock); + return 0; + } else + *item = NULL; + spin_unlock(&sbi->share_table.item_list_lock); + + return ret; +} + static struct file *hmdfs_open_file(struct hmdfs_peer *con, const char *filename, uint8_t file_type, int *file_id) { struct file *file = NULL; + struct hmdfs_share_item *item = NULL; + int err = 0; int id; if (!filename) { @@ -307,8 +351,15 @@ static struct file *hmdfs_open_file(struct hmdfs_peer *con, if (hm_islnk(file_type)) file = hmdfs_open_photokit_path(con->sbi, filename); - else + else { + if (hm_isshare(file_type)) { + err = hmdfs_check_share_access_permission(con->sbi, + filename, con->cid, &item); + if (err) + return ERR_PTR(err); + } file = hmdfs_open_path(con->sbi, filename); + } if (IS_ERR(file)) return file; @@ -320,6 +371,10 @@ static struct file *hmdfs_open_file(struct hmdfs_peer *con, } *file_id = id; + /* get item to avoid timeout */ + if (item) + kref_get(&item->ref); + return file; } @@ -715,6 +770,42 @@ void hmdfs_server_atomic_open(struct hmdfs_peer *con, kfree(resp); } +void hmdfs_close_share_item(struct hmdfs_sb_info *sbi, struct file *file, + char *cid) +{ + struct qstr relativepath; + const char *path_name; + struct hmdfs_share_item *item = NULL; + + path_name = hmdfs_get_dentry_relative_path(file->f_path.dentry); + if (unlikely(!path_name)) { + hmdfs_err("get dentry relative path error"); + return; + } + + relativepath.name = path_name; + relativepath.len = strlen(path_name); + + item = hmdfs_lookup_share_item(&sbi->share_table, &relativepath); + + if (item) { + if (unlikely(!is_dst_device(item->cid, cid))) { + hmdfs_err("item not right"); + goto err_out; + } + + if (unlikely(kref_read(&item->ref) == 1)) + hmdfs_err("item ref error"); + + set_item_timeout(item); + kref_put(&item->ref, hmdfs_remove_share_item); + } else + hmdfs_err("cannot get share item %s", relativepath.name); + +err_out: + kfree(path_name); +} + void hmdfs_server_release(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, void *data) { @@ -732,6 +823,10 @@ void hmdfs_server_release(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, ret = PTR_ERR(file); goto out; } + + if (hmdfs_is_share_file(file)) + hmdfs_close_share_item(con->sbi, file, con->cid); + /* put the reference acquired by 
get_file_by_fid_and_ver() */ hmdfs_close_path(file); hmdfs_info("close %u", file_id); diff --git a/fs/hmdfs/hmdfs_trace.h b/fs/hmdfs/hmdfs_trace.h index 205bf697c357..51ecdb9abbc4 100644 --- a/fs/hmdfs/hmdfs_trace.h +++ b/fs/hmdfs/hmdfs_trace.h @@ -202,6 +202,8 @@ define_hmdfs_lookup_op_end_event(hmdfs_mkdir_merge); define_hmdfs_lookup_op_end_event(hmdfs_rmdir_merge); define_hmdfs_lookup_op_end_event(hmdfs_create_merge); +define_hmdfs_lookup_op_end_event(hmdfs_lookup_share); +define_hmdfs_lookup_op_end_event(hmdfs_lookup_share_end); define_hmdfs_lookup_op_end_event(hmdfs_symlink_merge); define_hmdfs_lookup_op_end_event(hmdfs_symlink_local); diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index c302d320de48..cfbe67fe98db 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -66,8 +66,25 @@ static inline void set_symlink_flag(struct hmdfs_dentry_info *gdi) gdi->file_type = HM_SYMLINK; } +static inline void set_sharefile_flag(struct hmdfs_dentry_info *gdi) +{ + gdi->file_type = HM_SHARE; +} + +static inline void check_and_fixup_share_ops(struct inode *inode, + const char *name) +{ + const char *share_dir = ".share"; + + if (S_ISDIR(inode->i_mode) && + !strncmp(name, share_dir, strlen(share_dir))) { + inode->i_op = &hmdfs_dir_inode_ops_share; + inode->i_fop = &hmdfs_dir_ops_share; + } +} + struct inode *fill_inode_local(struct super_block *sb, - struct inode *lower_inode) + struct inode *lower_inode, const char *name) { struct inode *inode; struct hmdfs_inode_info *info; @@ -125,6 +142,7 @@ struct inode *fill_inode_local(struct super_block *sb, } fsstack_copy_inode_size(inode, lower_inode); + check_and_fixup_share_ops(inode, name); unlock_new_inode(inode); return inode; } @@ -250,7 +268,8 @@ struct dentry *hmdfs_lookup_local(struct inode *parent_inode, } else if (!err) { hmdfs_set_lower_path(child_dentry, &lower_path); child_inode = fill_inode_local(parent_inode->i_sb, - d_inode(lower_path.dentry)); + d_inode(lower_path.dentry), + child_dentry->d_name.name); if (S_ISLNK(d_inode(lower_path.dentry)->i_mode)) set_symlink_flag(gdi); if (IS_ERR(child_inode)) { @@ -337,7 +356,7 @@ int hmdfs_mkdir_local_dentry(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_HMDFS_FS_PERMISSION error = hmdfs_persist_perm(lower_dentry, &child_perm); #endif - child_inode = fill_inode_local(sb, lower_inode); + child_inode = fill_inode_local(sb, lower_inode, dentry->d_name.name); if (IS_ERR(child_inode)) { error = PTR_ERR(child_inode); goto out; @@ -425,7 +444,7 @@ int hmdfs_create_local_dentry(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_HMDFS_FS_PERMISSION error = hmdfs_persist_perm(lower_dentry, &child_perm); #endif - child_inode = fill_inode_local(sb, lower_inode); + child_inode = fill_inode_local(sb, lower_inode, dentry->d_name.name); if (IS_ERR(child_inode)) { error = PTR_ERR(child_inode); goto out_created; @@ -767,7 +786,8 @@ int hmdfs_symlink_local(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_HMDFS_FS_PERMISSION err = hmdfs_persist_perm(lower_dentry, &child_perm); #endif - child_inode = fill_inode_local(dir->i_sb, d_inode(lower_dentry)); + child_inode = fill_inode_local(dir->i_sb, d_inode(lower_dentry), + dentry->d_name.name); if (IS_ERR(child_inode)) { err = PTR_ERR(child_inode); goto out_err; @@ -932,6 +952,95 @@ static ssize_t hmdfs_local_listxattr(struct dentry *dentry, char *list, return res; } +int hmdfs_get_path_from_share_table(struct hmdfs_sb_info *sbi, + struct dentry *cur_dentry, struct path *src_path) +{ + struct hmdfs_share_item *item; + const 
char *path_name; + struct qstr relative_path; + int err = 0; + + path_name = hmdfs_get_dentry_relative_path(cur_dentry); + if (unlikely(!path_name)) { + err = -ENOMEM; + goto err_out; + } + relative_path.name = path_name; + relative_path.len = strlen(path_name); + + spin_lock(&sbi->share_table.item_list_lock); + item = hmdfs_lookup_share_item(&sbi->share_table, &relative_path); + if (!item) { + spin_unlock(&sbi->share_table.item_list_lock); + err = -ENOENT; + goto err_out; + } + *src_path = item->file->f_path; + path_get(src_path); + + kfree(path_name); + spin_unlock(&sbi->share_table.item_list_lock); +err_out: + return err; +} + +struct dentry *hmdfs_lookup_share(struct inode *parent_inode, + struct dentry *child_dentry, unsigned int flags) +{ + const struct qstr *d_name = &child_dentry->d_name; + int err = 0; + struct dentry *ret = NULL; + struct hmdfs_sb_info *sbi = hmdfs_sb(child_dentry->d_sb); + struct path src_path; + struct inode *child_inode = NULL; + + trace_hmdfs_lookup_share(parent_inode, child_dentry, flags); + if (d_name->len > NAME_MAX) { + ret = ERR_PTR(-ENAMETOOLONG); + goto err_out; + } + + err = init_hmdfs_dentry_info(sbi, child_dentry, HMDFS_LAYER_OTHER_LOCAL); + if (err) { + ret = ERR_PTR(err); + goto err_out; + } + + err = hmdfs_get_path_from_share_table(sbi, child_dentry, &src_path); + if (err) { + ret = ERR_PTR(err); + goto err_out; + } + + hmdfs_set_lower_path(child_dentry, &src_path); + child_inode = fill_inode_local(parent_inode->i_sb, + d_inode(src_path.dentry), d_name->name); + + set_sharefile_flag(hmdfs_d(child_dentry)); + + if (IS_ERR(child_inode)) { + err = PTR_ERR(child_inode); + ret = ERR_PTR(err); + hmdfs_put_reset_lower_path(child_dentry); + goto err_out; + } + ret = d_splice_alias(child_inode, child_dentry); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + hmdfs_put_reset_lower_path(child_dentry); + goto err_out; + } + + check_and_fixup_ownership(parent_inode, child_inode, + src_path.dentry, d_name->name); + +err_out: + if (!err) + hmdfs_set_time(child_dentry, jiffies); + trace_hmdfs_lookup_share_end(parent_inode, child_dentry, err); + return ret; +} + const struct inode_operations hmdfs_symlink_iops_local = { .get_link = hmdfs_get_link_local, .permission = hmdfs_permission, @@ -951,6 +1060,11 @@ const struct inode_operations hmdfs_dir_inode_ops_local = { .getattr = hmdfs_getattr_local, }; +const struct inode_operations hmdfs_dir_inode_ops_share = { + .lookup = hmdfs_lookup_share, + .permission = hmdfs_permission, +}; + const struct inode_operations hmdfs_file_iops_local = { .setattr = hmdfs_setattr_local, .getattr = hmdfs_getattr_local, diff --git a/fs/hmdfs/inode_remote.c b/fs/hmdfs/inode_remote.c index 98a0e34c2253..78f04bdc4813 100644 --- a/fs/hmdfs/inode_remote.c +++ b/fs/hmdfs/inode_remote.c @@ -392,6 +392,19 @@ struct inode *fill_inode_remote(struct super_block *sb, struct hmdfs_peer *con, return inode; } +static bool in_share_dir(struct dentry *child_dentry) +{ + struct dentry *parent_dentry = dget_parent(child_dentry); + bool ret = false; + const char *share_dir = ".share"; + + if (!strncmp(parent_dentry->d_name.name, share_dir, strlen(share_dir))) + ret = true; + + dput(parent_dentry); + return ret; +} + static struct dentry *hmdfs_lookup_remote_dentry(struct inode *parent_inode, struct dentry *child_dentry, int flags) @@ -436,6 +449,8 @@ static struct dentry *hmdfs_lookup_remote_dentry(struct inode *parent_inode, if (lookup_result != NULL) { if (S_ISLNK(lookup_result->i_mode)) gdi->file_type = HM_SYMLINK; + if (in_share_dir(child_dentry)) + 
gdi->file_type = HM_SHARE; inode = fill_inode_remote(sb, con, lookup_result, parent_inode); ret = d_splice_alias(inode, child_dentry); if (!IS_ERR_OR_NULL(ret)) diff --git a/fs/hmdfs/main.c b/fs/hmdfs/main.c index efc952a36afd..a490d069d239 100644 --- a/fs/hmdfs/main.c +++ b/fs/hmdfs/main.c @@ -661,6 +661,14 @@ static void hmdfs_init_cmd_timeout(struct hmdfs_sb_info *sbi) set_cmd_timeout(sbi, F_LISTXATTR, TIMEOUT_COMMON); } +static void init_share_table(struct hmdfs_sb_info *sbi) +{ + spin_lock_init(&sbi->share_table.item_list_lock); + INIT_LIST_HEAD(&sbi->share_table.item_list_head); + sbi->share_table.item_cnt = 0; + sbi->share_table.max_cnt = HMDFS_SHARE_ITEMS_MAX; +} + static int hmdfs_init_sbi(struct hmdfs_sb_info *sbi) { int ret; @@ -711,6 +719,7 @@ static int hmdfs_init_sbi(struct hmdfs_sb_info *sbi) mutex_init(&sbi->connections.node_lock); INIT_LIST_HEAD(&sbi->connections.node_list); + init_share_table(sbi); init_waitqueue_head(&sbi->async_readdir_wq); INIT_LIST_HEAD(&sbi->async_readdir_msg_list); INIT_LIST_HEAD(&sbi->async_readdir_work_list); -- Gitee From a7d930276ab54387d83097b341caed404aeb514f Mon Sep 17 00:00:00 2001 From: waterwin Date: Mon, 14 Feb 2022 21:25:35 +0800 Subject: [PATCH 022/113] hmdfs: Close lower file permission fixup ohos inclusion category: feature issue: #I4TNK7 CVE: NA ---------------------------------------------- hmdfs manage file permission, fixup it when permission is not what we do not want, close this. Signed-off-by: qianjiaxing --- fs/hmdfs/authority/authentication.c | 52 +---------------------------- fs/hmdfs/authority/authentication.h | 6 ++-- fs/hmdfs/inode_local.c | 7 ++-- 3 files changed, 5 insertions(+), 60 deletions(-) diff --git a/fs/hmdfs/authority/authentication.c b/fs/hmdfs/authority/authentication.c index d56ac1490bb3..4a18580cb646 100644 --- a/fs/hmdfs/authority/authentication.c +++ b/fs/hmdfs/authority/authentication.c @@ -314,30 +314,6 @@ static __u16 __inherit_perm_file(struct inode *parent) return perm; } -static void fixup_ownership_user_group(struct inode *child, struct dentry *lower_dentry, - uid_t uid, gid_t gid) -{ - int err; - struct iattr newattrs; - - newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_FORCE; - newattrs.ia_uid = KUIDT_INIT(uid); - newattrs.ia_gid = KGIDT_INIT(gid); - if (!S_ISDIR(d_inode(lower_dentry)->i_mode)) - newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - - inode_lock(d_inode(lower_dentry)); - err = notify_change(lower_dentry, &newattrs, NULL); - inode_unlock(d_inode(lower_dentry)); - - if (!err) { - child->i_uid = KUIDT_INIT(uid); - child->i_gid = KGIDT_INIT(gid); - } else { - hmdfs_err("update PKG uid failed, err = %d", err); - } -} - __u16 hmdfs_perm_inherit(struct inode *parent_inode, struct inode *child) { __u16 perm; @@ -349,38 +325,12 @@ __u16 hmdfs_perm_inherit(struct inode *parent_inode, struct inode *child) return perm; } -void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, - struct dentry *lower_dentry, const char *name) +void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child) { - int bid; struct hmdfs_inode_info *info = hmdfs_i(child); if (info->perm == HMDFS_ALL_MASK) info->perm = hmdfs_perm_inherit(parent_inode, child); - - switch (info->perm & HMDFS_DIR_TYPE_MASK) { - case HMDFS_DIR_PKG: - bid = get_bundle_uid(hmdfs_sb(parent_inode->i_sb), name); - if (bid != child->i_uid.val || bid != child->i_gid.val) - fixup_ownership_user_group(child, lower_dentry, bid, - bid); - - break; - case HMDFS_DIR_DATA: - case 
HMDFS_FILE_PKG_SUB: - case HMDFS_DIR_PKG_SUB: - case HMDFS_DIR_DEFAULT: - case HMDFS_FILE_DEFAULT: - case HMDFS_DIR_PUBLIC: - if (parent_inode->i_uid.val != child->i_uid.val || - parent_inode->i_gid.val != child->i_gid.val) - fixup_ownership_user_group(child, lower_dentry, - parent_inode->i_uid.val, - parent_inode->i_gid.val); - break; - default: - break; - } } void check_and_fixup_ownership_remote(struct inode *dir, diff --git a/fs/hmdfs/authority/authentication.h b/fs/hmdfs/authority/authentication.h index af6eec9a4897..402a063429e5 100644 --- a/fs/hmdfs/authority/authentication.h +++ b/fs/hmdfs/authority/authentication.h @@ -238,8 +238,7 @@ void hmdfs_revert_fsids(const struct cred *old_cred); int hmdfs_persist_perm(struct dentry *dentry, __u16 *perm); __u16 hmdfs_read_perm(struct inode *inode); void hmdfs_root_inode_perm_init(struct inode *root_inode); -void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, - struct dentry *lower_dentry, const char *name); +void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child); int hmdfs_override_dir_id_fs(struct cache_fs_override *or, struct inode *dir, struct dentry *dentry, @@ -283,8 +282,7 @@ void hmdfs_revert_dir_id_fs(struct cache_fs_override *or) } static inline -void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child, - struct dentry *lower_dentry, const char *name) +void check_and_fixup_ownership(struct inode *parent_inode, struct inode *child) { } diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index cfbe67fe98db..84904f50d27d 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -285,9 +285,7 @@ struct dentry *hmdfs_lookup_local(struct inode *parent_inode, goto out_err; } - check_and_fixup_ownership(parent_inode, child_inode, - lower_path.dentry, - child_dentry->d_name.name); + check_and_fixup_ownership(parent_inode, child_inode); goto out_err; } /* @@ -1031,8 +1029,7 @@ struct dentry *hmdfs_lookup_share(struct inode *parent_inode, goto err_out; } - check_and_fixup_ownership(parent_inode, child_inode, - src_path.dentry, d_name->name); + check_and_fixup_ownership(parent_inode, child_inode); err_out: if (!err) -- Gitee From 9c11e8fbcd5886f59856f0c2764380b3007b0f9c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Mar 2021 17:57:06 +0800 Subject: [PATCH 023/113] f2fs: introduce gc_merge mount option mainline inclusion from v5.13-rc1 commit 5911d2d1d1a38b26585383478bd71d9254e48bdf category:feature issue: #I4TEGS CVE:N/A -------------------------------- In this patch, we will add two new mount options: "gc_merge" and "nogc_merge", when background_gc is on, "gc_merge" option can be set to let background GC thread to handle foreground GC requests, it can eliminate the sluggish issue caused by slow foreground GC operation when GC is triggered from a process with limited I/O and CPU resources. Original idea is from Xiang. 
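As the Documentation hunk below states, gc_merge takes effect when background_gc is on: a caller entering f2fs_balance_fs() then waits on the new fggc_wq and wakes the background GC thread instead of doing foreground GC itself. A minimal usage sketch through mount(2) follows; the device and mount-point paths are placeholders, not anything referenced by this patch.

/*
 * Illustrative only: enable background GC together with gc_merge via the
 * f2fs mount option string.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt/f2fs", "f2fs", 0,
		  "background_gc=on,gc_merge") != 0) {
		perror("mount f2fs");
		return 1;
	}
	return 0;
}

The remount and fill_super hunks keep the GC thread running whenever gc_merge is set, so the waiters in f2fs_balance_fs() always have a background thread to hand the work to.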
Signed-off-by: Wang Xiaojun Signed-off-by: Chao Yu --- Documentation/filesystems/f2fs.rst | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 26 ++++++++++++++++++++++---- fs/f2fs/gc.h | 6 ++++++ fs/f2fs/segment.c | 15 +++++++++++++-- fs/f2fs/super.c | 19 +++++++++++++++++-- 6 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 8c0fbdd8ce6f..cb18f7c1bea3 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -110,6 +110,12 @@ background_gc=%s Turn on/off cleaning operations, namely garbage on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. +gc_merge When background_gc is on, this option can be enabled to + let background GC thread to handle foreground GC requests, + it can eliminate the sluggish issue caused by slow foreground + GC operation when GC is triggered from a process with limited + I/O and CPU resources. +nogc_merge Disable GC merge feature. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2d7799bd30b1..b2d734438aff 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -99,6 +99,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72f227f6ebad..cb3e7808d0e4 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,19 +31,24 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode; + bool sync_mode, foreground = false; wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -90,7 +95,10 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + down_write(&sbi->gc_lock); + goto do_gc; + } else if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } @@ -107,14 +115,22 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; + if (foreground) + wake_up_all(&gc_th->fggc_wq); + trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); @@ -148,6 +164,7 @@ int 
f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { @@ -165,6 +182,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 0c8dae12dc51..3fe145e8e594 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -42,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d04b449978aa..ecbfb63809f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -510,8 +510,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + down_write(&sbi->gc_lock); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); + } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index de543168b370..36b8d03ded95 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -147,6 +147,8 @@ enum { Opt_compress_log_size, Opt_compress_extension, Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, Opt_err, }; @@ -215,6 +217,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, {Opt_err, NULL}, }; @@ -944,6 +948,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_atgc: set_opt(sbi, ATGC); break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1536,6 +1546,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) @@ -1902,7 +1915,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * option. Also sync the filesystem. */ if ((*flags & SB_RDONLY) || - F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -3872,7 +3886,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. 
*/ - if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) -- Gitee From f7733970e7dba8cc0bcf1693c27c6a98d25e4d39 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Wed, 9 Oct 2019 10:49:18 +0800 Subject: [PATCH 024/113] f2fs: introduce a policy to optimize discard ohos inclusion category:feature issue: #I4TEGS CVE:N/A -------------------------------- This patch introduces a new discard policy. This policy adjusts the sending frequency and discard granularity based on the current disk fragmentation and I/O. This avoids interference with foreground I/Os and prolongs the component lifespan. Signed-off-by: Wang Xiaojun --- fs/f2fs/f2fs.h | 38 +++++++++- fs/f2fs/segment.c | 189 +++++++++++++++++++++++++++++++--------------- fs/f2fs/sysfs.c | 66 ++++++++++++++++ 3 files changed, 232 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b2d734438aff..c7724661dd5e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -276,11 +276,17 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +#define DISCARD_GRAN_BL 16 +#define DISCARD_GRAN_BG 512 +#define DISCARD_GRAN_FORCE 1 /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) +#define FS_FREE_SPACE_PERCENT 20 +#define DEVICE_FREE_SPACE_PERCENT 10 +#define HUNDRED_PERCENT 100 enum { D_PREP, /* initial */ @@ -319,24 +325,37 @@ struct discard_cmd { enum { DPOLICY_BG, + DPOLICY_BALANCE, DPOLICY_FORCE, DPOLICY_FSTRIM, DPOLICY_UMOUNT, MAX_DPOLICY, }; +enum { + SUB_POLICY_BIG, + SUB_POLICY_MID, + SUB_POLICY_SMALL, + NR_SUB_POLICY, +}; + +struct discard_sub_policy { + unsigned int max_requests; + int interval; +}; + struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ - unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ + struct discard_sub_policy sub_policy[NR_SUB_POLICY]; }; struct discard_cmd_control { @@ -358,6 +377,7 @@ struct discard_cmd_control { atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ + int discard_type; /* discard type */ }; /* for the list of fsync inodes, used only during recovery */ @@ -3050,6 +3070,18 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, f2fs_record_iostat(sbi); } +static inline block_t fs_free_space_threshold(struct f2fs_sb_info *sbi) +{ + return (block_t)(SM_I(sbi)->main_segments * sbi->blocks_per_seg * + FS_FREE_SPACE_PERCENT) / HUNDRED_PERCENT; +} + +static inline block_t device_free_space_threshold(struct f2fs_sb_info *sbi) +{ + return (block_t)(SM_I(sbi)->main_segments * sbi->blocks_per_seg * + DEVICE_FREE_SPACE_PERCENT) / HUNDRED_PERCENT; +} + 
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) @@ -3259,6 +3291,10 @@ void f2fs_destroy_node_manager_caches(void); /* * segment.c */ +unsigned long find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset); +unsigned long find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset); bool f2fs_need_SSR(struct f2fs_sb_info *sbi); void f2fs_register_inmem_page(struct inode *inode, struct page *page); void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ecbfb63809f9..3fafcc0c5f7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -30,6 +30,24 @@ static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; +static struct discard_policy dpolicys[MAX_DPOLICY] = { + {DPOLICY_BG, 0, DEF_MID_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_ISSUE_TIME, + MAX_PLIST_NUM, false, true, false, false, DISCARD_GRAN_BG, + {{1, 0}, {0, 0}, {0, 0}}}, + {DPOLICY_BALANCE, 0, DEF_MID_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_ISSUE_TIME, + MAX_PLIST_NUM - 1, true, true, false, false, DISCARD_GRAN_BL, + {{1, 0}, {2, 50}, {0, 0}}}, + {DPOLICY_FORCE, 0, DEF_MID_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_ISSUE_TIME, + MAX_PLIST_NUM - 1, true, true, false, false, DISCARD_GRAN_FORCE, + {{1, 0}, {2, 50}, {4, 2000}}}, + {DPOLICY_FSTRIM, 0, DEF_MID_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_ISSUE_TIME, + MAX_PLIST_NUM, false, true, false, false, DISCARD_GRAN_FORCE, + {{8, 0}, {8, 0}, {8, 0}}}, + {DPOLICY_UMOUNT, 0, DEF_MID_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_ISSUE_TIME, + MAX_PLIST_NUM, false, true, false, false, DISCARD_GRAN_BG, + {{UINT_MAX, 0}, {0, 0}, {0, 0}}} +}; + static unsigned long __reverse_ulong(unsigned char *str) { unsigned long tmp = 0; @@ -93,7 +111,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) * f2fs_set_bit(0, bitmap) => 1000 0000 * f2fs_set_bit(7, bitmap) => 0000 0001 */ -static unsigned long __find_rev_next_bit(const unsigned long *addr, +unsigned long find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); @@ -129,7 +147,7 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, return result - size + __reverse_ffs(tmp); } -static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, +unsigned long find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); @@ -1109,7 +1127,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, else size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); - offset = __find_rev_next_bit(map, size, offset); + offset = find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); blk = START_BLOCK(sbi, segno + 1); } @@ -1117,43 +1135,41 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, } static void __init_discard_policy(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, + struct discard_policy *policy, int discard_type, unsigned int granularity) { - /* common policy */ - dpolicy->type = discard_type; - dpolicy->sync = true; - dpolicy->ordered = false; - dpolicy->granularity = granularity; - - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->timeout = false; + struct discard_cmd_control *dcc = 
SM_I(sbi)->dcc_info; if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; - dpolicy->sync = false; - dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; - } + *policy = dpolicys[DPOLICY_BG]; + } else if (discard_type == DPOLICY_BALANCE) { + *policy = dpolicys[DPOLICY_BALANCE]; } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = false; + *policy = dpolicys[DPOLICY_FORCE]; } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->io_aware = false; + *policy = dpolicys[DPOLICY_FSTRIM]; + if (policy->granularity != granularity) + policy->granularity = granularity; } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->io_aware = false; - /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; - dpolicy->timeout = true; + *policy = dpolicys[DPOLICY_UMOUNT]; } + dcc->discard_type = discard_type; +} + +static void select_sub_discard_policy(struct discard_sub_policy **spolicy, + int index, struct discard_policy *dpolicy) +{ + if (dpolicy->type == DPOLICY_FSTRIM) { + *spolicy = &dpolicy->sub_policy[SUB_POLICY_BIG]; + return; + } + + if ((index + 1) >= DISCARD_GRAN_BG) + *spolicy = &dpolicy->sub_policy[SUB_POLICY_BIG]; + else if ((index + 1) >= DISCARD_GRAN_BL) + *spolicy = &dpolicy->sub_policy[SUB_POLICY_MID]; + else + *spolicy = &dpolicy->sub_policy[SUB_POLICY_SMALL]; } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, @@ -1162,6 +1178,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, + int spolicy_index, struct discard_cmd *dc, unsigned int *issued) { @@ -1173,9 +1190,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); int flag = dpolicy->sync ? 
REQ_SYNC : 0; + struct discard_sub_policy *spolicy = NULL; block_t lstart, start, len, total_len; int err = 0; + select_sub_discard_policy(&spolicy, spolicy_index, dpolicy); + if (dc->state != D_PREP) return 0; @@ -1191,7 +1211,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len = 0; - while (total_len && *issued < dpolicy->max_requests && !err) { + while (total_len && *issued < spolicy->max_requests && !err) { struct bio *bio = NULL; unsigned long flags; bool last = true; @@ -1202,7 +1222,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, } (*issued)++; - if (*issued == dpolicy->max_requests) + if (*issued == spolicy->max_requests) last = true; dc->len += len; @@ -1449,7 +1469,8 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy) + struct discard_policy *dpolicy, + int spolicy_index) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; @@ -1459,8 +1480,11 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, unsigned int pos = dcc->next_pos; unsigned int issued = 0; bool io_interrupted = false; + struct discard_sub_policy *spolicy = NULL; + select_sub_discard_policy(&spolicy, spolicy_index, dpolicy); mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, pos, (struct rb_entry **)&prev_dc, @@ -1484,9 +1508,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, spolicy_index, dc, &issued); - if (issued >= dpolicy->max_requests) + if (issued >= spolicy->max_requests) break; next: node = rb_next(&dc->rb_node); @@ -1519,11 +1543,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct blk_plug plug; int i, issued; bool io_interrupted = false; + struct discard_sub_policy *spolicy = NULL; if (dpolicy->timeout) f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); + /* only do this check in CHECK_FS, may be time consumed */ + if (unlikely(dcc->rbtree_check)) { + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root, false)); + mutex_unlock(&dcc->cmd_lock); + } retry: + blk_start_plug(&plug); issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (dpolicy->timeout && @@ -1533,8 +1565,13 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) - return __issue_discard_cmd_orderly(sbi, dpolicy); + select_sub_discard_policy(&spolicy, i, dpolicy); + + if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) { + issued = __issue_discard_cmd_orderly(sbi, dpolicy, i); + blk_finish_plug(&plug); + return issued; + } pend_list = &dcc->pend_list[i]; @@ -1544,7 +1581,6 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root, false)); - blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1555,22 +1591,24 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; - break; + goto skip; } - - __submit_discard_cmd(sbi, dpolicy, dc, &issued); - - if (issued >= 
dpolicy->max_requests) + __submit_discard_cmd(sbi, dpolicy, i, dc, &issued); +skip: + if (issued >= spolicy->max_requests) break; } - blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); - if (issued >= dpolicy->max_requests || io_interrupted) + if (issued >= spolicy->max_requests || io_interrupted) break; } + blk_finish_plug(&plug); + if (spolicy) + dpolicy->min_interval = spolicy->interval; + if (dpolicy->type == DPOLICY_UMOUNT && issued) { __wait_all_discard_cmd(sbi, dpolicy); goto retry; @@ -1731,8 +1769,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; - __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, - dcc->discard_granularity); + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, 0); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1743,6 +1780,29 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) return dropped; } +static int select_discard_type(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + block_t user_block_count = sbi->user_block_count; + block_t ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + block_t fs_available_blocks = user_block_count - + valid_user_blocks(sbi) + ovp_count; + int discard_type; + + if (fs_available_blocks >= fs_free_space_threshold(sbi) && + fs_available_blocks - dcc->undiscard_blks >= + device_free_space_threshold(sbi)) { + discard_type = DPOLICY_BG; + } else if (fs_available_blocks < fs_free_space_threshold(sbi) && + fs_available_blocks - dcc->undiscard_blks < + device_free_space_threshold(sbi)) { + discard_type = DPOLICY_FORCE; + } else { + discard_type = DPOLICY_BALANCE; + } + return discard_type; +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -1750,13 +1810,13 @@ static int issue_discard_thread(void *data) wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; - int issued; + int issued, discard_type; set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + discard_type = select_discard_type(sbi); + __init_discard_policy(sbi, &dpolicy, discard_type, 0); wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1782,7 +1842,7 @@ static int issue_discard_thread(void *data) } if (sbi->gc_mode == GC_URGENT_HIGH) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 0); sb_start_intwrite(sbi->sb); @@ -1927,11 +1987,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { - start = __find_rev_next_bit(dmap, max_blocks, end + 1); + start = find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; - end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + end = find_rev_next_zero_bit(dmap, max_blocks, start + 1); if (force && start && end != max_blocks && (end - start) < cpc->trim_minlen) continue; @@ -2099,7 +2159,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->discard_granularity = DISCARD_GRAN_BG; INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); @@ -2642,7 +2702,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, for (i = 0; i < 
entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + pos = find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); seg->next_blkoff = pos; } @@ -2673,7 +2733,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); + pos = find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); return pos < sbi->blocks_per_seg; } @@ -3014,8 +3074,17 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; + struct discard_sub_policy *spolicy = NULL; int issued; unsigned int trimmed = 0; + /* fstrim each time 8 discard without no interrupt */ + select_sub_discard_policy(&spolicy, 0, dpolicy); + + if (dcc->rbtree_check) { + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root, false)); + mutex_unlock(&dcc->cmd_lock); + } next: issued = 0; @@ -3047,9 +3116,9 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, goto skip; } - err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, 0, dc, &issued); - if (issued >= dpolicy->max_requests) { + if (issued >= spolicy->max_requests) { start = dc->lstart + dc->len; if (err) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b8850c81068a..f2eb96c2fd37 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -542,6 +542,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_type, discard_type); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -631,6 +632,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), + ATTR_LIST(discard_type), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -908,6 +910,66 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int undiscard_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sit_i = SIT_I(sbi); + unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + unsigned int total = 0; + unsigned int i, j; + + if (!f2fs_realtime_discard_enable(sbi)) + goto out; + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + unsigned int entries = SIT_VBLOCK_MAP_SIZE / + sizeof(unsigned long); + unsigned int max_blocks = sbi->blocks_per_seg; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; + unsigned long *dmap = SIT_I(sbi)->tmp_map; + int start = 0, end = -1; + + down_write(&sit_i->sentry_lock); + if (se->valid_blocks == max_blocks) { + up_write(&sit_i->sentry_lock); + continue; + 
} + + if (se->valid_blocks == 0) { + mutex_lock(&dirty_i->seglist_lock); + if (test_bit((int)i, dirty_i->dirty_segmap[PRE])) + total += 512; + mutex_unlock(&dirty_i->seglist_lock); + } else { + for (j = 0; j < entries; j++) + dmap[j] = ~ckpt_map[j] & ~discard_map[j]; + while (1) { + start = (int)find_rev_next_bit(dmap, + (unsigned long)max_blocks, + (unsigned long)(end + 1)); + + if ((unsigned int)start >= max_blocks) + break; + + end = (int)find_rev_next_zero_bit(dmap, + (unsigned long)max_blocks, + (unsigned long)(start + 1)); + total += (unsigned int)(end - start); + } + } + + up_write(&sit_i->sentry_lock); + } + +out: + seq_printf(seq, "total undiscard:%u K\n", total * 4); + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -964,6 +1026,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) iostat_info_seq_show, sb); proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("undiscard_info", S_IRUGO, sbi->s_proc, + undiscard_info_seq_show, sb); + } return 0; } @@ -975,6 +1040,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("undiscard_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); -- Gitee From ed88bae9cf7caee4fff8650f6bc8af01b7d31570 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Thu, 21 Nov 2019 09:51:15 +0800 Subject: [PATCH 025/113] f2fs: add f2fs grading ssr feature ohos inclusion category:feature issue: #I4TEGS CVE:N/A -------------------------------- In order to reduce the fragmentation segment and improve the IO performance, we can use: the SSR for small file, the LFS for big file. Since the small file will use the SSR, the fragmentation segment will be reduced, there will be more sequence segment for big file. So the IO performance will be improved. Signed-off-by: Wang Xiaojun --- fs/f2fs/Kconfig | 7 +++ fs/f2fs/data.c | 12 ++-- fs/f2fs/f2fs.h | 20 ++++++- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 107 +++++++++++++++++++++++++++++++++--- fs/f2fs/segment.h | 31 ++++++++++- fs/f2fs/super.c | 46 +++++++++++++++- fs/f2fs/sysfs.c | 42 +++++++++++++- include/trace/events/f2fs.h | 33 +++++++++++ 9 files changed, 282 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index d13c5c6a9787..3dfc4f60de0c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -137,3 +137,10 @@ config F2FS_FS_LZORLE default y help Support LZO-RLE compress algorithm, if unsure, say Y. 
+ +config F2FS_GRADING_SSR + bool "F2FS grading ssr" + depends on F2FS_FS + default y + help + use grading ssr to improve the end performance diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1b11a42847c4..f54de04e6b87 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1390,7 +1390,7 @@ struct page *f2fs_get_new_data_page(struct inode *inode, return page; } -static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type, int contig_level) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; @@ -1417,7 +1417,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL); + &sum, seg_type, NULL, contig_level); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); @@ -1511,6 +1511,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct extent_info ei = {0,0,0}; block_t blkaddr; unsigned int start_pgofs; + int contig_level = SEQ_NONE; +#ifdef CONFIG_F2FS_GRADING_SSR + contig_level = check_io_seq(maxblocks); +#endif if (!maxblocks) return 0; @@ -1594,7 +1598,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, /* use out-place-update for driect IO under LFS mode */ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) { - err = __allocate_data_block(&dn, map->m_seg_type); + err = __allocate_data_block(&dn, map->m_seg_type, contig_level); if (err) goto sync_out; blkaddr = dn.data_blkaddr; @@ -1615,7 +1619,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, - map->m_seg_type); + map->m_seg_type, contig_level); if (!err) set_inode_flag(inode, FI_APPEND_WRITE); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7724661dd5e..41222b59e596 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1363,6 +1363,20 @@ struct decompress_io_ctx { #define MAX_COMPRESS_LOG_SIZE 8 #define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) +#ifdef CONFIG_F2FS_GRADING_SSR +struct f2fs_hot_cold_params { + unsigned int enable; + unsigned int hot_data_lower_limit; + unsigned int hot_data_waterline; + unsigned int warm_data_lower_limit; + unsigned int warm_data_waterline; + unsigned int hot_node_lower_limit; + unsigned int hot_node_waterline; + unsigned int warm_node_lower_limit; + unsigned int warm_node_waterline; +}; +#endif + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1569,6 +1583,10 @@ struct f2fs_sb_info { struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ #endif + +#ifdef CONFIG_F2FS_GRADING_SSR + struct f2fs_hot_cold_params hot_cold_params; +#endif }; struct f2fs_private_dio { @@ -3352,7 +3370,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio); + struct f2fs_io_info *fio, int contig_level); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool 
ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index cb3e7808d0e4..a981e466cc7d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1238,7 +1238,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, type, NULL); + &sum, type, NULL, SEQ_NONE); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3fafcc0c5f7e..24d22c2954b5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -201,6 +201,75 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } +#ifdef CONFIG_F2FS_GRADING_SSR +static bool need_ssr_by_type(struct f2fs_sb_info *sbi, int type, int contig_level) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + u64 valid_blocks = sbi->total_valid_block_count; + u64 total_blocks = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + u64 left_space = (total_blocks - valid_blocks) << 2; + unsigned int free_segs = free_segments(sbi); + unsigned int ovp_segments = overprovision_segments(sbi); + unsigned int lower_limit = 0; + unsigned int waterline = 0; + int dirty_sum = node_secs + 2 * dent_secs + imeta_secs; + + if (sbi->hot_cold_params.enable == GRADING_SSR_OFF) + return f2fs_need_SSR(sbi); + if (f2fs_lfs_mode(sbi)) + return false; + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + if (contig_level == SEQ_256BLKS && type == CURSEG_WARM_DATA && + free_sections(sbi) > dirty_sum + 3 * reserved_sections(sbi) / 2) + return false; + if (free_sections(sbi) <= (unsigned int)(dirty_sum + 2 * reserved_sections(sbi))) + return true; + if (contig_level >= SEQ_32BLKS || total_blocks <= SSR_MIN_BLKS_LIMIT) + return false; + + left_space -= ovp_segments * KBS_PER_SEGMENT; + if (unlikely(left_space == 0)) + return false; + + switch (type) { + case CURSEG_HOT_DATA: + lower_limit = sbi->hot_cold_params.hot_data_lower_limit; + waterline = sbi->hot_cold_params.hot_data_waterline; + break; + case CURSEG_WARM_DATA: + lower_limit = sbi->hot_cold_params.warm_data_lower_limit; + waterline = sbi->hot_cold_params.warm_data_waterline; + break; + case CURSEG_HOT_NODE: + lower_limit = sbi->hot_cold_params.hot_node_lower_limit; + waterline = sbi->hot_cold_params.hot_node_waterline; + break; + case CURSEG_WARM_NODE: + lower_limit = sbi->hot_cold_params.warm_node_lower_limit; + waterline = sbi->hot_cold_params.warm_node_waterline; + break; + default: + return false; + } + + if (left_space > lower_limit) + return false; + + if (div_u64((free_segs - ovp_segments) * 100, (left_space / KBS_PER_SEGMENT)) + <= waterline) { + trace_f2fs_grading_ssr_allocate( + (le64_to_cpu(sbi->raw_super->block_count) - sbi->total_valid_block_count), + free_segments(sbi), contig_level); + return true; + } else { + return false; + } +} +#endif + void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; @@ -2940,7 +3009,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, * This function should be returned with success, otherwise BUG */ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) + int type, bool force, int contig_level) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2953,8 
+3022,12 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); +#ifdef CONFIG_F2FS_GRADING_SSR + else if (need_ssr_by_type(sbi, type, contig_level) && get_ssr_segment(sbi, type, SSR, 0)) +#else else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) +#endif change_curseg(sbi, type, true); else new_curseg(sbi, type, false); @@ -3012,7 +3085,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true, SEQ_NONE); locate_dirty_segment(sbi, old_segno); } @@ -3412,13 +3485,17 @@ static int __get_segment_type(struct f2fs_io_info *fio) void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio) + struct f2fs_io_info *fio, int contig_level) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; +#ifdef CONFIG_F2FS_GRADING_SSR + struct inode *inode = NULL; +#endif + int contig = SEQ_NONE; down_read(&SM_I(sbi)->curseg_lock); @@ -3465,11 +3542,25 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - sit_i->s_ops->allocate_segment(sbi, type, false); + } else { +#ifdef CONFIG_F2FS_GRADING_SSR + if (contig_level != SEQ_NONE) { + contig = contig_level; + goto allocate_label; + } + + if (page && page->mapping && page->mapping != NODE_MAPPING(sbi) && + page->mapping != META_MAPPING(sbi)) { + inode = page->mapping->host; + contig = check_io_seq(get_dirty_pages(inode)); + } +allocate_label: +#endif + sit_i->s_ops->allocate_segment(sbi, type, false, contig); + } } /* * segment dirty status should be updated after segment allocation, @@ -3536,7 +3627,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio); + &fio->new_blkaddr, sum, type, fio, SEQ_NONE); if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); @@ -4905,7 +4996,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - allocate_segment_by_default(sbi, type, true); + allocate_segment_by_default(sbi, type, true, SEQ_NONE); /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 1bf33fc27b8f..fa18a6b6fc4c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -130,7 +130,18 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) +#ifdef CONFIG_F2FS_GRADING_SSR +#define KBS_PER_SEGMENT 2048 +#define SSR_MIN_BLKS_LIMIT (16 << 18) /* 16G */ +#define 
SSR_CONTIG_DIRTY_NUMS 32 /* Dirty pages for LFS alloction in grading ssr. */ +#define SSR_CONTIG_LARGE 256 /* Larege files */ +#endif +enum { + SEQ_NONE, + SEQ_32BLKS, + SEQ_256BLKS +}; /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. @@ -180,6 +191,13 @@ enum { FORCE_FG_GC, }; +#ifdef CONFIG_F2FS_GRADING_SSR +enum { + GRADING_SSR_OFF = 0, + GRADING_SSR_ON +}; +#endif + /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ @@ -221,7 +239,7 @@ struct sec_entry { }; struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); + void (*allocate_segment)(struct f2fs_sb_info *, int, bool, int); }; #define MAX_SKIP_GC_COUNT 16 @@ -913,3 +931,14 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) dcc->discard_wake = 1; wake_up_interruptible_all(&dcc->discard_wait_queue); } + +#ifdef CONFIG_F2FS_GRADING_SSR +static inline int check_io_seq(int blks) +{ + if (blks >= SSR_CONTIG_LARGE) + return SEQ_256BLKS; + if (blks >= SSR_CONTIG_DIRTY_NUMS) + return SEQ_32BLKS; + return SEQ_NONE; +} +#endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 36b8d03ded95..b305e024d46f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -36,6 +36,19 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_F2FS_GRADING_SSR +#define SSR_DEFALT_SPACE_LIMIT (5<<20) /* 5G default space limit */ +#define SSR_DEFALT_WATERLINE 80 /* 80% default waterline */ +#define SSR_HN_SAPCE_LIMIT_128G (8<<20) /* 8G default sapce limit for 128G devices */ +#define SSR_HN_WATERLINE_128G 80 /* 80% default hot node waterline for 128G devices */ +#define SSR_WN_SAPCE_LIMIT_128G (5<<20) /* 5G default warm node sapce limit for 128G devices */ +#define SSR_WN_WATERLINE_128G 70 /* 70% default warm node waterline for 128G devices */ +#define SSR_HD_SAPCE_LIMIT_128G (8<<20) /* 8G default hot data sapce limit for 128G devices */ +#define SSR_HD_WATERLINE_128G 65 /* 65% default hot data waterline for 128G devices */ +#define SSR_WD_SAPCE_LIMIT_128G (5<<20) /* 5G default warm data sapce limit for 128G devices */ +#define SSR_WD_WATERLINE_128G 60 /* 60% default warm data waterline for 128G devices */ +#endif + static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -3503,6 +3516,35 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sbi->readdir_ra = 1; } +#ifdef CONFIG_F2FS_GRADING_SSR +static void f2fs_init_grading_ssr(struct f2fs_sb_info *sbi) +{ + u32 total_blocks = le64_to_cpu(sbi->raw_super->block_count) >> 18; + + if (total_blocks > 64) { /* 64G */ + sbi->hot_cold_params.hot_data_lower_limit = SSR_HD_SAPCE_LIMIT_128G; + sbi->hot_cold_params.hot_data_waterline = SSR_HD_WATERLINE_128G; + sbi->hot_cold_params.warm_data_lower_limit = SSR_WD_SAPCE_LIMIT_128G; + sbi->hot_cold_params.warm_data_waterline = SSR_WD_WATERLINE_128G; + sbi->hot_cold_params.hot_node_lower_limit = SSR_HD_SAPCE_LIMIT_128G; + sbi->hot_cold_params.hot_node_waterline = SSR_HN_WATERLINE_128G; + sbi->hot_cold_params.warm_node_lower_limit = SSR_WN_SAPCE_LIMIT_128G; + sbi->hot_cold_params.warm_node_waterline = SSR_WN_WATERLINE_128G; + sbi->hot_cold_params.enable = GRADING_SSR_OFF; + } else { + sbi->hot_cold_params.hot_data_lower_limit = SSR_DEFALT_SPACE_LIMIT; + sbi->hot_cold_params.hot_data_waterline = SSR_DEFALT_WATERLINE; + sbi->hot_cold_params.warm_data_lower_limit = SSR_DEFALT_SPACE_LIMIT; + sbi->hot_cold_params.warm_data_waterline = 
SSR_DEFALT_WATERLINE; + sbi->hot_cold_params.hot_node_lower_limit = SSR_DEFALT_SPACE_LIMIT; + sbi->hot_cold_params.hot_node_waterline = SSR_DEFALT_WATERLINE; + sbi->hot_cold_params.warm_node_lower_limit = SSR_DEFALT_SPACE_LIMIT; + sbi->hot_cold_params.warm_node_waterline = SSR_DEFALT_WATERLINE; + sbi->hot_cold_params.enable = GRADING_SSR_OFF; + } +} +#endif + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -3795,7 +3837,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; goto free_node_inode; } - +#ifdef CONFIG_F2FS_GRADING_SSR + f2fs_init_grading_ssr(sbi); +#endif err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f2eb96c2fd37..c90280c3168f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -34,6 +34,9 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_GRADING_SSR + F2FS_HOT_COLD_PARAMS, /* struct f2fs_hot_cold_params */ +#endif }; struct f2fs_attr { @@ -61,6 +64,10 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_GRADING_SSR + else if (struct_type == F2FS_HOT_COLD_PARAMS) + return (unsigned char *)&sbi->hot_cold_params; +#endif #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) @@ -569,6 +576,26 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +#ifdef CONFIG_F2FS_GRADING_SSR +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_hot_data_lower_limit, hot_data_lower_limit); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_hot_data_waterline, hot_data_waterline); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_warm_data_lower_limit, warm_data_lower_limit); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_warm_data_waterline, warm_data_waterline); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_hot_node_lower_limit, hot_node_lower_limit); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_hot_node_waterline, hot_node_waterline); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_warm_node_lower_limit, warm_node_lower_limit); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_warm_node_waterline, warm_node_waterline); +F2FS_RW_ATTR(F2FS_HOT_COLD_PARAMS, f2fs_hot_cold_params, + hc_enable, enable); +#endif #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -679,6 +706,17 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_foreground), ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_F2FS_GRADING_SSR + ATTR_LIST(hc_hot_data_lower_limit), + ATTR_LIST(hc_hot_data_waterline), + ATTR_LIST(hc_warm_data_lower_limit), + ATTR_LIST(hc_warm_data_waterline), + ATTR_LIST(hc_hot_node_lower_limit), + ATTR_LIST(hc_hot_node_waterline), + ATTR_LIST(hc_warm_node_lower_limit), + ATTR_LIST(hc_warm_node_waterline), 
+ ATTR_LIST(hc_enable), #endif NULL, }; @@ -919,6 +957,8 @@ static int undiscard_info_seq_show(struct seq_file *seq, void *offset) unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); unsigned int total = 0; unsigned int i, j; + unsigned int max_blocks = sbi->blocks_per_seg; + unsigned long *dmap = SIT_I(sbi)->tmp_map; if (!f2fs_realtime_discard_enable(sbi)) goto out; @@ -927,10 +967,8 @@ static int undiscard_info_seq_show(struct seq_file *seq, void *offset) struct seg_entry *se = get_seg_entry(sbi, i); unsigned int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - unsigned int max_blocks = sbi->blocks_per_seg; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *discard_map = (unsigned long *)se->discard_map; - unsigned long *dmap = SIT_I(sbi)->tmp_map; int start = 0, end = -1; down_write(&sit_i->sentry_lock); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 56b113e3cd6a..b4fe1db78eae 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1957,6 +1957,39 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +#ifdef CONFIG_F2FS_GRADING_SSR +DECLARE_EVENT_CLASS(f2fs_grading_ssr, + + TP_PROTO(unsigned int left, unsigned int free, + unsigned int seq), + + TP_ARGS(left, free, seq), + + TP_STRUCT__entry( + __field(unsigned int, left) + __field(unsigned int, free) + __field(unsigned int, seq) + ), + + TP_fast_assign( + __entry->left = left; + __entry->free = free; + __entry->seq = seq; + ), + + TP_printk("ssr: left_space %u free_segments: %u is_seq: %u ", + __entry->left, __entry->free, __entry->seq) +); + +DEFINE_EVENT(f2fs_grading_ssr, f2fs_grading_ssr_allocate, + + TP_PROTO(unsigned int left, unsigned int free, + unsigned int seq), + + TP_ARGS(left, free, seq) +); +#endif + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ -- Gitee From 5c072c64809884984d3945b86bdd8f909f31f472 Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Mon, 14 Feb 2022 17:05:42 +0800 Subject: [PATCH 026/113] zerohung: open zerohung feature ohos inclusion category: feature issue: #I4PJDN CVE: NA ----------------- zerohung is used to detect and upload kernel event. 
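A minimal caller sketch (hypothetical, not taken from this patch): any kernel module can report a hang through zrhung_send_event(), declared in include/dfx/zrhung.h added by this patch. The "MY_DRIVER" domain, the "IO_STALL" event name and the report_io_stall() helper are invented for illustration; only the zrhung_send_event() signature and the fact that msg_buf becomes the MSG field of the generated hisysevent come from the patch itself.

    #include <linux/kernel.h>
    #include <dfx/zrhung.h>

    /* hypothetical caller: report an I/O stall as a zerohung kernel event */
    static void report_io_stall(unsigned int msecs)
    {
            char msg[64];

            /* msg is carried as the "MSG" field of the serialized hisysevent */
            scnprintf(msg, sizeof(msg), "io stalled for %u ms", msecs);
            if (zrhung_send_event("MY_DRIVER", "IO_STALL", msg) < 0)
                    pr_warn("zerohung event not delivered\n");
    }
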
Signed-off-by: zhizhimeimei6 --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/zerohung/Kconfig | 6 + drivers/staging/zerohung/Makefile | 2 + drivers/staging/zerohung/watchpoint/Makefile | 2 + .../zerohung/watchpoint/hung_wp_screen.c | 283 +++++++++++++++++ drivers/staging/zerohung/zrhung_event.c | 299 ++++++++++++++++++ drivers/video/backlight/backlight.c | 10 + include/dfx/hung_wp_screen.h | 36 +++ include/dfx/zrhung.h | 23 ++ 10 files changed, 664 insertions(+) create mode 100644 drivers/staging/zerohung/Kconfig create mode 100644 drivers/staging/zerohung/Makefile create mode 100644 drivers/staging/zerohung/watchpoint/Makefile create mode 100644 drivers/staging/zerohung/watchpoint/hung_wp_screen.c create mode 100644 drivers/staging/zerohung/zrhung_event.c create mode 100644 include/dfx/hung_wp_screen.h create mode 100644 include/dfx/zrhung.h diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index d47ee7199bfc..d9ace574ad9d 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -122,6 +122,8 @@ source "drivers/staging/hilog/Kconfig" source "drivers/staging/hievent/Kconfig" +source "drivers/staging/zerohung/Kconfig" + source "drivers/staging/hungtask/Kconfig" source "drivers/staging/blackbox/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index b0fe6f912da6..3883281dbe7a 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -51,5 +51,6 @@ obj-$(CONFIG_WFX) += wfx/ obj-y += hikey9xx/ obj-$(CONFIG_HILOG) += hilog/ obj-$(CONFIG_HIEVENT) += hievent/ +obj-$(CONFIG_DFX_ZEROHUNG) += zerohung/ obj-$(CONFIG_DFX_HUNGTASK) += hungtask/ obj-$(CONFIG_BLACKBOX) += blackbox/ diff --git a/drivers/staging/zerohung/Kconfig b/drivers/staging/zerohung/Kconfig new file mode 100644 index 000000000000..3f575f72bfb8 --- /dev/null +++ b/drivers/staging/zerohung/Kconfig @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +config DFX_ZEROHUNG + bool "zerohung driver" + default n + help + This feature support to catch hung log diff --git a/drivers/staging/zerohung/Makefile b/drivers/staging/zerohung/Makefile new file mode 100644 index 000000000000..3727a0e91dca --- /dev/null +++ b/drivers/staging/zerohung/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DFX_ZEROHUNG) += zrhung_event.o watchpoint/ diff --git a/drivers/staging/zerohung/watchpoint/Makefile b/drivers/staging/zerohung/watchpoint/Makefile new file mode 100644 index 000000000000..1cb8d7f99417 --- /dev/null +++ b/drivers/staging/zerohung/watchpoint/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DFX_ZEROHUNG) += hung_wp_screen.o diff --git a/drivers/staging/zerohung/watchpoint/hung_wp_screen.c b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c new file mode 100644 index 000000000000..9b295fed067d --- /dev/null +++ b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) "zrhung " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TIME_CONVERT_UNIT 1000 +#define DEFAULT_TIMEOUT 10 + +#define LPRESSEVENT_TIME 5 +#define POWERKEYEVENT_MAX_COUNT 10 +#define POWERKEYEVENT_DEFAULT_COUNT 3 +#define POWERKEYEVENT_DEFAULT_TIMEWINDOW 5 +#define POWERKEYEVENT_DEFAULT_LIMIT_MS 300 +#define POWERKEYEVENT_DEFAULT_REPORT_MIN 2 +#define POWERKEYEVENT_TIME_LEN (POWERKEYEVENT_MAX_COUNT + 2) + +struct hung_wp_screen_data { + struct timer_list timer; + struct timer_list long_press_timer; + struct workqueue_struct *workq; + struct work_struct send_work; + spinlock_t lock; + int fb_blank; + int check_id; + int tag_id; +}; + +static bool init_done; +static struct hung_wp_screen_data g_hung_data; +static unsigned int lastreport_time; +static unsigned int lastprkyevt_time; +static unsigned int powerkeyevent_time[POWERKEYEVENT_TIME_LEN] = {0}; +static unsigned int newevt; +static unsigned int headevt; +static int *check_off_point; +struct work_struct powerkeyevent_sendwork; +struct work_struct lpressevent_sendwork; + + +static void zrhung_lpressevent_send_work(struct work_struct *work) +{ + pr_info("LONGPRESS_EVENT send to zerohung\n"); + zrhung_send_event(WP_SCREEN_DOMAIN, WP_SCREEN_LPRESS_NAME, "none"); +} + +static void zrhung_wp_lpress_send(struct timer_list *t) +{ + int *check_off = check_off_point; + + del_timer(&g_hung_data.long_press_timer); + *check_off = 0; + queue_work(g_hung_data.workq, &lpressevent_sendwork); +} + +static void zrhung_powerkeyevent_send_work(struct work_struct *work) +{ + pr_info("POWERKEY_EVENT send to zerohung\n"); + zrhung_send_event(WP_SCREEN_DOMAIN, WP_SCREEN_PWK_NAME, "none"); +} + +static void zrhung_powerkeyevent_report(unsigned int dur, unsigned int end) +{ + unsigned int send_interval; + + send_interval = end > lastreport_time ? + ((end - lastreport_time) / TIME_CONVERT_UNIT) : POWERKEYEVENT_DEFAULT_REPORT_MIN; + if (unlikely(lastreport_time == 0)) { + lastreport_time = end; + } else if (send_interval < POWERKEYEVENT_DEFAULT_REPORT_MIN) { + pr_info("powerkeyevent too fast to report: %d\n", end); + return; + } + lastreport_time = end; + queue_work(g_hung_data.workq, &powerkeyevent_sendwork); +} + +static unsigned int refresh_prkyevt_index(unsigned int event) +{ + unsigned int evt = event; + + if (evt < POWERKEYEVENT_MAX_COUNT) + evt++; + else + evt = 0; + return evt; +} + +static void zrhung_new_powerkeyevent(unsigned int tmescs) +{ + unsigned int prkyevt_interval; + unsigned int evt_index; + int diff; + + powerkeyevent_time[newevt] = tmescs; + evt_index = (newevt >= headevt) ? 
+ (newevt - headevt) : (newevt + POWERKEYEVENT_MAX_COUNT + 1 - headevt); + if (evt_index < (POWERKEYEVENT_DEFAULT_COUNT - 1)) { + pr_info("powerkeyevent not enough-%d\n", POWERKEYEVENT_DEFAULT_COUNT); + } else { + diff = powerkeyevent_time[newevt] - powerkeyevent_time[headevt]; + if (diff < 0) { + pr_info("powerkeyevent sth wrong in record time\n"); + return; + } + + prkyevt_interval = (unsigned int)(diff / TIME_CONVERT_UNIT); + if (prkyevt_interval <= POWERKEYEVENT_DEFAULT_TIMEWINDOW) + zrhung_powerkeyevent_report(prkyevt_interval, tmescs); + headevt = refresh_prkyevt_index(headevt); + } + newevt = refresh_prkyevt_index(newevt); +} + +static void zrhung_powerkeyevent_handler(void) +{ + unsigned int curtime; + unsigned long curjiff; + + pr_info("powerkeyevent check start"); + curjiff = jiffies; + curtime = jiffies_to_msecs(curjiff); + if (unlikely(lastprkyevt_time > curtime)) { + pr_info("powerkeyevent check but time overflow"); + lastprkyevt_time = curtime; + return; + } else if ((curtime - lastprkyevt_time) < POWERKEYEVENT_DEFAULT_LIMIT_MS) { + pr_info("powerkeyevent user press powerkey too fast-time:%d", curtime); + return; + } + lastprkyevt_time = curtime; + zrhung_new_powerkeyevent(curtime); +} + +void hung_wp_screen_setblank(int blank) +{ + unsigned long flags; + + if (!init_done) + return; + + spin_lock_irqsave(&(g_hung_data.lock), flags); + g_hung_data.fb_blank = blank; + if (((g_hung_data.check_id == ZRHUNG_WP_SCREENON) && (blank == 0)) || + ((g_hung_data.check_id == ZRHUNG_WP_SCREENOFF) && (blank != 0))) { + pr_info("check_id=%d, blank=%d", g_hung_data.check_id, g_hung_data.fb_blank); + del_timer(&g_hung_data.timer); + g_hung_data.check_id = ZRHUNG_WP_NONE; + } + spin_unlock_irqrestore(&(g_hung_data.lock), flags); +} + +static void hung_wp_screen_send_work(struct work_struct *work) +{ + unsigned long flags = 0; + + show_state_filter(TASK_UNINTERRUPTIBLE); + + if (g_hung_data.check_id == 1) + zrhung_send_event(WP_SCREEN_DOMAIN, WP_SCREEN_ON_NAME, "none"); + else + zrhung_send_event(WP_SCREEN_DOMAIN, WP_SCREEN_OFF_NAME, "none"); + pr_info("send event: %d\n", g_hung_data.check_id); + spin_lock_irqsave(&(g_hung_data.lock), flags); + g_hung_data.check_id = ZRHUNG_WP_NONE; + spin_unlock_irqrestore(&(g_hung_data.lock), flags); +} + +static void hung_wp_screen_send(struct timer_list *t) +{ + del_timer(&g_hung_data.timer); + pr_info("hung_wp_screen_%d end\n", g_hung_data.tag_id); + queue_work(g_hung_data.workq, &g_hung_data.send_work); +} + +static void hung_wp_screen_start(int check_id) +{ + if (g_hung_data.check_id != ZRHUNG_WP_NONE) { + pr_info("already in check_id: %d\n", g_hung_data.check_id); + return; + } + + g_hung_data.check_id = check_id; + if (timer_pending(&g_hung_data.timer)) + del_timer(&g_hung_data.timer); + + g_hung_data.timer.expires = jiffies + msecs_to_jiffies(DEFAULT_TIMEOUT * TIME_CONVERT_UNIT); + add_timer(&g_hung_data.timer); + pr_info("going to check ID=%d timeout=%d\n", check_id, DEFAULT_TIMEOUT); + + return; +} + +void hung_wp_screen_powerkey_ncb(int event) +{ + static int check_off; + unsigned long flags = 0; + + if (!init_done) + return; + + spin_lock_irqsave(&(g_hung_data.lock), flags); + if (event == WP_SCREEN_PWK_PRESS) { + pr_info("hung_wp_screen_%d start! 
fb_blank=%d", + ++g_hung_data.tag_id, g_hung_data.fb_blank); + check_off = 0; + if (g_hung_data.fb_blank != 0) { + hung_wp_screen_start(ZRHUNG_WP_SCREENON); + } else { + check_off = 1; + pr_info("start longpress test timer\n"); + check_off_point = &check_off; + g_hung_data.long_press_timer.expires = jiffies + + msecs_to_jiffies(LPRESSEVENT_TIME * TIME_CONVERT_UNIT); + if (!timer_pending(&g_hung_data.long_press_timer)) + add_timer(&g_hung_data.long_press_timer); + } + zrhung_powerkeyevent_handler(); + } else if (check_off) { + check_off = 0; + del_timer(&g_hung_data.long_press_timer); + if (event == WP_SCREEN_PWK_RELEASE && g_hung_data.fb_blank == 0) + hung_wp_screen_start(ZRHUNG_WP_SCREENOFF); + } + spin_unlock_irqrestore(&(g_hung_data.lock), flags); +} + +static int __init hung_wp_screen_init(void) +{ + init_done = false; + pr_info("%s start\n", __func__); + g_hung_data.fb_blank = 0; + g_hung_data.tag_id = 0; + g_hung_data.check_id = ZRHUNG_WP_NONE; + spin_lock_init(&(g_hung_data.lock)); + + timer_setup(&g_hung_data.timer, hung_wp_screen_send, 0); + timer_setup(&g_hung_data.long_press_timer, zrhung_wp_lpress_send, 0); + + g_hung_data.workq = create_workqueue("hung_wp_screen_workq"); + if (g_hung_data.workq == NULL) { + pr_err("create workq failed\n"); + return -EFAULT; + } + INIT_WORK(&g_hung_data.send_work, hung_wp_screen_send_work); + INIT_WORK(&powerkeyevent_sendwork, zrhung_powerkeyevent_send_work); + INIT_WORK(&lpressevent_sendwork, zrhung_lpressevent_send_work); + + init_done = true; + pr_info("%s done\n", __func__); + return 0; +} + +module_init(hung_wp_screen_init); + +MODULE_AUTHOR("OHOS"); +MODULE_DESCRIPTION("Reporting the frozen screen alarm event"); +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/zerohung/zrhung_event.c b/drivers/staging/zerohung/zrhung_event.c new file mode 100644 index 000000000000..0ad2d9abb31d --- /dev/null +++ b/drivers/staging/zerohung/zrhung_event.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) "zrhung " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MINUTE_TO_SECS 60 +#define SEC_TO_MILLISEC 1000 +#define MILLISEC_TO_NANOSEC (1000 * 1000) +#define TIME_ZONE_LEN 6 +#define HISYSEVENT_MAX_STR_LEN 1024 +#define HISYSEVENT_INFO_BUF_LEN 1024 +#define HISYSEVENT_WRITER_DEV "/dev/bbox" +static int CHECK_CODE = 0x7BCDABCD; + +#define BUF_POINTER_FORWARD \ +do { \ + if (tmp_len >= 0 && tmp_len < len) { \ + tmp += tmp_len; \ + len -= tmp_len; \ + } else { \ + pr_err("string over length"); \ + tmp += len; \ + len = 0; \ + } \ +} while (0) + +struct hisysevent { + char *domain; + char *event_name; + unsigned int type; + long long time; + char *tz; + unsigned int pid; + unsigned int tid; + unsigned int uid; + char *msg; +}; + +int hisysevent_set_time(struct hisysevent *event) +{ + struct timespec64 ts; + struct timezone tz = sys_tz; + int tz_index = 0; + char time_zone[TIME_ZONE_LEN]; + int tz_hour; + int tz_min; + long long millisecs = 0; + + if (!event) { + pr_err("invalid event"); + return -EINVAL; + } + + ktime_get_real_ts64(&ts); + millisecs = ts.tv_sec * SEC_TO_MILLISEC + ts.tv_nsec / MILLISEC_TO_NANOSEC; + event->time = millisecs; + tz_hour = (-tz.tz_minuteswest) / MINUTE_TO_SECS; + time_zone[tz_index++] = tz_hour >= 0 ? '+' : '-'; + tz_min = (-tz.tz_minuteswest) % MINUTE_TO_SECS; + sprintf(&time_zone[tz_index], "%02u%02u", abs(tz_hour), abs(tz_min)); + time_zone[TIME_ZONE_LEN - 1] = '\0'; + event->tz = kstrdup(time_zone, GFP_ATOMIC); + + return 0; +} + +int hisysevent_set_msg(struct hisysevent *event, const char *msg_buf) +{ + int len; + + if (!event) { + pr_err("invalid event"); + return -EINVAL; + } + + len = strlen(msg_buf); + if ((!msg_buf) || (msg_buf[0] == 0) || len > HISYSEVENT_MAX_STR_LEN) { + pr_err("invalid msg_buf"); + return -EINVAL; + } + + event->msg = kstrdup(msg_buf, GFP_ATOMIC); + + return 0; +} + +struct hisysevent *create_hisysevent(const char *domain, const char *event_name) +{ + struct hisysevent *event = NULL; + + event = vmalloc(sizeof(*event)); + if (!event) { + pr_err("failed to vmalloc for event"); + return -ENOMEM; + } + + memset(event, 0, sizeof(*event)); + + if ((!domain) || (domain[0] == 0)) { + pr_err("valid domain"); + vfree(event); + return NULL; + } + event->domain = kstrdup(domain, GFP_ATOMIC); + + if ((!event_name) || (event_name[0] == 0)) { + pr_err("valid event_name"); + kfree(event->domain); + vfree(event); + return NULL; + } + event->event_name = kstrdup(event_name, GFP_ATOMIC); + event->type = ZRHUNG_EVENT_TYPE; + + pr_info("create hisysevent succ, domain=%s, event_name=%s, type=%u", event->domain, + event->event_name, event->type); + + return (void *)event; +} + +struct hisysevent *inner_build_hisysevent(const char *domain, const char *event_name, + const char *msg_buf) +{ + struct hisysevent *event = NULL; + + event = create_hisysevent(domain, event_name); + hisysevent_set_time(event); + event->pid = current->pid; + event->tid = current->tgid; + event->uid = current_uid().val; + hisysevent_set_msg(event, msg_buf); + + return (void *)event; +} + +void zrhung_hisysevent_destroy(struct hisysevent *event) +{ + if (!event->domain) { + kfree(event->domain); + event->domain = NULL; + } + if (!event->event_name) { + kfree(event->event_name); + event->event_name = NULL; + } + if (!event->tz) { + kfree(event->tz); + event->tz = NULL; + } + if (!event->msg) { + kfree(event->msg); + event->msg = NULL; + } + + vfree(event); 
+} + +int hisysevent_convert_string(struct hisysevent *event, char **buf_ptr) +{ + int len; + char *tmp; + int tmp_len; + int base_index = 0; + static const char * const base_param_keys[] = {"domain_", "name_", "type_", "time_", "tz_", + "pid_", "tid_", "uid_", "MSG"}; + int buf_len = HISYSEVENT_INFO_BUF_LEN; + char *buf = vmalloc(buf_len); + + if (!buf) { + pr_err("failed to malloc buff for convert_string"); + return -ENOMEM; + } + memset(buf, 0, buf_len); + + len = buf_len; + tmp = buf; + + tmp_len = snprintf(tmp, len, "{\"%s\":\"%s\",", base_param_keys[base_index++], event->domain); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":\"%s\",", base_param_keys[base_index++], event->event_name); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":%u,", base_param_keys[base_index++], event->type); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":%lld,", base_param_keys[base_index++], event->time); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":\"%s\",", base_param_keys[base_index++], event->tz); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":%u,", base_param_keys[base_index++], event->pid); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":%u,", base_param_keys[base_index++], event->tid); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":%u,", base_param_keys[base_index++], event->uid); + BUF_POINTER_FORWARD; + + tmp_len = snprintf(tmp, len, "\"%s\":\"%s\"}", base_param_keys[base_index++], event->msg); + BUF_POINTER_FORWARD; + + *buf_ptr = buf; + + return (HISYSEVENT_INFO_BUF_LEN - len); +} + +int zrhung_hisysevent_write(struct hisysevent *event) +{ + struct iov_iter iter; + mm_segment_t oldfs; + char *data = NULL; + struct file *filp = NULL; + struct iovec vec[3]; + unsigned long vcount = 0; + int ret; + + hisysevent_convert_string(event, &data); + if (!data) { + pr_err("failed to convert string"); + return -EINVAL; + } + + filp = filp_open(HISYSEVENT_WRITER_DEV, O_WRONLY, 0); + + if ((!filp) || IS_ERR(filp)) { + ret = PTR_ERR(filp); + pr_err("access '%s' failed, res=%d", HISYSEVENT_WRITER_DEV, ret); + vfree(data); + return -ENODEV; + } + + vec[vcount].iov_base = &CHECK_CODE; + vec[vcount++].iov_len = sizeof(CHECK_CODE); + vec[vcount].iov_base = data; + vec[vcount++].iov_len = strlen(data) + 1; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + iov_iter_init(&iter, WRITE, vec, vcount, iov_length(vec, vcount)); + ret = vfs_iter_write(filp, &iter, &filp->f_pos, 0); + set_fs(oldfs); + + if (ret < 0) { + pr_err("write '%s' failed, res=%d", HISYSEVENT_WRITER_DEV, ret); + ret = -EIO; + goto out; + } + +out: + filp_close(filp, NULL); + vfree(data); + return ret; +} + +int zrhung_send_event(const char *domain, const char *event_name, const char *msg_buf) +{ + struct hisysevent *event = NULL; + int ret = 0; + + event = inner_build_hisysevent(domain, event_name, msg_buf); + + if (!event) { + pr_err("failed to build event"); + return -EINVAL; + } + + ret = zrhung_hisysevent_write(event); + zrhung_hisysevent_destroy(event); + return ret; +} diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 537fe1b376ad..4bda5fc5e624 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -22,6 +22,10 @@ #include #endif +#ifdef CONFIG_DFX_ZEROHUNG +#include +#endif + /** * DOC: overview * @@ -120,6 +124,9 @@ static int fb_notifier_callback(struct notifier_block *self, bd->props.state &= ~BL_CORE_FBBLANK; bd->props.fb_blank = FB_BLANK_UNBLANK; 
backlight_update_status(bd); +#ifdef CONFIG_DFX_ZEROHUNG + hung_wp_screen_setblank(fb_blank); +#endif } } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) { bd->fb_bl_on[node] = false; @@ -127,6 +134,9 @@ static int fb_notifier_callback(struct notifier_block *self, bd->props.state |= BL_CORE_FBBLANK; bd->props.fb_blank = fb_blank; backlight_update_status(bd); +#ifdef CONFIG_DFX_ZEROHUNG + hung_wp_screen_setblank(fb_blank); +#endif } } out: diff --git a/include/dfx/hung_wp_screen.h b/include/dfx/hung_wp_screen.h new file mode 100644 index 000000000000..88bb62435d6a --- /dev/null +++ b/include/dfx/hung_wp_screen.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef HUNG_WP_SCREEN_H +#define HUNG_WP_SCREEN_H + +#define WP_SCREEN_PWK_RELEASE 0 +#define WP_SCREEN_PWK_PRESS 1 + +#define ZRHUNG_WP_NONE 0 +#define ZRHUNG_WP_SCREENON 1 +#define ZRHUNG_WP_SCREENOFF 2 + +#define WP_SCREEN_DOMAIN "KERNEL_VENDOR" +#define WP_SCREEN_PWK_NAME "POWERKEY" +#define WP_SCREEN_LPRESS_NAME "LONGPRESS" +#define WP_SCREEN_ON_NAME "SCREEN_ON" +#define WP_SCREEN_OFF_NAME "SCREEN_OFF" + +void hung_wp_screen_powerkey_ncb(int event); +void hung_wp_screen_setblank(int blank); +int hung_wp_screen_getbl(void); + +#endif /* HUNG_WP_SCREEN_H */ diff --git a/include/dfx/zrhung.h b/include/dfx/zrhung.h new file mode 100644 index 000000000000..9d54df21c817 --- /dev/null +++ b/include/dfx/zrhung.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef ZRHUNG_H +#define ZRHUNG_H + +#define ZRHUNG_EVENT_TYPE 1 + +int zrhung_send_event(const char *domain, const char *event_name, const char *msg_buf); + +#endif /* ZRHUNG_H */ -- Gitee From ebb66b8df91b34e9b729f8b29f8a6afa09df3230 Mon Sep 17 00:00:00 2001 From: waterwin Date: Tue, 15 Feb 2022 12:06:03 +0800 Subject: [PATCH 027/113] hmdfs: Bugfix in device security level feature in hmdfs ohos inclusion category: bugfix issue: #I4TQ0H CVE: NA ---------------------------------------------- hmdfs device security level feature, panic when data security level is invalid. 
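The crash comes from a classic off-by-one loop bound: iterating with "<=" over sizeof(array)/sizeof(array[0]) reads one entry past the end of the lookup table, so an unmatched (invalid) data security level ends up comparing against a garbage pointer. A minimal stand-alone sketch of the corrected pattern is shown below; the table and function names are made up for illustration and are not the hmdfs code itself.

#include <stdio.h>
#include <string.h>

/* Hypothetical lookup table, standing in for datasl_str[] in the patch. */
static const char *const level_str[] = { "s0", "s1", "s2", "s3", "s4" };

/* Return the matching index, or -1 for an invalid value. The bound uses
 * '<', so an invalid level falls through to the error path instead of
 * reading one slot past the end of the table. */
static int parse_level(const char *value)
{
    size_t i;

    for (i = 0; i < sizeof(level_str) / sizeof(level_str[0]); i++) {
        if (!strncmp(value, level_str[i], strlen(level_str[i])))
            return (int)i;
    }
    return -1;
}

int main(void)
{
    printf("%d %d\n", parse_level("s2"), parse_level("bogus"));
    return 0;
}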
Signed-off-by: qianjiaxing --- fs/hmdfs/hmdfs_server.c | 4 ++-- fs/hmdfs/inode_local.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/hmdfs/hmdfs_server.c b/fs/hmdfs/hmdfs_server.c index dd4508b80e8e..ea3697f33128 100644 --- a/fs/hmdfs/hmdfs_server.c +++ b/fs/hmdfs/hmdfs_server.c @@ -252,7 +252,7 @@ static int parse_data_sec_level(const char *sl_value, size_t sl_value_len) { int i; - for (i = 0; i <= sizeof(datasl_str) / sizeof(datasl_str[0]); i++) { + for (i = 0; i < sizeof(datasl_str) / sizeof(datasl_str[0]); i++) { if (!strncmp(sl_value, datasl_str[i], strlen(datasl_str[i]))) return i + DATA_SEC_LEVEL0; } @@ -266,7 +266,7 @@ static int check_sec_level(struct hmdfs_peer *node, const char *file_name) int ret = 0; struct path root_path; struct path file_path; - char *value; + char *value = NULL; size_t value_len = DATA_SEC_LEVEL_LENGTH; if (node->devsl <= 0) { diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index 84904f50d27d..561b45dbb465 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -901,7 +901,6 @@ int hmdfs_permission(struct inode *inode, int mask) { #ifdef CONFIG_HMDFS_FS_PERMISSION unsigned int mode = inode->i_mode; - struct hmdfs_inode_info *hii = hmdfs_i(inode); kuid_t cur_uid = current_fsuid(); if (uid_eq(cur_uid, ROOT_UID) || uid_eq(cur_uid, SYSTEM_UID)) -- Gitee From acceb4fb840423fdcb37a5943b04d6f10d5ebae0 Mon Sep 17 00:00:00 2001 From: CY Fan Date: Tue, 15 Feb 2022 16:18:32 +0800 Subject: [PATCH 028/113] hyperhold: fix the compiler warnings when hyperhold configs are closed ohos inclusion category: bugfix issue: #I4TSO9 CVE: NA ----------------- This patch fixes the compilers warnings by modifying configs and functions dependencies Signed-off-by: CY Fan --- drivers/hyperhold/Kconfig | 2 ++ include/linux/zswapd.h | 58 ++------------------------------------- mm/Kconfig | 2 +- mm/zswapd.c | 2 ++ mm/zswapd_control.c | 1 + 5 files changed, 9 insertions(+), 56 deletions(-) diff --git a/drivers/hyperhold/Kconfig b/drivers/hyperhold/Kconfig index 8e5e7a1ee957..4bba0efd1c3e 100644 --- a/drivers/hyperhold/Kconfig +++ b/drivers/hyperhold/Kconfig @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 config HYPERHOLD bool "Hyperhold driver" + select HYPERHOLD_ZSWAPD + select HYPERHOLD_MEMCG default n help Hyperhold driver. 
diff --git a/include/linux/zswapd.h b/include/linux/zswapd.h index f549137f71b0..3a9768a358a8 100644 --- a/include/linux/zswapd.h +++ b/include/linux/zswapd.h @@ -34,8 +34,6 @@ struct group_swap_device { #ifdef CONFIG_HYPERHOLD_ZSWAPD extern int zswapd_run(int nid); extern void zswapd_stop(int nid); -extern void wakeup_zswapd(pg_data_t *pgdat); -extern bool zram_watermark_ok(void); extern void zswapd_status_show(struct seq_file *m); extern void wake_all_zswapd(void); extern void set_snapshotd_init_flag(unsigned int val); @@ -43,60 +41,10 @@ extern pid_t get_zswapd_pid(void); extern unsigned long long get_free_swap_threshold(void); extern struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv); extern void unregister_group_swap(struct group_swap_device *gsdev); -extern void memcg_eswap_info_show(struct seq_file *m); -#else -static inline int zswap_run(int nid) -{ - return 0; -} - -static inline void zswapd_stop(int nid) -{ -} - -static inline void wakeup_zswapd(pg_data_t *pgdat) -{ -} - -static inline bool zram_watermark_ok(void) -{ - return true; -} - -static inline void zswapd_status_show(struct seq_file *m) -{ -} - -static inline void wake_all_zswapd(void) -{ -} - -static inline void set_snapshotd_init_flag(unsigned int val) -{ -} -static inline pid_t get_zswapd_pid(void) -{ - return -EINVAL; -} - -static inline u64 get_free_swap_threshold(void) -{ - return 0; -} - -static struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) -{ - return NULL; -} - -static void unregister_group_swap(struct group_swap_device *gsdev) -{ -} - -static void memcg_eswap_info_show(struct seq_file *m) -{ -} +#ifdef CONFIG_HYPERHOLD_DEBUG +extern void memcg_eswap_info_show(struct seq_file *m); +#endif #endif #endif /* _LINUX_ZSWAPD_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 6760018a1c8c..df9bf9f4ade7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -82,7 +82,7 @@ config HYPERHOLD_MEMCG config HYPERHOLD_ZSWAPD bool "Enable zswapd thread to reclaim anon pages in background" - depends on HYPERHOLD + depends on HYPERHOLD && ZRAM default n help zswapd is a kernel thread that reclaim anonymous pages in the diff --git a/mm/zswapd.c b/mm/zswapd.c index b5fcb0d2aa08..4bde41f21f93 100644 --- a/mm/zswapd.c +++ b/mm/zswapd.c @@ -473,6 +473,7 @@ void wake_all_zswapd(void) } } +#ifdef CONFIG_HYPERHOLD_FILE_LRU static void zswapd_shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { @@ -565,6 +566,7 @@ static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; } +#endif static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) { diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c index d7ea6a6fe2cb..d91a08b5ae2f 100644 --- a/mm/zswapd_control.c +++ b/mm/zswapd_control.c @@ -630,6 +630,7 @@ static int compress_ratio_show(struct seq_file *m, void *v) return 0; } + static int zswapd_vmstat_show(struct seq_file *m, void *v) { #ifdef CONFIG_VM_EVENT_COUNTERS -- Gitee From 18f30ae68a853a0c093cd391bf7ad53d49c069b9 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sun, 13 Feb 2022 22:32:45 +0800 Subject: [PATCH 029/113] sched: Introduce related thread group scheduling codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- The original patch is from Code Aurora's latest msm-4.14. 
Based on the original patch, we add definitions for related thread group, and a subsequent changeset will provide improved schedule for related thread group. Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched.h | 7 +++++++ include/linux/sched/rtg.h | 15 +++++++++++++++ init/Kconfig | 2 ++ kernel/sched/Makefile | 1 + kernel/sched/core.c | 11 +++++++++++ kernel/sched/rtg/Kconfig | 10 ++++++++++ kernel/sched/rtg/Makefile | 2 ++ kernel/sched/rtg/rtg.c | 13 +++++++++++++ kernel/sched/rtg/rtg.h | 14 ++++++++++++++ kernel/sched/walt.c | 5 +++++ 10 files changed, 80 insertions(+) create mode 100644 include/linux/sched/rtg.h create mode 100644 kernel/sched/rtg/Kconfig create mode 100644 kernel/sched/rtg/Makefile create mode 100644 kernel/sched/rtg/rtg.c create mode 100644 kernel/sched/rtg/rtg.h diff --git a/include/linux/sched.h b/include/linux/sched.h index 44d5d8ed532a..c5e0c99cb3cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -801,6 +802,12 @@ struct task_struct { u64 last_sleep_ts; #endif +#ifdef CONFIG_SCHED_RTG + int rtg_depth; + struct related_thread_group *grp; + struct list_head grp_list; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h new file mode 100644 index 000000000000..c17636439964 --- /dev/null +++ b/include/linux/sched/rtg.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SCHED_RTG_H +#define __SCHED_RTG_H + +#ifdef CONFIG_SCHED_RTG +struct related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + + unsigned int nr_running; +}; +#endif /* CONFIG_SCHED_RTG */ +#endif diff --git a/init/Kconfig b/init/Kconfig index 1512479e7782..1d248e9c5a89 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -858,6 +858,8 @@ config UCLAMP_BUCKETS_COUNT If in doubt, use the default value. 
+source "kernel/sched/rtg/Kconfig" + endmenu # diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 0e3173ee99fb..1b4834073ae7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -28,6 +28,7 @@ obj-y += wait.o wait_bit.o swait.o completion.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_WALT) += walt.o +obj-$(CONFIG_SCHED_RTG) += rtg/ obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33e19cbd4eee..3a86b124f41c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -29,6 +29,7 @@ #include "pelt.h" #include "smp.h" #include "walt.h" +#include "rtg/rtg.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -3207,6 +3208,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif +#ifdef CONFIG_SCHED_RTG + p->rtg_depth = 0; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -3350,7 +3354,14 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (unlikely(p->sched_reset_on_fork)) { if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; +#ifdef CONFIG_SCHED_RTG + if (current->rtg_depth != 0) + p->static_prio = current->static_prio; + else + p->static_prio = NICE_TO_PRIO(0); +#else p->static_prio = NICE_TO_PRIO(0); +#endif p->rt_priority = 0; } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig new file mode 100644 index 000000000000..11a0343d935b --- /dev/null +++ b/kernel/sched/rtg/Kconfig @@ -0,0 +1,10 @@ +menu "Related Thread Group" + +config SCHED_RTG + bool "Related Thread Group" + depends on SCHED_WALT + default n + help + Set related threads into a group. 
+ +endmenu diff --git a/kernel/sched/rtg/Makefile b/kernel/sched/rtg/Makefile new file mode 100644 index 000000000000..a911575b0734 --- /dev/null +++ b/kernel/sched/rtg/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SCHED_RTG) += rtg.o diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c new file mode 100644 index 000000000000..f48905afbbf4 --- /dev/null +++ b/kernel/sched/rtg/rtg.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * related thread group sched + * + */ +#include +#include "rtg.h" + +void init_task_rtg(struct task_struct *p) +{ + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); +} diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h new file mode 100644 index 000000000000..39046758a6b7 --- /dev/null +++ b/kernel/sched/rtg/rtg.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * related thread group sched header + */ +#ifndef __RTG_H +#define __RTG_H + +#include +#include + +#ifdef CONFIG_SCHED_RTG +void init_task_rtg(struct task_struct *p); +#endif /* CONFIG_SCHED_RTG */ +#endif diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 30db3d617914..38699a333540 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -24,6 +24,7 @@ #include "sched.h" #include "walt.h" #include "core_ctl.h" +#include "rtg/rtg.h" #define CREATE_TRACE_POINTS #include #undef CREATE_TRACE_POINTS @@ -1160,6 +1161,10 @@ void init_new_task_load(struct task_struct *p) u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; u32 init_load_pct = current->init_load_pct; +#ifdef CONFIG_SCHED_RTG + init_task_rtg(p); +#endif + p->last_sleep_ts = 0; p->init_load_pct = 0; memset(&p->ravg, 0, sizeof(struct ravg)); -- Gitee From cf327988be3e038a3630c66f0965b4bb61fc78bd Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sun, 13 Feb 2022 23:36:45 +0800 Subject: [PATCH 030/113] sched: Minimally initialize the related thread group codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- 21 groups (MAX_NUM_CGROUP_COLOC_ID) are created by default, of which DEFAULT_CGROUP_COLOC_ID is a reserved id. 
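The ID space is small and fixed: valid group IDs run from 1 to MAX_NUM_CGROUP_COLOC_ID - 1, ID 0 is the special "remove the task from its group" request, and DEFAULT_CGROUP_COLOC_ID (1) is reserved and cannot be assigned directly through sched_set_group_id(). The stand-alone sketch below only mirrors that ID policy for illustration; it is not the kernel implementation.

#include <errno.h>
#include <stdio.h>

#define DEFAULT_CGROUP_COLOC_ID  1   /* reserved id */
#define MAX_NUM_CGROUP_COLOC_ID  21  /* ids 1..20 exist; 0 means "detach" */

/* Mirror of the id checks described above: ids past the table and the
 * reserved id are rejected, 0 detaches, 2..20 attach. */
static int check_group_id(unsigned int group_id)
{
    if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
        return -EINVAL;                /* out of range */
    if (group_id == DEFAULT_CGROUP_COLOC_ID)
        return -EINVAL;                /* reserved group */
    return 0;                          /* accepted */
}

int main(void)
{
    printf("%d %d %d %d\n",
           check_group_id(0),    /* 0: remove from group */
           check_group_id(1),    /* -EINVAL: reserved */
           check_group_id(2),    /* 0: attach to group 2 */
           check_group_id(21));  /* -EINVAL: out of range */
    return 0;
}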
Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched/rtg.h | 7 ++ kernel/sched/core.c | 5 + kernel/sched/rtg/rtg.c | 204 ++++++++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg.h | 6 ++ kernel/sched/walt.c | 3 + 5 files changed, 225 insertions(+) diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index c17636439964..85bd334fa9cc 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -3,6 +3,11 @@ #define __SCHED_RTG_H #ifdef CONFIG_SCHED_RTG + +#define DEFAULT_RTG_GRP_ID 0 +#define DEFAULT_CGROUP_COLOC_ID 1 +#define MAX_NUM_CGROUP_COLOC_ID 21 + struct related_thread_group { int id; raw_spinlock_t lock; @@ -11,5 +16,7 @@ struct related_thread_group { unsigned int nr_running; }; + +int sched_set_group_id(struct task_struct *p, unsigned int group_id); #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a86b124f41c..574c155b9e3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7656,6 +7656,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } + BUG_ON(alloc_related_thread_groups()); set_load_weight(&init_task, false); /* @@ -8970,6 +8971,10 @@ void sched_exit(struct task_struct *p) struct rq *rq; u64 wallclock; +#ifdef CONFIG_SCHED_RTG + sched_set_group_id(p, 0); +#endif + rq = task_rq_lock(p, &rf); /* rq->curr == p */ diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index f48905afbbf4..6d54c48ba36f 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -4,10 +4,214 @@ * */ #include + +#include "../sched.h" #include "rtg.h" +struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; +static DEFINE_RWLOCK(related_thread_group_lock); +static LIST_HEAD(active_related_thread_groups); + void init_task_rtg(struct task_struct *p) { rcu_assign_pointer(p->grp, NULL); INIT_LIST_HEAD(&p->grp_list); } + +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +struct related_thread_group * +lookup_related_thread_group(unsigned int group_id) +{ + return related_thread_groups[group_id]; +} + +int alloc_related_thread_groups(void) +{ + int i, ret; + struct related_thread_group *grp = NULL; + + /* groupd_id = 0 is invalid as it's special id to remove group. 
*/ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = kzalloc(sizeof(*grp), GFP_NOWAIT); + if (!grp) { + ret = -ENOMEM; + goto err; + } + + grp->id = i; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + related_thread_groups[i] = grp; + } + + return 0; + +err: + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (grp) { + kfree(grp); + related_thread_groups[i] = NULL; + } else { + break; + } + } + + return ret; +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq = NULL; + bool empty_group = true; + struct rq_flags flag; + unsigned long irqflag; + + rq = __task_rq_lock(p, &flag); + + raw_spin_lock_irqsave(&grp->lock, irqflag); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + + if (p->on_cpu) + grp->nr_running--; + + if ((int)grp->nr_running < 0) { + WARN_ON(1); + grp->nr_running = 0; + } + + if (!list_empty(&grp->tasks)) + empty_group = false; + + raw_spin_unlock_irqrestore(&grp->lock, irqflag); + __task_rq_unlock(rq, &flag); + + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { + /* + * We test whether grp->list is attached with list_empty() + * hence re-init the list after deletion. + */ + write_lock(&related_thread_group_lock); + list_del_init(&grp->list); + write_unlock(&related_thread_group_lock); + } +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq = NULL; + struct rq_flags flag; + unsigned long irqflag; + + /* + * Change p->grp under rq->lock. Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p, &flag); + + raw_spin_lock_irqsave(&grp->lock, irqflag); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + if (p->on_cpu) + grp->nr_running++; + + raw_spin_unlock_irqrestore(&grp->lock, irqflag); + __task_rq_unlock(rq, &flag); + + return 0; +} + +static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0; + unsigned long flags; + struct related_thread_group *grp = NULL; + struct related_thread_group *old_grp = NULL; + + if (group_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + old_grp = p->grp; + if ((current != p && (p->flags & PF_EXITING)) || + (!old_grp && !group_id)) + goto done; + + /* + * If the system has CONFIG_SCHED_RTG_CGROUP, only tasks in DEFAULT group + * can be directly switched to other groups. + * + * In other cases, Switching from one group to another directly is not permitted. 
+ */ + if (old_grp && group_id) { + pr_err("%s[%d] switching group from %d to %d failed.\n", + p->comm, p->pid, old_grp->id, group_id); + rc = -EINVAL; + goto done; + } + + if (!group_id) { + remove_task_from_group(p); + goto done; + } + + grp = lookup_related_thread_group(group_id); + write_lock(&related_thread_group_lock); + if (list_empty(&grp->list)) + list_add(&grp->list, &active_related_thread_groups); + write_unlock(&related_thread_group_lock); + + rc = add_task_to_group(p, grp); +done: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return rc; +} + +/* group_id == 0: remove task from rtg */ +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) + return -EINVAL; + + return __sched_set_group_id(p, group_id); +} + +void update_group_nr_running(struct task_struct *p, int event) +{ + struct related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp) { + rcu_read_unlock(); + return; + } + + raw_spin_lock(&grp->lock); + + if (event == PICK_NEXT_TASK) + grp->nr_running++; + else if (event == PUT_PREV_TASK) + grp->nr_running--; + + if ((int)grp->nr_running < 0) { + WARN_ON(1); + grp->nr_running = 0; + } + + raw_spin_unlock(&grp->lock); + + rcu_read_unlock(); +} diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index 39046758a6b7..80661f8b2d32 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -10,5 +10,11 @@ #ifdef CONFIG_SCHED_RTG void init_task_rtg(struct task_struct *p); +int alloc_related_thread_groups(void); +struct related_thread_group *lookup_related_thread_group(unsigned int group_id); +struct related_thread_group *task_related_thread_group(struct task_struct *p); +void update_group_nr_running(struct task_struct *p, int event); +#else +static inline int alloc_related_thread_groups(void) { return 0; } #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 38699a333540..f560321b8691 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -1109,6 +1109,9 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, old_window_start = update_window_start(rq, wallclock, event); +#ifdef CONFIG_SCHED_RTG + update_group_nr_running(p, event); +#endif if (!p->ravg.mark_start) goto done; -- Gitee From f1ca14c3ed9637a0f123ef938624d8029d265d84 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 09:46:44 +0800 Subject: [PATCH 031/113] sched: Add debugfs for related thread group codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- /proc/$PID/sched_group_id: 1. write $GROUP_ID to the sched_group_id file to add task (pid = $PID) to related thread group (group_id = $GROUP_ID). 2. 
read the group id which the process is located from the sched_group_id file Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- fs/proc/base.c | 70 ++++++++++++++++++ include/linux/sched/rtg.h | 1 + kernel/sched/rtg/Kconfig | 7 ++ kernel/sched/rtg/rtg.c | 151 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 229 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 96d4ab81619e..0d40f7a2cc4d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1499,6 +1499,70 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_RTG_DEBUG +static int sched_group_id_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + seq_printf(m, "%d\n", sched_get_group_id(p)); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_group_id_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int group_id, err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &group_id); + if (err) + goto out; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = sched_set_group_id(p, group_id); + + put_task_struct(p); + +out: + return err < 0 ? err : count; +} + +static int sched_group_id_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_group_id_show, inode); +} + +static const struct file_operations proc_pid_sched_group_id_operations = { + .open = sched_group_id_open, + .read = seq_read, + .write = sched_group_id_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_RTG_DEBUG */ + #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -3372,6 +3436,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_ACCESS_TOKENID ONE("tokenid", S_IRUSR, proc_token_operations), #endif +#ifdef CONFIG_SCHED_RTG_DEBUG + REG("sched_group_id", S_IRUGO|S_IWUGO, proc_pid_sched_group_id_operations), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3704,6 +3771,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ACCESS_TOKENID ONE("tokenid", S_IRUSR, proc_token_operations), #endif +#ifdef CONFIG_SCHED_RTG_DEBUG + REG("sched_group_id", S_IRUGO|S_IWUGO, proc_pid_sched_group_id_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index 85bd334fa9cc..5da7ef60d8ee 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -18,5 +18,6 @@ struct related_thread_group { }; int sched_set_group_id(struct task_struct *p, unsigned int group_id); +unsigned int sched_get_group_id(struct task_struct *p); #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig index 11a0343d935b..a96073631d16 100644 --- a/kernel/sched/rtg/Kconfig +++ b/kernel/sched/rtg/Kconfig @@ -7,4 +7,11 @@ config SCHED_RTG help Set related threads into a group. 
+config SCHED_RTG_DEBUG + bool "Related Thread Group DebugFS" + depends on SCHED_RTG + default n + help + If set, debug node will show rtg threads + endmenu diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 6d54c48ba36f..a3fb4481bd78 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -12,6 +12,9 @@ struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; static DEFINE_RWLOCK(related_thread_group_lock); static LIST_HEAD(active_related_thread_groups); +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &active_related_thread_groups, list) + void init_task_rtg(struct task_struct *p) { rcu_assign_pointer(p->grp, NULL); @@ -188,6 +191,19 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) return __sched_set_group_id(p, group_id); } +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + void update_group_nr_running(struct task_struct *p, int event) { struct related_thread_group *grp; @@ -215,3 +231,138 @@ void update_group_nr_running(struct task_struct *p, int event) rcu_read_unlock(); } + +#ifdef CONFIG_SCHED_RTG_DEBUG +#define seq_printf_rtg(m, x...) \ +do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ +} while (0) + +static void print_rtg_info(struct seq_file *file, + const struct related_thread_group *grp) +{ + seq_printf_rtg(file, "RTG_ID : %d\n", grp->id); +} + +static char rtg_task_state_to_char(const struct task_struct *tsk) +{ + static const char state_char[] = "RSDTtXZPI"; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return state_char[fls(state)]; +} + +static inline void print_rtg_task_header(struct seq_file *file, + const char *header, int run, int nr) +{ + seq_printf_rtg(file, + "%s : %d/%d\n" + "STATE COMM PID PRIO CPU\n" + "---------------------------------------------------------\n", + header, run, nr); +} + +static inline void print_rtg_task(struct seq_file *file, + const struct task_struct *tsk) +{ + seq_printf_rtg(file, "%5c %15s %5d %5d %5d(%*pbl)\n", + rtg_task_state_to_char(tsk), tsk->comm, tsk->pid, + tsk->prio, task_cpu(tsk), cpumask_pr_args(tsk->cpus_ptr)); +} + +static void print_rtg_threads(struct seq_file *file, + const struct related_thread_group *grp) +{ + struct task_struct *tsk = NULL; + int nr_thread = 0; + + list_for_each_entry(tsk, &grp->tasks, grp_list) + nr_thread++; + + if (!nr_thread) + return; + + print_rtg_task_header(file, "RTG_THREADS", + grp->nr_running, nr_thread); + list_for_each_entry(tsk, &grp->tasks, grp_list) { + if (unlikely(!tsk)) + continue; + get_task_struct(tsk); + print_rtg_task(file, tsk); + put_task_struct(tsk); + } + seq_printf_rtg(file, "---------------------------------------------------------\n"); +} + +static int sched_rtg_debug_show(struct seq_file *file, void *param) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + bool have_task = false; + + for_each_related_thread_group(grp) { + if (unlikely(!grp)) { + seq_printf_rtg(file, "RTG none\n"); + return 0; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + if (list_empty(&grp->tasks)) { + 
raw_spin_unlock_irqrestore(&grp->lock, flags); + continue; + } + + if (!have_task) + have_task = true; + + seq_printf_rtg(file, "\n\n"); + print_rtg_info(file, grp); + print_rtg_threads(file, grp); + raw_spin_unlock_irqrestore(&grp->lock, flags); + } + + if (!have_task) + seq_printf_rtg(file, "RTG tasklist empty\n"); + + return 0; +} + +static int sched_rtg_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + return 0; +} + +static int sched_rtg_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_rtg_debug_show, NULL); +} + +static const struct proc_ops sched_rtg_debug_fops = { + .proc_open = sched_rtg_debug_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = sched_rtg_debug_release, +}; + +static int __init init_sched_rtg_debug_procfs(void) +{ + struct proc_dir_entry *pe = NULL; + + pe = proc_create("sched_rtg_debug", + 0400, NULL, &sched_rtg_debug_fops); + if (unlikely(!pe)) + return -ENOMEM; + return 0; +} +late_initcall(init_sched_rtg_debug_procfs); +#endif -- Gitee From 4adbc9fbbc760de140d78d84eb75869adee30c40 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 11:25:07 +0800 Subject: [PATCH 032/113] sched: Provide independent load tracking for each group codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- 1. record group load in grp->ravg. 2. task's cpu usage is accounted in grp->cpu_time[cpu]->curr/prev_runnable_sum when its ->grp is not NULL, otherwise rq->curr/prev_runnable_sum. Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched.h | 6 + include/linux/sched/rtg.h | 22 +++ include/trace/events/walt.h | 88 ++++++++++++ kernel/sched/rtg/rtg.c | 272 +++++++++++++++++++++++++++++++++++- kernel/sched/rtg/rtg.h | 7 + kernel/sched/sched.h | 3 + kernel/sched/walt.c | 92 ++++++++++-- kernel/sched/walt.h | 11 ++ 8 files changed, 489 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c5e0c99cb3cd..393cdfdfa6d9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -223,6 +223,12 @@ enum task_event { IRQ_UPDATE = 5, }; +/* Note: this need to be in sync with migrate_type_names array */ +enum migrate_types { + GROUP_TO_RQ, + RQ_TO_GROUP, +}; + #ifdef CONFIG_CPU_ISOLATION_OPT extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); extern int sched_isolate_cpu(int cpu); diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index 5da7ef60d8ee..a35114766acb 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -8,6 +8,22 @@ #define DEFAULT_CGROUP_COLOC_ID 1 #define MAX_NUM_CGROUP_COLOC_ID 21 +struct group_cpu_time { + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + +struct group_ravg { + unsigned long curr_window_load; + unsigned long curr_window_exec; + unsigned long prev_window_load; + unsigned long prev_window_exec; + unsigned long normalized_util; +}; + struct related_thread_group { int id; raw_spinlock_t lock; @@ -15,6 +31,12 @@ struct related_thread_group { struct list_head list; unsigned int nr_running; + struct group_ravg ravg; + u64 window_start; + u64 mark_start; + u64 prev_window_time; + /* rtg window information for WALT */ + unsigned int window_size; }; int sched_set_group_id(struct task_struct *p, unsigned int group_id); diff --git 
a/include/trace/events/walt.h b/include/trace/events/walt.h index e5328b75a8bd..9af92c8689b9 100644 --- a/include/trace/events/walt.h +++ b/include/trace/events/walt.h @@ -47,6 +47,43 @@ static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) else return rq->prev_runnable_sum; } + +#ifdef CONFIG_SCHED_RTG +static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return rq->grp_time.nt_curr_runnable_sum; + else + return rq->grp_time.curr_runnable_sum; + else + if (new) + return rq->grp_time.nt_prev_runnable_sum; + else + return rq->grp_time.prev_runnable_sum; +} + +static inline s64 +__get_update_sum(struct rq *rq, enum migrate_types migrate_type, + bool src, bool new, bool curr) +{ + switch (migrate_type) { + case RQ_TO_GROUP: + if (src) + return __rq_update_sum(rq, curr, new); + else + return __grp_update_sum(rq, curr, new); + case GROUP_TO_RQ: + if (src) + return __grp_update_sum(rq, curr, new); + else + return __rq_update_sum(rq, curr, new); + default: + WARN_ON_ONCE(1); + return -1; + } +} +#endif #endif TRACE_EVENT(sched_update_history, @@ -162,6 +199,57 @@ TRACE_EVENT(sched_update_task_ravg, __entry->active_windows) ); +extern const char *migrate_type_names[]; + +#ifdef CONFIG_SCHED_RTG +TRACE_EVENT(sched_migration_update_sum, + + TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct rq *rq), + + TP_ARGS(p, migrate_type, rq), + + TP_STRUCT__entry( + __field(int, tcpu) + __field(int, pid) + __field(enum migrate_types, migrate_type) + __field(s64, src_cs) + __field(s64, src_ps) + __field(s64, dst_cs) + __field(s64, dst_ps) + __field(s64, src_nt_cs) + __field(s64, src_nt_ps) + __field(s64, dst_nt_cs) + __field(s64, dst_nt_ps) + ), + + TP_fast_assign( + __entry->tcpu = task_cpu(p); + __entry->pid = p->pid; + __entry->migrate_type = migrate_type; + __entry->src_cs = __get_update_sum(rq, migrate_type, + true, false, true); + __entry->src_ps = __get_update_sum(rq, migrate_type, + true, false, false); + __entry->dst_cs = __get_update_sum(rq, migrate_type, + false, false, true); + __entry->dst_ps = __get_update_sum(rq, migrate_type, + false, false, false); + __entry->src_nt_cs = __get_update_sum(rq, migrate_type, + true, true, true); + __entry->src_nt_ps = __get_update_sum(rq, migrate_type, + true, true, false); + __entry->dst_nt_cs = __get_update_sum(rq, migrate_type, + false, true, true); + __entry->dst_nt_ps = __get_update_sum(rq, migrate_type, + false, true, false); + ), + + TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld", + __entry->pid, __entry->tcpu, migrate_type_names[__entry->migrate_type], + __entry->src_cs, __entry->src_ps, __entry->dst_cs, __entry->dst_ps, + __entry->src_nt_cs, __entry->src_nt_ps, __entry->dst_nt_cs, __entry->dst_nt_ps) +); +#endif #endif /* _TRACE_WALT_H */ /* This part must be outside protection */ diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index a3fb4481bd78..76d8f366fff5 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -4,9 +4,16 @@ * */ #include +#include #include "../sched.h" #include "rtg.h" +#include "../walt.h" + +#define ADD_TASK 0 +#define REM_TASK 1 + +#define DEFAULT_GROUP_RATE 60 /* 60FPS */ struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; static DEFINE_RWLOCK(related_thread_group_lock); @@ -48,6 +55,7 @@ int alloc_related_thread_groups(void) grp->id = i; INIT_LIST_HEAD(&grp->tasks); 
INIT_LIST_HEAD(&grp->list); + grp->window_size = NSEC_PER_SEC / DEFAULT_GROUP_RATE; raw_spin_lock_init(&grp->lock); related_thread_groups[i] = grp; @@ -69,6 +77,111 @@ int alloc_related_thread_groups(void) return ret; } +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + int cpu = cpu_of(rq); + bool new_task; + int i; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + new_task = is_new_task(p); + + cpu_time = &rq->grp_time; + if (event == ADD_TASK) { + migrate_type = RQ_TO_GROUP; + + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; + *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[cpu]; + *src_nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + rq->window_start, new_task); + + } else { + migrate_type = GROUP_TO_RQ; + + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. + */ + for_each_possible_cpu(i) { + p->ravg.curr_window_cpu[i] = 0; + p->ravg.prev_window_cpu[i] = 0; + } + } + + *dst_curr_runnable_sum += p->ravg.curr_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. 
This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. + */ + p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; + + trace_sched_migration_update_sum(p, migrate_type, rq); +} + static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; @@ -78,6 +191,7 @@ static void remove_task_from_group(struct task_struct *p) unsigned long irqflag; rq = __task_rq_lock(p, &flag); + transfer_busy_time(rq, p->grp, p, REM_TASK); raw_spin_lock_irqsave(&grp->lock, irqflag); list_del_init(&p->grp_list); @@ -121,12 +235,17 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) * reference of p->grp in various hot-paths */ rq = __task_rq_lock(p, &flag); + transfer_busy_time(rq, grp, p, ADD_TASK); raw_spin_lock_irqsave(&grp->lock, irqflag); list_add(&p->grp_list, &grp->tasks); rcu_assign_pointer(p->grp, grp); - if (p->on_cpu) + if (p->on_cpu) { grp->nr_running++; + if (grp->nr_running == 1) + grp->mark_start = max(grp->mark_start, + sched_ktime_clock()); + } raw_spin_unlock_irqrestore(&grp->lock, irqflag); __task_rq_unlock(rq, &flag); @@ -232,6 +351,157 @@ void update_group_nr_running(struct task_struct *p, int event) rcu_read_unlock(); } +int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if (!window_size) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set window size for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + grp->window_size = window_size; + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +void group_time_rollover(struct group_ravg *ravg) +{ + ravg->prev_window_load = ravg->curr_window_load; + ravg->curr_window_load = 0; + ravg->prev_window_exec = ravg->curr_window_exec; + ravg->curr_window_exec = 0; +} + +int sched_set_group_window_rollover(unsigned int grp_id) +{ + struct related_thread_group *grp = NULL; + u64 wallclock; + unsigned long flag; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set window start for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + + wallclock = sched_ktime_clock(); + grp->prev_window_time = wallclock - grp->window_start; + grp->window_start = wallclock; + + group_time_rollover(&grp->ravg); + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +static void add_to_group_time(struct related_thread_group *grp, struct rq *rq, u64 wallclock) +{ + u64 delta_exec, delta_load; + u64 mark_start = grp->mark_start; + u64 window_start = grp->window_start; + + if (unlikely(wallclock <= mark_start)) + return; + + /* per group load tracking in RTG */ + if (likely(mark_start >= window_start)) { + /* + * ws ms wc + * | | | + * V V V + * |---------------| + */ + delta_exec = wallclock - mark_start; + grp->ravg.curr_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.curr_window_load += delta_load; + } else { + /* + * ms ws wc + * | | | + * V V V + * -----|---------- + */ + /* prev window statistic */ + delta_exec = window_start - mark_start; + grp->ravg.prev_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.prev_window_load += delta_load; + + /* curr window statistic */ + delta_exec = 
wallclock - window_start; + grp->ravg.curr_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.curr_window_load += delta_load; + } +} + +static inline void add_to_group_demand(struct related_thread_group *grp, + struct rq *rq, u64 wallclock) +{ + if (unlikely(wallclock <= grp->window_start)) + return; + + add_to_group_time(grp, rq, wallclock); +} + +static int account_busy_for_group_demand(struct task_struct *p, int event) +{ + /* + *No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + if (event == TASK_WAKE || event == TASK_MIGRATE) + return 0; + + return 1; +} + +void update_group_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + struct related_thread_group *grp; + + if (!account_busy_for_group_demand(p, event)) + return; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp) { + rcu_read_unlock(); + return; + } + + raw_spin_lock(&grp->lock); + + if (grp->nr_running == 1) + grp->mark_start = max(grp->mark_start, p->ravg.mark_start); + + add_to_group_demand(grp, rq, wallclock); + + grp->mark_start = wallclock; + + raw_spin_unlock(&grp->lock); + + rcu_read_unlock(); +} + #ifdef CONFIG_SCHED_RTG_DEBUG #define seq_printf_rtg(m, x...) \ do { \ diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index 80661f8b2d32..5970d28cadef 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -14,6 +14,13 @@ int alloc_related_thread_groups(void); struct related_thread_group *lookup_related_thread_group(unsigned int group_id); struct related_thread_group *task_related_thread_group(struct task_struct *p); void update_group_nr_running(struct task_struct *p, int event); +struct rq; +void update_group_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock); +int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size); +int sched_set_group_window_rollover(unsigned int grp_id); +struct group_cpu_time *group_update_cpu_time(struct rq *rq, + struct related_thread_group *grp); #else static inline int alloc_related_thread_groups(void) { return 0; } #endif /* CONFIG_SCHED_RTG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 22ff400d5b08..fdb69a9ad1f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1077,7 +1077,10 @@ struct rq { u64 nt_prev_runnable_sum; u64 cum_window_demand_scaled; struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; +#ifdef CONFIG_SCHED_RTG + struct group_cpu_time grp_time; #endif +#endif /* CONFIG_SCHED_WALT */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index f560321b8691..a2824cc9bc2e 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -32,6 +32,8 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"}; +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", + "RQ_TO_RQ", "GROUP_TO_GROUP"}; #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 #define SCHED_ACCOUNT_WAIT_TIME 1 @@ -476,6 +478,13 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) struct rq *dest_rq = cpu_rq(new_cpu); u64 wallclock; bool new_task; +#ifdef CONFIG_SCHED_RTG + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct 
related_thread_group *grp; +#endif if (!p->on_rq && p->state != TASK_WAKING) return; @@ -513,9 +522,58 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) } new_task = is_new_task(p); +#ifdef CONFIG_SCHED_RTG + /* Protected by rq_lock */ + grp = task_related_thread_group(p); + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. + */ + if (grp) { + struct group_cpu_time *cpu_time; + + cpu_time = &src_rq->grp_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + cpu_time = &dest_rq->grp_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window; + *dst_nt_curr_runnable_sum += + p->ravg.curr_window; + } + } - inter_cluster_migration_fixup(p, new_cpu, - task_cpu(p), new_task); + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= + p->ravg.prev_window; + *dst_nt_prev_runnable_sum += + p->ravg.prev_window; + } + } + } else { +#endif + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); +#ifdef CONFIG_SCHED_RTG + } +#endif if (!same_freq_domain(new_cpu, task_cpu(p))) irq_work_queue(&walt_migration_irq_work); @@ -634,15 +692,6 @@ static void update_history(struct rq *rq, struct task_struct *p, #define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) -static inline u64 scale_exec_time(u64 delta, struct rq *rq) -{ - unsigned long capcurr = capacity_curr_of(cpu_of(rq)); - - delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; - - return delta; -} - static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) { delta = scale_exec_time(delta, rq); @@ -712,6 +761,10 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq, u32 window_size = sched_ravg_window; u64 runtime; +#ifdef CONFIG_SCHED_RTG + update_group_demand(p, rq, event, wallclock); +#endif + new_window = mark_start < window_start; if (!account_busy_for_task_demand(rq, p, event)) { if (new_window) @@ -870,6 +923,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; bool new_task; int cpu = rq->cpu; +#ifdef CONFIG_SCHED_RTG + struct group_cpu_time *cpu_time; + struct related_thread_group *grp; +#endif new_window = mark_start < window_start; if (new_window) { @@ -895,6 +952,19 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (!account_busy_for_cpu_time(rq, p, irqtime, event)) goto done; +#ifdef CONFIG_SCHED_RTG + grp = task_related_thread_group(p); + if (grp) { + cpu_time = &rq->grp_time; + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } +#endif + if (!new_window) { /* * 
account_busy_for_cpu_time() = 1 so busy time needs diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index fcb1555d53f8..84da97ccce20 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -45,6 +45,15 @@ static inline struct sched_cluster *cpu_cluster(int cpu) return cpu_rq(cpu)->cluster; } +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); + + delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; + + return delta; +} + static inline bool is_new_task(struct task_struct *p) { return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; @@ -192,6 +201,8 @@ static inline void assign_cluster_ids(struct list_head *head) } } +extern void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task); #else /* CONFIG_SCHED_WALT */ static inline void walt_sched_init_rq(struct rq *rq) { } -- Gitee From 39ceb100c088bef899f96f2b733fe1f6b90b0564 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 13:19:01 +0800 Subject: [PATCH 033/113] sched: scehd: Introduce sched_update_rtg_tick() ohos inclusion category: feature issue: #I4SULH CVE: NA ------------------------------------------- sched_update_rtg_tick() is called in tick. Signed-off-by: Li Ming --- include/linux/sched/rtg.h | 7 +++++++ kernel/sched/core.c | 3 +++ kernel/sched/rtg/rtg.c | 17 +++++++++++++++++ kernel/sched/rtg/rtg.h | 1 + 4 files changed, 28 insertions(+) diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index a35114766acb..b5cc92fcece9 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -24,6 +24,8 @@ struct group_ravg { unsigned long normalized_util; }; +struct rtg_class; + struct related_thread_group { int id; raw_spinlock_t lock; @@ -37,6 +39,11 @@ struct related_thread_group { u64 prev_window_time; /* rtg window information for WALT */ unsigned int window_size; + const struct rtg_class *rtg_class; +}; + +struct rtg_class { + void (*sched_update_rtg_tick)(struct related_thread_group *grp); }; int sched_set_group_id(struct task_struct *p, unsigned int group_id); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 574c155b9e3a..20dd5009e315 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4124,6 +4124,9 @@ void scheduler_tick(void) rq_unlock(rq, &rf); +#ifdef CONFIG_SCHED_RTG + sched_update_rtg_tick(curr); +#endif perf_event_task_tick(); #ifdef CONFIG_SMP diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 76d8f366fff5..78fbcd1b9cd3 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -502,6 +502,23 @@ void update_group_demand(struct task_struct *p, struct rq *rq, rcu_read_unlock(); } +void sched_update_rtg_tick(struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp || list_empty(&grp->tasks)) { + rcu_read_unlock(); + return; + } + + if (grp->rtg_class && grp->rtg_class->sched_update_rtg_tick) + grp->rtg_class->sched_update_rtg_tick(grp); + + rcu_read_unlock(); +} + #ifdef CONFIG_SCHED_RTG_DEBUG #define seq_printf_rtg(m, x...) 
\ do { \ diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index 5970d28cadef..e32c67aebb96 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -21,6 +21,7 @@ int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size); int sched_set_group_window_rollover(unsigned int grp_id); struct group_cpu_time *group_update_cpu_time(struct rq *rq, struct related_thread_group *grp); +void sched_update_rtg_tick(struct task_struct *p); #else static inline int alloc_related_thread_groups(void) { return 0; } #endif /* CONFIG_SCHED_RTG */ -- Gitee From c18d29d98ab6d78f6ba5a16b4ba980963368624c Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 14:51:40 +0800 Subject: [PATCH 034/113] sched: Introduce preferred cluster to optimize cpu selection for related threads codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- Set the preferred cluster of a group according to the group load and prioritize cpu selection for related threads from the preferred cluster. Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched/rtg.h | 1 + kernel/sched/fair.c | 76 ++++++++++++++-- kernel/sched/rtg/rtg.c | 186 ++++++++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg.h | 12 +++ kernel/sched/sched.h | 13 ++- kernel/sched/walt.h | 6 ++ 6 files changed, 286 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index b5cc92fcece9..eae7f83808ff 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -40,6 +40,7 @@ struct related_thread_group { /* rtg window information for WALT */ unsigned int window_size; const struct rtg_class *rtg_class; + struct sched_cluster *preferred_cluster; }; struct rtg_class { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42d51caa611c..3b8d6c1dfc30 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,6 +22,7 @@ */ #include "sched.h" #include "walt.h" +#include "rtg/rtg.h" #ifdef CONFIG_SCHED_WALT static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, @@ -773,7 +774,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -4104,8 +4104,27 @@ static inline int task_fits_capacity(struct task_struct *p, long capacity) return fits_capacity(uclamp_task_util(p), capacity); } +#ifdef CONFIG_SCHED_RTG +bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_orig_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + return task_fits_capacity(p, capacity); +} +#endif + static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + bool task_fits = false; +#ifdef CONFIG_SCHED_RTG + int cpu = cpu_of(rq); + struct cpumask *rtg_target = NULL; +#endif + if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; @@ -4114,7 +4133,17 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { +#ifdef CONFIG_SCHED_RTG + rtg_target = find_rtg_target(p); + if 
(rtg_target) + task_fits = capacity_orig_of(cpu) >= + capacity_orig_of(cpumask_first(rtg_target)); + else + task_fits = task_fits_capacity(p, capacity_of(cpu_of(rq))); +#else + task_fits = task_fits_capacity(p, capacity_of(cpu_of(rq))); +#endif + if (task_fits) { rq->misfit_task_load = 0; return; } @@ -5805,11 +5834,6 @@ static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) return runnable; } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - static void record_wakee(struct task_struct *p) { /* @@ -6574,6 +6598,12 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) return min_t(unsigned long, util, capacity_orig_of(cpu)); } +#ifdef CONFIG_SCHED_RTG +unsigned long capacity_spare_without(int cpu, struct task_struct *p) +{ + return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); +} +#endif /* * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) * to @dst_cpu. @@ -6840,6 +6870,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_SCHED_RTG + int target_cpu = -1; + target_cpu = find_rtg_cpu(p); + if (target_cpu >= 0) + return target_cpu; +#endif if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -7524,6 +7560,7 @@ enum migration_type { #define LBF_SOME_PINNED 0x08 #define LBF_NOHZ_STATS 0x10 #define LBF_NOHZ_AGAIN 0x20 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 struct lb_env { struct sched_domain *sd; @@ -7706,6 +7743,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; + +#ifdef CONFIG_SCHED_RTG + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && + !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p)) + return 0; +#endif + if (task_running(env->src_rq, p)) { schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; @@ -7798,12 +7842,21 @@ static int detach_tasks(struct lb_env *env) unsigned long util, load; struct task_struct *p; int detached = 0; +#ifdef CONFIG_SCHED_RTG + int orig_loop = env->loop; +#endif lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; +#ifdef CONFIG_SCHED_RTG + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + +redo: +#endif while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -7905,6 +7958,15 @@ static int detach_tasks(struct lb_env *env) list_move(&p->se.group_node, tasks); } +#ifdef CONFIG_SCHED_RTG + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && !detached) { + tasks = &env->src_rq->cfs_tasks; + env->flags &= ~LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + env->loop = orig_loop; + goto redo; + } +#endif + /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 78fbcd1b9cd3..200895617a71 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -4,6 +4,7 @@ * */ #include +#include #include #include "../sched.h" @@ -182,6 +183,8 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, trace_sched_migration_update_sum(p, migrate_type, rq); } +static void _set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id); static void 
remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; @@ -207,6 +210,8 @@ static void remove_task_from_group(struct task_struct *p) if (!list_empty(&grp->tasks)) empty_group = false; + else + _set_preferred_cluster(grp, -1); raw_spin_unlock_irqrestore(&grp->lock, irqflag); __task_rq_unlock(rq, &flag); @@ -519,6 +524,185 @@ void sched_update_rtg_tick(struct task_struct *p) rcu_read_unlock(); } +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + int rc = 1; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (grp != NULL) + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +unsigned int get_cluster_grp_running(int cluster_id) +{ + struct related_thread_group *grp = NULL; + unsigned int total_grp_running = 0; + unsigned long flag, rtg_flag; + unsigned int i; + + read_lock_irqsave(&related_thread_group_lock, rtg_flag); + + /* grp_id 0 is used for exited tasks */ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (!grp) + continue; + + raw_spin_lock_irqsave(&grp->lock, flag); + if (grp->preferred_cluster != NULL && + grp->preferred_cluster->id == cluster_id) + total_grp_running += grp->nr_running; + raw_spin_unlock_irqrestore(&grp->lock, flag); + } + read_unlock_irqrestore(&related_thread_group_lock, rtg_flag); + + return total_grp_running; +} + +static void _set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id) +{ + struct sched_cluster *cluster = NULL; + struct sched_cluster *cluster_found = NULL; + + if (sched_cluster_id == -1) { + grp->preferred_cluster = NULL; + return; + } + + for_each_sched_cluster_reverse(cluster) { + if (cluster->id == sched_cluster_id) { + cluster_found = cluster; + break; + } + } + + if (cluster_found != NULL) + grp->preferred_cluster = cluster_found; + else + pr_err("cannot found sched_cluster_id=%d\n", sched_cluster_id); +} + +/* + * sched_cluster_id == -1: grp will set to NULL + */ +static void set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id) +{ + unsigned long flag; + + raw_spin_lock_irqsave(&grp->lock, flag); + _set_preferred_cluster(grp, sched_cluster_id); + raw_spin_unlock_irqrestore(&grp->lock, flag); +} + +int sched_set_group_preferred_cluster(unsigned int grp_id, int sched_cluster_id) +{ + struct related_thread_group *grp = NULL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set preferred cluster for group %d fail\n", grp_id); + return -ENODEV; + } + set_preferred_cluster(grp, sched_cluster_id); + + return 0; +} + +struct cpumask *find_rtg_target(struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + struct sched_cluster *preferred_cluster = NULL; + struct cpumask *rtg_target = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + rcu_read_unlock(); + + if (!grp) + return NULL; + + preferred_cluster = grp->preferred_cluster; + if (!preferred_cluster) + return NULL; + + rtg_target = &preferred_cluster->cpus; + if (!task_fits_max(p, cpumask_first(rtg_target))) + return NULL; + + return rtg_target; +} + +int find_rtg_cpu(struct task_struct *p) +{ + int i; + cpumask_t search_cpus = CPU_MASK_NONE; + int max_spare_cap_cpu = -1; + unsigned long max_spare_cap = 0; + int idle_backup_cpu = -1; + struct cpumask 
*preferred_cpus = find_rtg_target(p); + + if (!preferred_cpus) + return -1; + + cpumask_and(&search_cpus, p->cpus_ptr, cpu_online_mask); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); +#endif + + /* search the perferred idle cpu */ + for_each_cpu_and(i, &search_cpus, preferred_cpus) { + if (is_reserved(i)) + continue; + + if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) + return i; + } + + for_each_cpu(i, &search_cpus) { + unsigned long spare_cap; + + if (sched_cpu_high_irqload(i)) + continue; + + if (is_reserved(i)) + continue; + + /* take the Active LB CPU as idle_backup_cpu */ + if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) { + /* find the idle_backup_cpu with max capacity */ + if (idle_backup_cpu == -1 || + capacity_orig_of(i) > capacity_orig_of(idle_backup_cpu)) + idle_backup_cpu = i; + + continue; + } + + spare_cap = capacity_spare_without(i, p); + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = i; + } + } + + if (idle_backup_cpu != -1) + return idle_backup_cpu; + + return max_spare_cap_cpu; +} + #ifdef CONFIG_SCHED_RTG_DEBUG #define seq_printf_rtg(m, x...) \ do { \ @@ -532,6 +716,8 @@ static void print_rtg_info(struct seq_file *file, const struct related_thread_group *grp) { seq_printf_rtg(file, "RTG_ID : %d\n", grp->id); + seq_printf_rtg(file, "RTG_CLUSTER : %d\n", + grp->preferred_cluster ? grp->preferred_cluster->id : -1); } static char rtg_task_state_to_char(const struct task_struct *tsk) diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index e32c67aebb96..a158ab74f292 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -8,6 +8,9 @@ #include #include +#define for_each_sched_cluster_reverse(cluster) \ + list_for_each_entry_reverse(cluster, &cluster_head, list) + #ifdef CONFIG_SCHED_RTG void init_task_rtg(struct task_struct *p); int alloc_related_thread_groups(void); @@ -22,7 +25,16 @@ int sched_set_group_window_rollover(unsigned int grp_id); struct group_cpu_time *group_update_cpu_time(struct rq *rq, struct related_thread_group *grp); void sched_update_rtg_tick(struct task_struct *p); +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p); +int sched_set_group_preferred_cluster(unsigned int grp_id, int sched_cluster_id); +struct cpumask *find_rtg_target(struct task_struct *p); +int find_rtg_cpu(struct task_struct *p); #else static inline int alloc_related_thread_groups(void) { return 0; } +static inline int sched_set_group_preferred_cluster(unsigned int grp_id, + int sched_cluster_id) +{ + return 0; +} #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fdb69a9ad1f9..9630e3c00558 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,7 +477,6 @@ struct task_group { /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif - }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2594,6 +2593,11 @@ static inline bool uclamp_is_used(void) #endif #ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + static inline unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; @@ -2748,6 +2752,13 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +#ifdef CONFIG_SCHED_RTG +extern bool task_fits_max(struct 
task_struct *p, int cpu); +extern unsigned long capacity_spare_without(int cpu, struct task_struct *p); +extern int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load, bool from_tick); +#endif + #ifdef CONFIG_SCHED_WALT static inline int cluster_first_cpu(struct sched_cluster *cluster) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index 84da97ccce20..a1fba5b65640 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -45,6 +45,11 @@ static inline struct sched_cluster *cpu_cluster(int cpu) return cpu_rq(cpu)->cluster; } +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + static inline u64 scale_exec_time(u64 delta, struct rq *rq) { unsigned long capcurr = capacity_curr_of(cpu_of(rq)); @@ -243,6 +248,7 @@ static inline int sched_cpu_high_irqload(int cpu) { return 0; } +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } #endif /* CONFIG_SCHED_WALT */ #endif /* __WALT_H */ -- Gitee From ffa3ddb88dd6f59899a000ac6a98e19a37092371 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 20:44:20 +0800 Subject: [PATCH 035/113] sched: Add interfaces for normalized utilization of related thread group codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- Use normalized util as RTG util and support the RTG util invalid interval adjustable. Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched/rtg.h | 4 + kernel/sched/rtg/rtg.c | 214 +++++++++++++++++++++++++++++++++++++- kernel/sched/rtg/rtg.h | 11 +- kernel/sched/walt.c | 2 +- 4 files changed, 226 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index eae7f83808ff..d27e1507e334 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -41,6 +41,10 @@ struct related_thread_group { unsigned int window_size; const struct rtg_class *rtg_class; struct sched_cluster *preferred_cluster; + int max_boost; + unsigned long util_invalid_interval; /* in nanoseconds */ + unsigned long util_update_timeout; /* in nanoseconds */ + u64 last_util_update_time; }; struct rtg_class { diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 200895617a71..016b2143ea8c 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -15,6 +15,8 @@ #define REM_TASK 1 #define DEFAULT_GROUP_RATE 60 /* 60FPS */ +#define DEFAULT_UTIL_INVALID_INTERVAL (~0U) /* ns */ +#define DEFAULT_UTIL_UPDATE_TIMEOUT 20000000 /* ns */ struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; static DEFINE_RWLOCK(related_thread_group_lock); @@ -57,6 +59,9 @@ int alloc_related_thread_groups(void) INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); grp->window_size = NSEC_PER_SEC / DEFAULT_GROUP_RATE; + grp->util_invalid_interval = DEFAULT_UTIL_INVALID_INTERVAL; + grp->util_update_timeout = DEFAULT_UTIL_UPDATE_TIMEOUT; + grp->max_boost = 0; raw_spin_lock_init(&grp->lock); related_thread_groups[i] = grp; @@ -208,10 +213,15 @@ static void remove_task_from_group(struct task_struct *p) grp->nr_running = 0; } - if (!list_empty(&grp->tasks)) + if (!list_empty(&grp->tasks)) { empty_group = false; - else + } else { +#ifdef CONFIG_UCLAMP_TASK + grp->max_boost = 0; +#endif _set_preferred_cluster(grp, -1); + grp->ravg.normalized_util = 0; + } raw_spin_unlock_irqrestore(&grp->lock, irqflag); 
__task_rq_unlock(rq, &flag); @@ -234,6 +244,9 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) struct rq *rq = NULL; struct rq_flags flag; unsigned long irqflag; +#ifdef CONFIG_UCLAMP_TASK + int boost; +#endif /* * Change p->grp under rq->lock. Will prevent races with read-side @@ -252,6 +265,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) sched_ktime_clock()); } +#ifdef CONFIG_UCLAMP_TASK + boost = (int)uclamp_eff_value(p, UCLAMP_MIN); + if (boost > grp->max_boost) + grp->max_boost = boost; +#endif raw_spin_unlock_irqrestore(&grp->lock, irqflag); __task_rq_unlock(rq, &flag); @@ -328,9 +346,10 @@ unsigned int sched_get_group_id(struct task_struct *p) return group_id; } -void update_group_nr_running(struct task_struct *p, int event) +void update_group_nr_running(struct task_struct *p, int event, u64 wallclock) { struct related_thread_group *grp; + bool need_update = false; rcu_read_lock(); grp = task_related_thread_group(p); @@ -351,9 +370,17 @@ void update_group_nr_running(struct task_struct *p, int event) grp->nr_running = 0; } + /* update preferred cluster if no update long */ + if (wallclock - grp->last_util_update_time > grp->util_update_timeout) + need_update = true; + raw_spin_unlock(&grp->lock); rcu_read_unlock(); + + if (need_update && grp->rtg_class && grp->rtg_class->sched_update_rtg_tick && + grp->id != DEFAULT_CGROUP_COLOC_ID) + grp->rtg_class->sched_update_rtg_tick(grp); } int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size) @@ -390,6 +417,10 @@ int sched_set_group_window_rollover(unsigned int grp_id) struct related_thread_group *grp = NULL; u64 wallclock; unsigned long flag; +#ifdef CONFIG_UCLAMP_TASK + struct task_struct *p = NULL; + int boost; +#endif grp = lookup_related_thread_group(grp_id); if (!grp) { @@ -402,6 +433,15 @@ int sched_set_group_window_rollover(unsigned int grp_id) wallclock = sched_ktime_clock(); grp->prev_window_time = wallclock - grp->window_start; grp->window_start = wallclock; + grp->max_boost = 0; + +#ifdef CONFIG_UCLAMP_TASK + list_for_each_entry(p, &grp->tasks, grp_list) { + boost = (int)uclamp_eff_value(p, UCLAMP_MIN); + if (boost > 0) + grp->max_boost = boost; + } +#endif group_time_rollover(&grp->ravg); raw_spin_unlock_irqrestore(&grp->lock, flag); @@ -703,6 +743,172 @@ int find_rtg_cpu(struct task_struct *p) return max_spare_cap_cpu; } +int sched_set_group_util_invalid_interval(unsigned int grp_id, + unsigned int interval) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if (interval == 0) + return -EINVAL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set invalid interval for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + if ((signed int)interval < 0) + grp->util_invalid_interval = DEFAULT_UTIL_INVALID_INTERVAL; + else + grp->util_invalid_interval = interval * NSEC_PER_MSEC; + + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +static inline bool +group_should_invalid_util(struct related_thread_group *grp, u64 now) +{ + if (grp->util_invalid_interval == DEFAULT_UTIL_INVALID_INTERVAL) + return false; + + return true; +} + +static inline bool valid_normalized_util(struct related_thread_group *grp) +{ + struct task_struct *p = NULL; + cpumask_t rtg_cpus = CPU_MASK_NONE; + bool valid = false; + + if 
(grp->nr_running != 0) { + list_for_each_entry(p, &grp->tasks, grp_list) { + get_task_struct(p); + if (p->state == TASK_RUNNING) + cpumask_set_cpu(task_cpu(p), &rtg_cpus); + put_task_struct(p); + } + + valid = cpumask_intersects(&rtg_cpus, + &grp->preferred_cluster->cpus); + } + + return valid; +} + +void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq) +{ + struct related_thread_group *grp = NULL; + unsigned long max_grp_util = 0; + unsigned int max_grp_freq = 0; + u64 now = ktime_get_ns(); + unsigned long rtg_flag; + unsigned long flag; + + /* + * sum the prev_runnable_sum for each rtg, + * return the max rtg->load + */ + read_lock_irqsave(&related_thread_group_lock, rtg_flag); + if (list_empty(&active_related_thread_groups)) + goto unlock; + + for_each_related_thread_group(grp) { + raw_spin_lock_irqsave(&grp->lock, flag); + if (!list_empty(&grp->tasks) && + grp->preferred_cluster != NULL && + cpumask_intersects(query_cpus, + &grp->preferred_cluster->cpus) && + !group_should_invalid_util(grp, now)) { + + if (grp->ravg.normalized_util > max_grp_util && + valid_normalized_util(grp)) + max_grp_util = grp->ravg.normalized_util; + } + raw_spin_unlock_irqrestore(&grp->lock, flag); + } + +unlock: + read_unlock_irqrestore(&related_thread_group_lock, rtg_flag); + + *freq = max_grp_freq; + *util = max_grp_util; +} + +static struct sched_cluster *best_cluster(struct related_thread_group *grp) +{ + struct sched_cluster *cluster = NULL; + struct sched_cluster *max_cluster = NULL; + int cpu; + unsigned long util = grp->ravg.normalized_util; + unsigned long boosted_grp_util = util + grp->max_boost; + unsigned long max_cap = 0; + unsigned long cap = 0; + + /* find new cluster */ + for_each_sched_cluster(cluster) { + cpu = cpumask_first(&cluster->cpus); + cap = capacity_orig_of(cpu); + if (cap > max_cap) { + max_cap = cap; + max_cluster = cluster; + } + + if (boosted_grp_util <= cap) + return cluster; + } + + return max_cluster; +} + +int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag) +{ + struct related_thread_group *grp = NULL; + u64 now; + unsigned long flags; + struct sched_cluster *preferred_cluster = NULL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set normalized util for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + + if (list_empty(&grp->tasks)) { + raw_spin_unlock_irqrestore(&grp->lock, flags); + return 0; + } + + grp->ravg.normalized_util = util; + + preferred_cluster = best_cluster(grp); + + /* update prev_cluster force when preferred_cluster changed */ + if (!grp->preferred_cluster) + grp->preferred_cluster = preferred_cluster; + else if (grp->preferred_cluster != preferred_cluster) + grp->preferred_cluster = preferred_cluster; + + now = ktime_get_ns(); + grp->last_util_update_time = now; + + raw_spin_unlock_irqrestore(&grp->lock, flags); + + return 0; +} + #ifdef CONFIG_SCHED_RTG_DEBUG #define seq_printf_rtg(m, x...) \ do { \ @@ -716,6 +922,8 @@ static void print_rtg_info(struct seq_file *file, const struct related_thread_group *grp) { seq_printf_rtg(file, "RTG_ID : %d\n", grp->id); + seq_printf_rtg(file, "RTG_INTERVAL : INVALID:%lums\n", + grp->util_invalid_interval / NSEC_PER_MSEC); seq_printf_rtg(file, "RTG_CLUSTER : %d\n", grp->preferred_cluster ? 
grp->preferred_cluster->id : -1); } diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index a158ab74f292..abd70d449ddb 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -16,7 +16,7 @@ void init_task_rtg(struct task_struct *p); int alloc_related_thread_groups(void); struct related_thread_group *lookup_related_thread_group(unsigned int group_id); struct related_thread_group *task_related_thread_group(struct task_struct *p); -void update_group_nr_running(struct task_struct *p, int event); +void update_group_nr_running(struct task_struct *p, int event, u64 wallclock); struct rq; void update_group_demand(struct task_struct *p, struct rq *rq, int event, u64 wallclock); @@ -29,6 +29,10 @@ int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p); int sched_set_group_preferred_cluster(unsigned int grp_id, int sched_cluster_id); struct cpumask *find_rtg_target(struct task_struct *p); int find_rtg_cpu(struct task_struct *p); +int sched_set_group_util_invalid_interval(unsigned int grp_id, + unsigned int interval); +int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag); #else static inline int alloc_related_thread_groups(void) { return 0; } static inline int sched_set_group_preferred_cluster(unsigned int grp_id, @@ -36,5 +40,10 @@ static inline int sched_set_group_preferred_cluster(unsigned int grp_id, { return 0; } +static inline int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag) +{ + return 0; +} #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index a2824cc9bc2e..40515b1bbdb7 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -1180,7 +1180,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, old_window_start = update_window_start(rq, wallclock, event); #ifdef CONFIG_SCHED_RTG - update_group_nr_running(p, event); + update_group_nr_running(p, event, wallclock); #endif if (!p->ravg.mark_start) goto done; -- Gitee From 9559e7e57b517576f3ed5890a6ceeebbdc9b012d Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 14:55:36 +0800 Subject: [PATCH 036/113] sched: Add debugfs for sched cluster codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- Show the information of sched cluster in /proc/sched_cluster. 
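For illustration, on a hypothetical two-cluster board the new node would print something along these lines (the values are invented; the layout simply follows the seq_printf() format strings in sched_cluster_debug_show() below):

    min_id:0, max_id:1
    id:0, cpumask:0(0-3)
    id:1, cpumask:4(4-7)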
Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- kernel/sched/walt.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 40515b1bbdb7..8d4c79028d8a 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -1813,3 +1813,49 @@ void walt_sched_init_rq(struct rq *rq) for (j = 0; j < NUM_TRACKED_WINDOWS; j++) memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions)); } + +#define min_cap_cluster() \ + list_first_entry(&cluster_head, struct sched_cluster, list) +#define max_cap_cluster() \ + list_last_entry(&cluster_head, struct sched_cluster, list) +static int sched_cluster_debug_show(struct seq_file *file, void *param) +{ + struct sched_cluster *cluster = NULL; + + seq_printf(file, "min_id:%d, max_id:%d\n", + min_cap_cluster()->id, + max_cap_cluster()->id); + + for_each_sched_cluster(cluster) { + seq_printf(file, "id:%d, cpumask:%d(%*pbl)\n", + cluster->id, + cpumask_first(&cluster->cpus), + cpumask_pr_args(&cluster->cpus)); + } + + return 0; +} + +static int sched_cluster_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_cluster_debug_show, NULL); +} + +static const struct proc_ops sched_cluster_fops = { + .proc_open = sched_cluster_debug_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init init_sched_cluster_debug_procfs(void) +{ + struct proc_dir_entry *pe = NULL; + + pe = proc_create("sched_cluster", + 0444, NULL, &sched_cluster_fops); + if (!pe) + return -ENOMEM; + return 0; +} +late_initcall(init_sched_cluster_debug_procfs); -- Gitee From b7f2b5a8b7a4f1d479b0960a49480912e2783ef8 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 18:01:31 +0800 Subject: [PATCH 037/113] sched: Support forced adjustment of CPU frequency according to the group util codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- Add SCHED_CPUFREQ_FORCE_UPDATE flag to support skip CPU frequency scaling interval (rate_limit_us is 20ms by default) check. 
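As a rough sketch of the intended call pattern (taken from the rtg.c hunk below; 'cpu' here is an illustrative variable, not something defined by this patch):

	/* Ask schedutil to re-evaluate the frequency of 'cpu' immediately,
	 * bypassing the sugov_should_update_freq() rate limit.
	 */
	cpufreq_update_util(cpu_rq(cpu),
			    SCHED_CPUFREQ_FORCE_UPDATE | SCHED_CPUFREQ_WALT);

In sugov_update_single()/sugov_update_shared() the flag is folded into force_update, so the frequency is re-evaluated even inside the rate_limit_us window.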
Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/linux/sched/cpufreq.h | 1 + include/linux/sched/rtg.h | 7 +++ kernel/sched/cpufreq_schedutil.c | 26 +++++++++-- kernel/sched/rtg/rtg.c | 79 ++++++++++++++++++++++++++++++-- kernel/sched/rtg/rtg.h | 8 ++++ 5 files changed, 114 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index c7cf63236f5b..94e7f84de227 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -11,6 +11,7 @@ #define SCHED_CPUFREQ_IOWAIT (1U << 0) #define SCHED_CPUFREQ_WALT (1U << 1) #define SCHED_CPUFREQ_CONTINUE (1U << 2) +#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 3) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index d27e1507e334..735b8ccae745 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -44,13 +44,20 @@ struct related_thread_group { int max_boost; unsigned long util_invalid_interval; /* in nanoseconds */ unsigned long util_update_timeout; /* in nanoseconds */ + unsigned long freq_update_interval; /* in nanoseconds */ u64 last_util_update_time; + u64 last_freq_update_time; }; struct rtg_class { void (*sched_update_rtg_tick)(struct related_thread_group *grp); }; +enum rtg_freq_update_flags { + RTG_FREQ_FORCE_UPDATE = (1 << 0), + RTG_FREQ_NORMAL_UPDATE = (1 << 1), +}; + int sched_set_group_id(struct task_struct *p, unsigned int group_id); unsigned int sched_get_group_id(struct task_struct *p); #endif /* CONFIG_SCHED_RTG */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index cb72dc5c2002..742ed2fe50de 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "sched.h" +#include "rtg/rtg.h" #include #include @@ -38,6 +39,10 @@ struct sugov_policy { struct mutex work_lock; struct kthread_worker worker; struct task_struct *thread; +#ifdef CONFIG_SCHED_RTG + unsigned long rtg_util; + unsigned int rtg_freq; +#endif bool work_in_progress; bool limits_changed; @@ -448,13 +453,18 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; unsigned int cached_freq = sg_policy->cached_raw_freq; + bool force_update = false; + +#ifdef CONFIG_SCHED_RTG + force_update = flags & SCHED_CPUFREQ_FORCE_UPDATE; +#endif sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); - if (!sugov_should_update_freq(sg_policy, time)) + if (!force_update && !sugov_should_update_freq(sg_policy, time)) return; util = sugov_get_util(sg_cpu); @@ -507,6 +517,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) } } +#ifdef CONFIG_SCHED_RTG + sched_get_max_group_util(policy->cpus, &sg_policy->rtg_util, &sg_policy->rtg_freq); + util = max(sg_policy->rtg_util, util); +#endif + return get_next_freq(sg_policy, util, max); } @@ -516,7 +531,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int next_f; + bool force_update = false; +#ifdef CONFIG_SCHED_RTG + force_update = flags & SCHED_CPUFREQ_FORCE_UPDATE; +#endif raw_spin_lock(&sg_policy->update_lock); sugov_iowait_boost(sg_cpu, time, flags); @@ -525,9 +544,10 @@ 
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) ignore_dl_rate_limit(sg_cpu, sg_policy); #ifdef CONFIG_SCHED_WALT - if (sugov_should_update_freq(sg_policy, time) && !(flags & SCHED_CPUFREQ_CONTINUE)) { + if ((force_update || sugov_should_update_freq(sg_policy, time)) + && !(flags & SCHED_CPUFREQ_CONTINUE)) { #else - if (sugov_should_update_freq(sg_policy, time)) { + if (force_update || sugov_should_update_freq(sg_policy, time)) { #endif next_f = sugov_next_freq_shared(sg_cpu, time); diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 016b2143ea8c..51b9c3fad7da 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -17,6 +17,7 @@ #define DEFAULT_GROUP_RATE 60 /* 60FPS */ #define DEFAULT_UTIL_INVALID_INTERVAL (~0U) /* ns */ #define DEFAULT_UTIL_UPDATE_TIMEOUT 20000000 /* ns */ +#define DEFAULT_FREQ_UPDATE_INTERVAL 8000000 /* ns */ struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; static DEFINE_RWLOCK(related_thread_group_lock); @@ -62,6 +63,7 @@ int alloc_related_thread_groups(void) grp->util_invalid_interval = DEFAULT_UTIL_INVALID_INTERVAL; grp->util_update_timeout = DEFAULT_UTIL_UPDATE_TIMEOUT; grp->max_boost = 0; + grp->freq_update_interval = DEFAULT_FREQ_UPDATE_INTERVAL; raw_spin_lock_init(&grp->lock); related_thread_groups[i] = grp; @@ -780,7 +782,7 @@ group_should_invalid_util(struct related_thread_group *grp, u64 now) if (grp->util_invalid_interval == DEFAULT_UTIL_INVALID_INTERVAL) return false; - return true; + return (now - grp->last_freq_update_time >= grp->util_invalid_interval); } static inline bool valid_normalized_util(struct related_thread_group *grp) @@ -870,13 +872,34 @@ static struct sched_cluster *best_cluster(struct related_thread_group *grp) return max_cluster; } +static bool group_should_update_freq(struct related_thread_group *grp, + int cpu, unsigned int flags, u64 now) +{ + if (!grp) + return true; + + if (flags & RTG_FREQ_FORCE_UPDATE) { + return true; + } else if (flags & RTG_FREQ_NORMAL_UPDATE) { + if (now - grp->last_freq_update_time >= + grp->freq_update_interval) + return true; + } + + return false; +} + int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, unsigned int flag) { struct related_thread_group *grp = NULL; + bool need_update_prev_freq = false; + bool need_update_next_freq = false; u64 now; unsigned long flags; struct sched_cluster *preferred_cluster = NULL; + int prev_cpu; + int next_cpu; grp = lookup_related_thread_group(grp_id); if (!grp) { @@ -896,16 +919,63 @@ int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, preferred_cluster = best_cluster(grp); /* update prev_cluster force when preferred_cluster changed */ - if (!grp->preferred_cluster) + if (!grp->preferred_cluster) { grp->preferred_cluster = preferred_cluster; - else if (grp->preferred_cluster != preferred_cluster) + } else if (grp->preferred_cluster != preferred_cluster) { + prev_cpu = cpumask_first(&grp->preferred_cluster->cpus); grp->preferred_cluster = preferred_cluster; + need_update_prev_freq = true; + } + + if (grp->preferred_cluster != NULL) + next_cpu = cpumask_first(&grp->preferred_cluster->cpus); + else + next_cpu = 0; + now = ktime_get_ns(); grp->last_util_update_time = now; + need_update_next_freq = + group_should_update_freq(grp, next_cpu, flag, now); + if (need_update_next_freq) + grp->last_freq_update_time = now; raw_spin_unlock_irqrestore(&grp->lock, flags); + if (need_update_prev_freq) + cpufreq_update_util(cpu_rq(prev_cpu), + 
SCHED_CPUFREQ_FORCE_UPDATE | SCHED_CPUFREQ_WALT); + + if (need_update_next_freq) + cpufreq_update_util(cpu_rq(next_cpu), + SCHED_CPUFREQ_FORCE_UPDATE | SCHED_CPUFREQ_WALT); + + return 0; +} + +int sched_set_group_freq_update_interval(unsigned int grp_id, unsigned int interval) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if ((signed int)interval <= 0) + return -EINVAL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set update interval for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + grp->freq_update_interval = interval * NSEC_PER_MSEC; + raw_spin_unlock_irqrestore(&grp->lock, flag); + return 0; } @@ -922,7 +992,8 @@ static void print_rtg_info(struct seq_file *file, const struct related_thread_group *grp) { seq_printf_rtg(file, "RTG_ID : %d\n", grp->id); - seq_printf_rtg(file, "RTG_INTERVAL : INVALID:%lums\n", + seq_printf_rtg(file, "RTG_INTERVAL : UPDATE:%lums#INVALID:%lums\n", + grp->freq_update_interval / NSEC_PER_MSEC, grp->util_invalid_interval / NSEC_PER_MSEC); seq_printf_rtg(file, "RTG_CLUSTER : %d\n", grp->preferred_cluster ? grp->preferred_cluster->id : -1); diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index abd70d449ddb..23536c62859a 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -33,6 +33,10 @@ int sched_set_group_util_invalid_interval(unsigned int grp_id, unsigned int interval); int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, unsigned int flag); +void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq); +int sched_set_group_freq_update_interval(unsigned int grp_id, + unsigned int interval); #else static inline int alloc_related_thread_groups(void) { return 0; } static inline int sched_set_group_preferred_cluster(unsigned int grp_id, @@ -45,5 +49,9 @@ static inline int sched_set_group_normalized_util(unsigned int grp_id, unsigned { return 0; } +static inline void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq) +{ +} #endif /* CONFIG_SCHED_RTG */ #endif -- Gitee From e2a541d6c6b538ab88da0a273e3faba31662a9cd Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 18:38:18 +0800 Subject: [PATCH 038/113] sched: Support adding new tasks to the default group via cgroup attach codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- If uclamp.colocate of cpu cgroup is set, the new task which belongs to cgroup will be added to the default group (group_id = DEFAULT_CGROUP_COLOC_ID). 
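A simplified sketch of the resulting flow (for orientation only; the cgroup path is illustrative and depends on where the cpu controller is mounted):

	/*
	 * Writing 1 to <cpu cgroup>/cpu.uclamp.colocate sets tg->colocate
	 * through sched_colocate_write().  After that:
	 *
	 *   wake_up_new_task(p)
	 *     -> add_new_task_to_grp(p)
	 *          if (uclamp_task_colocated(p))
	 *                  attach p to the DEFAULT_CGROUP_COLOC_ID group
	 *
	 * Tasks moved into the cgroup later are handled on cgroup attach via
	 * schedgp_attach() -> sync_cgroup_colocation().
	 */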
Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- kernel/sched/core.c | 60 +++++++++++++++++++ kernel/sched/rtg/Kconfig | 8 +++ kernel/sched/rtg/rtg.c | 125 +++++++++++++++++++++++++++++++++++++-- kernel/sched/rtg/rtg.h | 7 +++ kernel/sched/sched.h | 11 ++++ 5 files changed, 207 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20dd5009e315..8e506f6efc73 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3452,6 +3452,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + add_new_task_to_grp(p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* @@ -8060,6 +8062,11 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_SCHED_RTG_CGROUP + tg->colocate = false; + tg->colocate_update_disabled = false; +#endif + return &tg->css; } @@ -8149,6 +8156,25 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) return ret; } +#if defined(CONFIG_UCLAMP_TASK_GROUP) && defined(CONFIG_SCHED_RTG_CGROUP) +static void schedgp_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + bool colocate; + struct task_group *tg; + + cgroup_taskset_first(tset, &css); + tg = css_tg(css); + + colocate = tg->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} +#else +static void schedgp_attach(struct cgroup_taskset *tset) { } +#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -8156,6 +8182,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + schedgp_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -8333,6 +8361,30 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) cpu_uclamp_print(sf, UCLAMP_MAX); return 0; } + +#ifdef CONFIG_SCHED_RTG_CGROUP +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return (u64) tg->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct task_group *tg = css_tg(css); + + if (tg->colocate_update_disabled) + return -EPERM; + + tg->colocate = !!colocate; + tg->colocate_update_disabled = true; + + return 0; +} +#endif /* CONFIG_SCHED_RTG_CGROUP */ #endif /* CONFIG_UCLAMP_TASK_GROUP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8701,6 +8753,14 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, +#ifdef CONFIG_SCHED_RTG_CGROUP + { + .name = "uclamp.colocate", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif #endif { } /* Terminate */ }; diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig index a96073631d16..3e5acad17ac5 100644 --- a/kernel/sched/rtg/Kconfig +++ b/kernel/sched/rtg/Kconfig @@ -14,4 +14,12 @@ config SCHED_RTG_DEBUG help If set, debug node will show rtg threads +config SCHED_RTG_CGROUP + bool "enable DEFAULT_CGROUP_COLOC RTG" + depends on SCHED_RTG + default n + help + If set, support for adding the tasks which belong to + co-located cgroup to DEFAULT_CGROUP_COLOC RTG. 
+ endmenu diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 51b9c3fad7da..e2cf2cdab65c 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -301,10 +301,18 @@ static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) * In other cases, Switching from one group to another directly is not permitted. */ if (old_grp && group_id) { - pr_err("%s[%d] switching group from %d to %d failed.\n", - p->comm, p->pid, old_grp->id, group_id); - rc = -EINVAL; - goto done; +#ifdef CONFIG_SCHED_RTG_CGROUP + if (old_grp->id == DEFAULT_CGROUP_COLOC_ID) { + remove_task_from_group(p); + } else { +#endif + pr_err("%s[%d] switching group from %d to %d failed.\n", + p->comm, p->pid, old_grp->id, group_id); + rc = -EINVAL; + goto done; +#ifdef CONFIG_SCHED_RTG_CGROUP + } +#endif } if (!group_id) { @@ -979,6 +987,115 @@ int sched_set_group_freq_update_interval(unsigned int grp_id, unsigned int inter return 0; } +#ifdef CONFIG_SCHED_RTG_CGROUP +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + struct task_group *tg; + bool colocate; + + rcu_read_lock(); + css = task_css(p, cpu_cgrp_id); + if (!css) { + rcu_read_unlock(); + return false; + } + tg = container_of(css, struct task_group, css); + colocate = tg->colocate; + rcu_read_unlock(); + + return colocate; +} +#else +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + +void add_new_task_to_grp(struct task_struct *new) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + /* + * If the task does not belong to colocated schedtune + * cgroup, nothing to do. We are checking this without + * lock. Even if there is a race, it will be added + * to the co-located cgroup via cgroup attach. + */ + if (!uclamp_task_colocated(new)) + return; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flag); + + /* + * It's possible that someone already added the new task to the + * group. or it might have taken out from the colocated schedtune + * cgroup. check these conditions under lock. + */ + if (!uclamp_task_colocated(new) || new->grp) { + write_unlock_irqrestore(&related_thread_group_lock, flag); + return; + } + + raw_spin_lock(&grp->lock); + + rcu_assign_pointer(new->grp, grp); + list_add(&new->grp_list, &grp->tasks); + + raw_spin_unlock(&grp->lock); + write_unlock_irqrestore(&related_thread_group_lock, flag); +} + + +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore this colocation group cannot + * be destroyted once it has been created. All of this has been as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. 
+ */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &active_related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + unsigned int old_grp_id; + + if (p) { + old_grp_id = sched_get_group_id(p); + /* + * If the task is already in a group which is not DEFAULT_CGROUP_COLOC_ID, + * we should not change the group id during switch to background. + */ + if ((old_grp_id != DEFAULT_CGROUP_COLOC_ID) && (grp_id == 0)) + return 0; + } + + return __sched_set_group_id(p, grp_id); +} +#endif /* CONFIG_SCHED_RTG_CGROUP */ + #ifdef CONFIG_SCHED_RTG_DEBUG #define seq_printf_rtg(m, x...) \ do { \ diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h index 23536c62859a..4f0cedc332f0 100644 --- a/kernel/sched/rtg/rtg.h +++ b/kernel/sched/rtg/rtg.h @@ -37,6 +37,12 @@ void sched_get_max_group_util(const struct cpumask *query_cpus, unsigned long *util, unsigned int *freq); int sched_set_group_freq_update_interval(unsigned int grp_id, unsigned int interval); +#ifdef CONFIG_SCHED_RTG_CGROUP +int sync_cgroup_colocation(struct task_struct *p, bool insert); +void add_new_task_to_grp(struct task_struct *new); +#else +static inline void add_new_task_to_grp(struct task_struct *new) {} +#endif /* CONFIG_SCHED_RTG_CGROUP */ #else static inline int alloc_related_thread_groups(void) { return 0; } static inline int sched_set_group_preferred_cluster(unsigned int grp_id, @@ -53,5 +59,6 @@ static inline void sched_get_max_group_util(const struct cpumask *query_cpus, unsigned long *util, unsigned int *freq) { } +static inline void add_new_task_to_grp(struct task_struct *new) {} #endif /* CONFIG_SCHED_RTG */ #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9630e3c00558..d79744dcc048 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,6 +477,17 @@ struct task_group { /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif + +#ifdef CONFIG_SCHED_RTG_CGROUP + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. + */ + bool colocate; + + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED -- Gitee From 1f0d48afb6ced6e61855f6a97a8dafb022676294 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 14 Feb 2022 18:46:40 +0800 Subject: [PATCH 039/113] sched: Add trace points for related thread group scheduling codeaurora inclusion category: feature issue: #I4SULH CVE: NA Signed-off-by: Li Ming ------------------------------------------- Add find_rtg_cpu/sched_rtg_task_each/sched_rtg_valid_normalized_util trace points for cpu selection. 
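With the events enabled, the trace output would look roughly as follows (pids, cpus and ids are made up; the fields follow the TP_printk() formats in the new trace/events/rtg.h):

    find_rtg_cpu: comm=RenderThread pid=1234 perferred_cpus=4-6 reason=prefer_idle target_cpu=5
    sched_rtg_valid_normalized_util: id=2 nr_running=3 cpus=4-6 valid=1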
Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Srinath Sridharan --- include/trace/events/rtg.h | 117 +++++++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg.c | 15 ++++- 2 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/rtg.h diff --git a/include/trace/events/rtg.h b/include/trace/events/rtg.h new file mode 100644 index 000000000000..12422d2c3ee2 --- /dev/null +++ b/include/trace/events/rtg.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rtg + +#if !defined(_TRACE_RTG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RTG_H + +#include +#include + +struct rq; + +TRACE_EVENT(find_rtg_cpu, + + TP_PROTO(struct task_struct *p, const struct cpumask *perferred_cpumask, + char *msg, int cpu), + + TP_ARGS(p, perferred_cpumask, msg, cpu), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __bitmask(cpus, num_possible_cpus()) + __array(char, msg, TASK_COMM_LEN) + __field(int, cpu) + ), + + TP_fast_assign( + __entry->pid = p->pid; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __assign_bitmask(cpus, cpumask_bits(perferred_cpumask), num_possible_cpus()); + memcpy(__entry->msg, msg, min((size_t)TASK_COMM_LEN, strlen(msg)+1)); + __entry->cpu = cpu; + ), + + TP_printk("comm=%s pid=%d perferred_cpus=%s reason=%s target_cpu=%d", + __entry->comm, __entry->pid, __get_bitmask(cpus), __entry->msg, __entry->cpu) +); + +TRACE_EVENT(sched_rtg_task_each, + + TP_PROTO(unsigned int id, unsigned int nr_running, struct task_struct *task), + + TP_ARGS(id, nr_running, task), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(unsigned int, nr_running) + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, prio) + __bitmask(allowed, num_possible_cpus()) + __field(int, cpu) + __field(int, state) + __field(bool, on_rq) + __field(int, on_cpu) + ), + + TP_fast_assign( + __entry->id = id; + __entry->nr_running = nr_running; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + __entry->prio = task->prio; + __assign_bitmask(allowed, cpumask_bits(&task->cpus_mask), num_possible_cpus()); + __entry->cpu = task_cpu(task); + __entry->state = task->state; + __entry->on_rq = task->on_rq; + __entry->on_cpu = task->on_cpu; + ), + + TP_printk("comm=%s pid=%d prio=%d allowed=%s cpu=%d state=%s%s on_rq=%d on_cpu=%d", + __entry->comm, __entry->pid, __entry->prio, __get_bitmask(allowed), __entry->cpu, + __entry->state & (TASK_REPORT_MAX) ? + __print_flags(__entry->state & (TASK_REPORT_MAX), "|", + { TASK_INTERRUPTIBLE, "S" }, + { TASK_UNINTERRUPTIBLE, "D" }, + { __TASK_STOPPED, "T" }, + { __TASK_TRACED, "t" }, + { EXIT_DEAD, "X" }, + { EXIT_ZOMBIE, "Z" }, + { TASK_DEAD, "x" }, + { TASK_WAKEKILL, "K"}, + { TASK_WAKING, "W"}) : "R", + __entry->state & TASK_STATE_MAX ? 
"+" : "", + __entry->on_rq, __entry->on_cpu) +); + +TRACE_EVENT(sched_rtg_valid_normalized_util, + + TP_PROTO(unsigned int id, unsigned int nr_running, + const struct cpumask *rtg_cpus, unsigned int valid), + + TP_ARGS(id, nr_running, rtg_cpus, valid), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(unsigned int, nr_running) + __bitmask(cpus, num_possible_cpus()) + __field(unsigned int, valid) + ), + + TP_fast_assign( + __entry->id = id; + __entry->nr_running = nr_running; + __assign_bitmask(cpus, cpumask_bits(rtg_cpus), num_possible_cpus()); + __entry->valid = valid; + ), + + TP_printk("id=%d nr_running=%d cpus=%s valid=%d", + __entry->id, __entry->nr_running, + __get_bitmask(cpus), __entry->valid) +); +#endif /* _TRACE_RTG_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index e2cf2cdab65c..dabadd54e59c 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -6,6 +6,9 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS #include "../sched.h" #include "rtg.h" @@ -717,8 +720,10 @@ int find_rtg_cpu(struct task_struct *p) if (is_reserved(i)) continue; - if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) + if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) { + trace_find_rtg_cpu(p, preferred_cpus, "prefer_idle", i); return i; + } } for_each_cpu(i, &search_cpus) { @@ -747,8 +752,12 @@ int find_rtg_cpu(struct task_struct *p) } } - if (idle_backup_cpu != -1) + if (idle_backup_cpu != -1) { + trace_find_rtg_cpu(p, preferred_cpus, "idle_backup", idle_backup_cpu); return idle_backup_cpu; + } + + trace_find_rtg_cpu(p, preferred_cpus, "max_spare", max_spare_cap_cpu); return max_spare_cap_cpu; } @@ -804,12 +813,14 @@ static inline bool valid_normalized_util(struct related_thread_group *grp) get_task_struct(p); if (p->state == TASK_RUNNING) cpumask_set_cpu(task_cpu(p), &rtg_cpus); + trace_sched_rtg_task_each(grp->id, grp->nr_running, p); put_task_struct(p); } valid = cpumask_intersects(&rtg_cpus, &grp->preferred_cluster->cpus); } + trace_sched_rtg_valid_normalized_util(grp->id, grp->nr_running, &rtg_cpus, valid); return valid; } -- Gitee From ed7f8892c89b36e44efe317bdd29c42a77606601 Mon Sep 17 00:00:00 2001 From: waterwin Date: Thu, 17 Feb 2022 12:42:29 +0000 Subject: [PATCH 040/113] hmdfs: Bugfix when calling setattr in local dir ohos inclusion category: bugfix issue: #I4U6XK CVE: NA ---------------------------------------------- hmdfs return error 12 when calling setattr in local dir Signed-off-by: qianjiaxing --- fs/hmdfs/hmdfs_dentryfile.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/hmdfs/hmdfs_dentryfile.c b/fs/hmdfs/hmdfs_dentryfile.c index 98b215ba2d8e..e034cb8071f0 100644 --- a/fs/hmdfs/hmdfs_dentryfile.c +++ b/fs/hmdfs/hmdfs_dentryfile.c @@ -875,6 +875,8 @@ int update_inode_to_dentry(struct dentry *child_dentry, struct inode *inode) struct hmdfs_dcache_lookup_ctx ctx; parent_dentry = child_dentry->d_parent; + if (hmdfs_d(parent_dentry)->dentry_type == HMDFS_LAYER_FIRST_DEVICE) + return 0; relative_path = hmdfs_get_dentry_relative_path(parent_dentry); if (!relative_path) -- Gitee From 73cff27fa4bfd65e4a1a08b28b741efcd6a07dc3 Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Thu, 17 Feb 2022 10:04:05 +0800 Subject: [PATCH 041/113] zerohung: fix CROSS_COMPILE error ohos inclusion category: bugfix issue: #I4U4X3 CVE: NA ----------------- Using notifier_call replaces the orignal way Signed-off-by: zhizhimeimei6 --- 
.../zerohung/watchpoint/hung_wp_screen.c | 36 ++++++++++++++++--- drivers/video/backlight/backlight.c | 10 ------ include/dfx/hung_wp_screen.h | 2 -- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/staging/zerohung/watchpoint/hung_wp_screen.c b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c index 9b295fed067d..e76fe54f9d20 100644 --- a/drivers/staging/zerohung/watchpoint/hung_wp_screen.c +++ b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -61,7 +63,7 @@ static unsigned int headevt; static int *check_off_point; struct work_struct powerkeyevent_sendwork; struct work_struct lpressevent_sendwork; - +static struct notifier_block hung_wp_screen_setblank_ncb; static void zrhung_lpressevent_send_work(struct work_struct *work) { @@ -157,13 +159,19 @@ static void zrhung_powerkeyevent_handler(void) zrhung_new_powerkeyevent(curtime); } -void hung_wp_screen_setblank(int blank) +static int hung_wp_screen_setblank(struct notifier_block *self, unsigned long event, void *data) { unsigned long flags; + struct fb_event *evdata = data; + int blank; if (!init_done) - return; + return 0; + if (event != FB_EVENT_BLANK) + return 0; + + blank = *(int *)evdata->data; spin_lock_irqsave(&(g_hung_data.lock), flags); g_hung_data.fb_blank = blank; if (((g_hung_data.check_id == ZRHUNG_WP_SCREENON) && (blank == 0)) || @@ -173,6 +181,8 @@ void hung_wp_screen_setblank(int blank) g_hung_data.check_id = ZRHUNG_WP_NONE; } spin_unlock_irqrestore(&(g_hung_data.lock), flags); + + return 0; } static void hung_wp_screen_send_work(struct work_struct *work) @@ -212,8 +222,6 @@ static void hung_wp_screen_start(int check_id) g_hung_data.timer.expires = jiffies + msecs_to_jiffies(DEFAULT_TIMEOUT * TIME_CONVERT_UNIT); add_timer(&g_hung_data.timer); pr_info("going to check ID=%d timeout=%d\n", check_id, DEFAULT_TIMEOUT); - - return; } void hung_wp_screen_powerkey_ncb(int event) @@ -271,12 +279,30 @@ static int __init hung_wp_screen_init(void) INIT_WORK(&powerkeyevent_sendwork, zrhung_powerkeyevent_send_work); INIT_WORK(&lpressevent_sendwork, zrhung_lpressevent_send_work); + hung_wp_screen_setblank_ncb.notifier_call = hung_wp_screen_setblank; + fb_register_client(&hung_wp_screen_setblank_ncb); + init_done = true; pr_info("%s done\n", __func__); return 0; } +static void __exit hung_wp_screen_exit(void) +{ + fb_unregister_client(&hung_wp_screen_setblank_ncb); + + cancel_work_sync(&lpressevent_sendwork); + cancel_work_sync(&powerkeyevent_sendwork); + cancel_work_sync(&g_hung_data.send_work); + + destroy_workqueue(g_hung_data.workq); + + del_timer_sync(&g_hung_data.timer); + del_timer_sync(&g_hung_data.long_press_timer); +} + module_init(hung_wp_screen_init); +module_exit(hung_wp_screen_exit); MODULE_AUTHOR("OHOS"); MODULE_DESCRIPTION("Reporting the frozen screen alarm event"); diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 4bda5fc5e624..537fe1b376ad 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -22,10 +22,6 @@ #include #endif -#ifdef CONFIG_DFX_ZEROHUNG -#include -#endif - /** * DOC: overview * @@ -124,9 +120,6 @@ static int fb_notifier_callback(struct notifier_block *self, bd->props.state &= ~BL_CORE_FBBLANK; bd->props.fb_blank = FB_BLANK_UNBLANK; backlight_update_status(bd); -#ifdef CONFIG_DFX_ZEROHUNG - hung_wp_screen_setblank(fb_blank); -#endif } } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) { 
bd->fb_bl_on[node] = false; @@ -134,9 +127,6 @@ static int fb_notifier_callback(struct notifier_block *self, bd->props.state |= BL_CORE_FBBLANK; bd->props.fb_blank = fb_blank; backlight_update_status(bd); -#ifdef CONFIG_DFX_ZEROHUNG - hung_wp_screen_setblank(fb_blank); -#endif } } out: diff --git a/include/dfx/hung_wp_screen.h b/include/dfx/hung_wp_screen.h index 88bb62435d6a..fa383f267af1 100644 --- a/include/dfx/hung_wp_screen.h +++ b/include/dfx/hung_wp_screen.h @@ -30,7 +30,5 @@ #define WP_SCREEN_OFF_NAME "SCREEN_OFF" void hung_wp_screen_powerkey_ncb(int event); -void hung_wp_screen_setblank(int blank); -int hung_wp_screen_getbl(void); #endif /* HUNG_WP_SCREEN_H */ -- Gitee From cab34d858beb21a0ac330cdf6aacb6aea24f960f Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Thu, 17 Feb 2022 11:57:10 +0800 Subject: [PATCH 042/113] zerohung,hungtask,hievent: clear code warning ohos inclusion category: bugfix issue: #I4U4X3 CVE: NA ----------------- clear code warning of zerohung, hungtask and hievent Signed-off-by: zhizhimeimei6 --- drivers/staging/hievent/Kconfig | 1 + drivers/staging/hievent/Makefile | 1 + drivers/staging/hievent/hievent_driver.c | 10 ---------- drivers/staging/hievent/hievent_driver.h | 10 ---------- drivers/staging/hievent/hiview_hievent.c | 10 ---------- drivers/staging/hievent/hiview_hievent.h | 10 ---------- drivers/staging/hungtask/Kconfig | 1 + drivers/staging/hungtask/Makefile | 1 + drivers/staging/hungtask/hungtask_base.c | 16 +++------------- drivers/staging/hungtask/hungtask_user.c | 10 ---------- drivers/staging/hungtask/hungtask_user.h | 12 +----------- .../staging/zerohung/watchpoint/hung_wp_screen.c | 10 ---------- drivers/staging/zerohung/zrhung_event.c | 12 +----------- include/dfx/hung_wp_screen.h | 12 +----------- include/dfx/hungtask_base.h | 12 +----------- include/dfx/zrhung.h | 12 +----------- 16 files changed, 12 insertions(+), 128 deletions(-) diff --git a/drivers/staging/hievent/Kconfig b/drivers/staging/hievent/Kconfig index 07834c32ba12..b445a2b90a07 100644 --- a/drivers/staging/hievent/Kconfig +++ b/drivers/staging/hievent/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 config HIEVENT tristate "Enable hievent" help diff --git a/drivers/staging/hievent/Makefile b/drivers/staging/hievent/Makefile index 3d3ff445f5c9..5b2adc23affc 100644 --- a/drivers/staging/hievent/Makefile +++ b/drivers/staging/hievent/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HIEVENT) += hievent_driver.o \ No newline at end of file diff --git a/drivers/staging/hievent/hievent_driver.c b/drivers/staging/hievent/hievent_driver.c index 36b0a778e04f..b65dee9392a3 100644 --- a/drivers/staging/hievent/hievent_driver.c +++ b/drivers/staging/hievent/hievent_driver.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * */ #define pr_fmt(fmt) "hievent_driver " fmt diff --git a/drivers/staging/hievent/hievent_driver.h b/drivers/staging/hievent/hievent_driver.h index 5d52982b78f6..83c67d9d2e15 100644 --- a/drivers/staging/hievent/hievent_driver.h +++ b/drivers/staging/hievent/hievent_driver.h @@ -1,16 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef HIEVENT_DRIVER_H diff --git a/drivers/staging/hievent/hiview_hievent.c b/drivers/staging/hievent/hiview_hievent.c index c72e6f2bb401..4533b6fbb759 100644 --- a/drivers/staging/hievent/hiview_hievent.c +++ b/drivers/staging/hievent/hiview_hievent.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include "hiview_hievent.h" diff --git a/drivers/staging/hievent/hiview_hievent.h b/drivers/staging/hievent/hiview_hievent.h index 358a3e8fed4e..c1c003510485 100644 --- a/drivers/staging/hievent/hiview_hievent.h +++ b/drivers/staging/hievent/hiview_hievent.h @@ -1,16 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2021 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * */ #ifndef HIVIEW_HIEVENT_H diff --git a/drivers/staging/hungtask/Kconfig b/drivers/staging/hungtask/Kconfig index c7b43fa6eb62..4e80dc9fc434 100644 --- a/drivers/staging/hungtask/Kconfig +++ b/drivers/staging/hungtask/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 config DFX_HUNGTASK bool "DFX hungtask" depends on DETECT_HUNG_TASK diff --git a/drivers/staging/hungtask/Makefile b/drivers/staging/hungtask/Makefile index 24951f2cf42c..12def220e3d6 100644 --- a/drivers/staging/hungtask/Makefile +++ b/drivers/staging/hungtask/Makefile @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DFX_HUNGTASK) += hungtask_base.o obj-$(CONFIG_DFX_HUNGTASK_USER) += hungtask_user.o diff --git a/drivers/staging/hungtask/hungtask_base.c b/drivers/staging/hungtask/hungtask_base.c index 740a5d1e2578..95c21dd045d2 100644 --- a/drivers/staging/hungtask/hungtask_base.c +++ b/drivers/staging/hungtask/hungtask_base.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #define pr_fmt(fmt) "hungtask_base " fmt @@ -403,7 +393,7 @@ void do_show_task(struct task_struct *task, unsigned int flag, int d_state_time) upload.tgid = task->tgid; upload.duration = d_state_time; memset(upload.name, 0, sizeof(upload.name)); - strncpy(upload.name, task->comm, sizeof(task->comm)); + strncpy(upload.name, task->comm, sizeof(upload.name)); upload.flag = flag; if (task->flags & PF_FROZEN) upload.flag = (upload.flag | FLAG_PF_FROZEN); @@ -426,7 +416,7 @@ static void create_taskitem(struct task_item *taskitem, taskitem->pid = task->pid; taskitem->tgid = task->tgid; memset(taskitem->name, 0, sizeof(taskitem->name)); - strncpy(taskitem->name, task->comm, sizeof(task->comm)); + strncpy(taskitem->name, task->comm, sizeof(taskitem->name)); taskitem->switch_count = task->nvcsw + task->nivcsw; taskitem->dump_wa = 0; /* whitelist or applist task dump times */ taskitem->panic_wa = 0; /* whitelist or applist task panic times */ @@ -628,7 +618,7 @@ static void update_panic_task(struct task_item *item) upload.pid = item->pid; upload.tgid = item->tgid; memset(upload.name, 0, sizeof(upload.name)); - strncpy(upload.name, item->name, sizeof(item->name)); + strncpy(upload.name, item->name, sizeof(upload.name)); } static void deal_task(struct task_item *item, struct task_struct *task, bool is_called) diff --git a/drivers/staging/hungtask/hungtask_user.c b/drivers/staging/hungtask/hungtask_user.c index 7070ba197d9b..39b0b1bd5035 100644 --- a/drivers/staging/hungtask/hungtask_user.c +++ b/drivers/staging/hungtask/hungtask_user.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #define pr_fmt(fmt) "hungtask_user " fmt diff --git a/drivers/staging/hungtask/hungtask_user.h b/drivers/staging/hungtask/hungtask_user.h index 3cd655cac2d5..17ea7212b21e 100644 --- a/drivers/staging/hungtask/hungtask_user.h +++ b/drivers/staging/hungtask/hungtask_user.h @@ -1,16 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef DFX_HUNGTASK_USER_H diff --git a/drivers/staging/zerohung/watchpoint/hung_wp_screen.c b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c index e76fe54f9d20..3b5f2d6daa54 100644 --- a/drivers/staging/zerohung/watchpoint/hung_wp_screen.c +++ b/drivers/staging/zerohung/watchpoint/hung_wp_screen.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #define pr_fmt(fmt) "zrhung " fmt diff --git a/drivers/staging/zerohung/zrhung_event.c b/drivers/staging/zerohung/zrhung_event.c index 0ad2d9abb31d..be0428d4edfa 100644 --- a/drivers/staging/zerohung/zrhung_event.c +++ b/drivers/staging/zerohung/zrhung_event.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * */ #define pr_fmt(fmt) "zrhung " fmt @@ -117,7 +107,7 @@ struct hisysevent *create_hisysevent(const char *domain, const char *event_name) event = vmalloc(sizeof(*event)); if (!event) { pr_err("failed to vmalloc for event"); - return -ENOMEM; + return NULL; } memset(event, 0, sizeof(*event)); diff --git a/include/dfx/hung_wp_screen.h b/include/dfx/hung_wp_screen.h index fa383f267af1..8b04107e072f 100644 --- a/include/dfx/hung_wp_screen.h +++ b/include/dfx/hung_wp_screen.h @@ -1,16 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef HUNG_WP_SCREEN_H diff --git a/include/dfx/hungtask_base.h b/include/dfx/hungtask_base.h index 5c280b5b21b5..b3cf189a0051 100644 --- a/include/dfx/hungtask_base.h +++ b/include/dfx/hungtask_base.h @@ -1,16 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef DFX_HUNGTASK_BASE_H diff --git a/include/dfx/zrhung.h b/include/dfx/zrhung.h index 9d54df21c817..4a217c99d39c 100644 --- a/include/dfx/zrhung.h +++ b/include/dfx/zrhung.h @@ -1,16 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2022 Huawei Technologies Co., Ltd. All rights reserved. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * */ #ifndef ZRHUNG_H -- Gitee From e70c3b09c14ea39b38f7ed89e055b9887694b348 Mon Sep 17 00:00:00 2001 From: zhizhimeimei6 Date: Thu, 17 Feb 2022 21:31:57 +0800 Subject: [PATCH 043/113] zerohung: edit event name ohos inclusion category: bugfix issue: #I4U4X3 CVE: NA ----------------- edit event name to fit hiview Signed-off-by: zhizhimeimei6 --- include/dfx/hung_wp_screen.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dfx/hung_wp_screen.h b/include/dfx/hung_wp_screen.h index 8b04107e072f..39bad044c942 100644 --- a/include/dfx/hung_wp_screen.h +++ b/include/dfx/hung_wp_screen.h @@ -14,8 +14,8 @@ #define ZRHUNG_WP_SCREENOFF 2 #define WP_SCREEN_DOMAIN "KERNEL_VENDOR" -#define WP_SCREEN_PWK_NAME "POWERKEY" -#define WP_SCREEN_LPRESS_NAME "LONGPRESS" +#define WP_SCREEN_PWK_NAME "POWER_KEY" +#define WP_SCREEN_LPRESS_NAME "LONG_PRESS" #define WP_SCREEN_ON_NAME "SCREEN_ON" #define WP_SCREEN_OFF_NAME "SCREEN_OFF" -- Gitee From e163ca19b50a6863ef2cf9304b1949ddcc378804 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 16 Dec 2021 08:25:12 +0100 Subject: [PATCH 044/113] xen/netback: fix rx queue stall detection stable inclusion from stable-5.10.88 commit 525875c410df5d876b9615c44885ca7640aed6f2 category: bugfix issue: #I4SHEN CVE: CVE-2021-28714 Signed-off-by: Yu Changchun --------------------------------------- commit 6032046ec4b70176d247a71836186d47b25d1684 upstream. Commit 1d5d48523900a4b ("xen-netback: require fewer guest Rx slots when not using GSO") introduced a security problem in netback, as an interface would only be regarded to be stalled if no slot is available in the rx queue ring page. In case the SKB at the head of the queued requests will need more than one rx slot and only one slot is free the stall detection logic will never trigger, as the test for that is only looking for at least one slot to be free. Fix that by testing for the needed number of slots instead of only one slot being available. In order to not have to take the rx queue lock that often, store the number of needed slots in the queue data. As all SKB dequeue operations happen in the rx queue kernel thread this is safe, as long as the number of needed slots is accessed via READ/WRITE_ONCE() only and updates are always done with the rx queue lock held. Add a small helper for obtaining the number of free slots. This is part of XSA-392 Fixes: 1d5d48523900a4b ("xen-netback: require fewer guest Rx slots when not using GSO") Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/rx.c | 65 ++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 8ee24e351bdc..6a9178896c90 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -203,6 +203,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */ unsigned int rx_queue_max; unsigned int rx_queue_len; unsigned long last_rx_time; + unsigned int rx_slots_needed; bool stalled; struct xenvif_copy_state rx_copy; diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c index accc991d153f..a8511e27d6c1 100644 --- a/drivers/net/xen-netback/rx.c +++ b/drivers/net/xen-netback/rx.c @@ -33,28 +33,36 @@ #include #include -static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) +/* + * Update the needed ring page slots for the first SKB queued. 
+ * Note that any call sequence outside the RX thread calling this function + * needs to wake up the RX thread via a call of xenvif_kick_thread() + * afterwards in order to avoid a race with putting the thread to sleep. + */ +static void xenvif_update_needed_slots(struct xenvif_queue *queue, + const struct sk_buff *skb) { - RING_IDX prod, cons; - struct sk_buff *skb; - int needed; - unsigned long flags; - - spin_lock_irqsave(&queue->rx_queue.lock, flags); + unsigned int needed = 0; - skb = skb_peek(&queue->rx_queue); - if (!skb) { - spin_unlock_irqrestore(&queue->rx_queue.lock, flags); - return false; + if (skb) { + needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE); + if (skb_is_gso(skb)) + needed++; + if (skb->sw_hash) + needed++; } - needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE); - if (skb_is_gso(skb)) - needed++; - if (skb->sw_hash) - needed++; + WRITE_ONCE(queue->rx_slots_needed, needed); +} - spin_unlock_irqrestore(&queue->rx_queue.lock, flags); +static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) +{ + RING_IDX prod, cons; + unsigned int needed; + + needed = READ_ONCE(queue->rx_slots_needed); + if (!needed) + return false; do { prod = queue->rx.sring->req_prod; @@ -80,6 +88,9 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb) spin_lock_irqsave(&queue->rx_queue.lock, flags); + if (skb_queue_empty(&queue->rx_queue)) + xenvif_update_needed_slots(queue, skb); + __skb_queue_tail(&queue->rx_queue, skb); queue->rx_queue_len += skb->len; @@ -100,6 +111,8 @@ static struct sk_buff *xenvif_rx_dequeue(struct xenvif_queue *queue) skb = __skb_dequeue(&queue->rx_queue); if (skb) { + xenvif_update_needed_slots(queue, skb_peek(&queue->rx_queue)); + queue->rx_queue_len -= skb->len; if (queue->rx_queue_len < queue->rx_queue_max) { struct netdev_queue *txq; @@ -487,27 +500,31 @@ void xenvif_rx_action(struct xenvif_queue *queue) xenvif_rx_copy_flush(queue); } -static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue) +static RING_IDX xenvif_rx_queue_slots(const struct xenvif_queue *queue) { RING_IDX prod, cons; prod = queue->rx.sring->req_prod; cons = queue->rx.req_cons; + return prod - cons; +} + +static bool xenvif_rx_queue_stalled(const struct xenvif_queue *queue) +{ + unsigned int needed = READ_ONCE(queue->rx_slots_needed); + return !queue->stalled && - prod - cons < 1 && + xenvif_rx_queue_slots(queue) < needed && time_after(jiffies, queue->last_rx_time + queue->vif->stall_timeout); } static bool xenvif_rx_queue_ready(struct xenvif_queue *queue) { - RING_IDX prod, cons; - - prod = queue->rx.sring->req_prod; - cons = queue->rx.req_cons; + unsigned int needed = READ_ONCE(queue->rx_slots_needed); - return queue->stalled && prod - cons >= 1; + return queue->stalled && xenvif_rx_queue_slots(queue) >= needed; } bool xenvif_have_rx_work(struct xenvif_queue *queue, bool test_kthread) -- Gitee From e5209ae5e6ebbb4b2dd456f41189fdee6f215a24 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 30 Nov 2021 08:36:12 +0100 Subject: [PATCH 045/113] xen/netback: don't queue unlimited number of packages stable inclusion from stable-5.10.88 commit 88f20cccbeec9a5e83621df5cc2453b5081454dc category: bugfix issue: #I4SHEN CVE: CVE-2021-28715 Signed-off-by: Yu Changchun --------------------------------------- commit be81992f9086b230623ae3ebbc85ecee4d00a3d3 upstream. In case a guest isn't consuming incoming network traffic as fast as it is coming in, xen-netback is buffering network packages in unlimited numbers today. This can result in host OOM situations. 
Commit f48da8b14d04ca8 ("xen-netback: fix unlimited guest Rx internal queue and carrier flapping") meant to introduce a mechanism to limit the amount of buffered data by stopping the Tx queue when reaching the data limit, but this doesn't work for cases like UDP. When hitting the limit don't queue further SKBs, but drop them instead. In order to be able to tell Rx packages have been dropped increment the rx_dropped statistics counter in this case. It should be noted that the old solution to continue queueing SKBs had the additional problem of an overflow of the 32-bit rx_queue_len value would result in intermittent Tx queue enabling. This is part of XSA-392 Fixes: f48da8b14d04ca8 ("xen-netback: fix unlimited guest Rx internal queue and carrier flapping") Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/net/xen-netback/rx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c index a8511e27d6c1..dbac4c03d21a 100644 --- a/drivers/net/xen-netback/rx.c +++ b/drivers/net/xen-netback/rx.c @@ -88,16 +88,19 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb) spin_lock_irqsave(&queue->rx_queue.lock, flags); - if (skb_queue_empty(&queue->rx_queue)) - xenvif_update_needed_slots(queue, skb); - - __skb_queue_tail(&queue->rx_queue, skb); - - queue->rx_queue_len += skb->len; - if (queue->rx_queue_len > queue->rx_queue_max) { + if (queue->rx_queue_len >= queue->rx_queue_max) { struct net_device *dev = queue->vif->dev; netif_tx_stop_queue(netdev_get_tx_queue(dev, queue->id)); + kfree_skb(skb); + queue->vif->dev->stats.rx_dropped++; + } else { + if (skb_queue_empty(&queue->rx_queue)) + xenvif_update_needed_slots(queue, skb); + + __skb_queue_tail(&queue->rx_queue, skb); + + queue->rx_queue_len += skb->len; } spin_unlock_irqrestore(&queue->rx_queue.lock, flags); @@ -147,6 +150,7 @@ static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue) break; xenvif_rx_dequeue(queue); kfree_skb(skb); + queue->vif->dev->stats.rx_dropped++; } } -- Gitee From a2c6cbd7fa0b10f94f414b1ed6348ad85b03387b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 15 Dec 2021 09:39:37 -0500 Subject: [PATCH 046/113] net/packet: rx_owner_map depends on pg_vec stable inclusion from stable-5.10.88 commit 7da349f07e457cad135df0920a3f670e423fb5e9 category: bugfix issue: #I4SHEN CVE: CVE-2021-22600 Signed-off-by: Yu Changchun --------------------------------------- [ Upstream commit ec6af094ea28f0f2dda1a6a33b14cd57e36a9755 ] Packet sockets may switch ring versions. Avoid misinterpreting state between versions, whose fields share a union. rx_owner_map is only allocated with a packet ring (pg_vec) and both are swapped together. If pg_vec is NULL, meaning no packet ring was allocated, then neither was rx_owner_map. And the field may be old state from a tpacket_v3. 
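To make the hazard concrete: rx_owner_map shares a union with the tpacket_v3 state, so it is only a valid pointer when a v1/v2 ring (pg_vec) was actually allocated. A minimal userspace sketch of the same pattern follows; the struct layout and names are illustrative only, not the real af_packet definitions.

#include <stdlib.h>

struct ring_state {
        void *pg_vec;                           /* set only when a packet ring exists */
        union {
                unsigned long *rx_owner_map;    /* meaningful only if pg_vec != NULL */
                struct { int blk_size; } v3;    /* tpacket_v3 variant of the union */
        };
};

static void teardown(struct ring_state *rs)
{
        /*
         * Freeing rx_owner_map unconditionally would reinterpret leftover
         * v3 state as a pointer; gating it on pg_vec avoids that, which is
         * what the change below does.
         */
        if (rs->pg_vec) {
                free(rs->rx_owner_map);
                free(rs->pg_vec);
        }
}

int main(void)
{
        struct ring_state rs = { 0 };

        rs.v3.blk_size = 0x1000;        /* stale v3 state, no ring allocated */
        teardown(&rs);                  /* safe: pg_vec is NULL */
        return 0;
}
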
Fixes: 61fad6816fc1 ("net/packet: tpacket_rcv: avoid a producer race condition") Reported-by: Syzbot Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211215143937.106178-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yu Changchun --- net/packet/af_packet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 08144559eed5..f78097aa403a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4461,9 +4461,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } out_free_pg_vec: - bitmap_free(rx_owner_map); - if (pg_vec) + if (pg_vec) { + bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); + } out: return err; } -- Gitee From 140d04773352e50c4d368cc5a3d806781b206233 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 4 Jan 2022 14:16:03 +0000 Subject: [PATCH 047/113] bpf: Fix out of bounds access from invalid *_or_null type verification stable inclusion from stable-5.10.92 commit 35ab8c9085b0af847df7fac9571ccd26d9f0f513 category: bugfix issue: #I4SHEN CVE: CVE-2022-23222 Signed-off-by: Yu Changchun --------------------------------------- [ no upstream commit given implicitly fixed through the larger refactoring in c25b2ae136039ffa820c26138ed4a5e5f3ab3841 ] While auditing some other code, I noticed missing checks inside the pointer arithmetic simulation, more specifically, adjust_ptr_min_max_vals(). Several *_OR_NULL types are not rejected whereas they are _required_ to be rejected given the expectation is that they get promoted into a 'real' pointer type for the success case, that is, after an explicit != NULL check. One case which stands out and is accessible from unprivileged (iff enabled given disabled by default) is BPF ring buffer. From crafting a PoC, the NULL check can be bypassed through an offset, and its id marking will then lead to promotion of mem_or_null to a mem type. bpf_ringbuf_reserve() helper can trigger this case through passing of reserved flags, for example. 
func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (7a) *(u64 *)(r10 -8) = 0 1: R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm 1: (18) r1 = 0x0 3: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm 3: (b7) r2 = 8 4: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R2_w=invP8 R10=fp0 fp-8_w=mmmmmmmm 4: (b7) r3 = 0 5: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R2_w=invP8 R3_w=invP0 R10=fp0 fp-8_w=mmmmmmmm 5: (85) call bpf_ringbuf_reserve#131 6: R0_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 6: (bf) r6 = r0 7: R0_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R6_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 7: (07) r0 += 1 8: R0_w=mem_or_null(id=2,ref_obj_id=2,off=1,imm=0) R6_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 8: (15) if r0 == 0x0 goto pc+4 R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 9: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 9: (62) *(u32 *)(r6 +0) = 0 R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 10: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 10: (bf) r1 = r6 11: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R1_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 11: (b7) r2 = 0 12: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R1_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R2_w=invP0 R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2 12: (85) call bpf_ringbuf_submit#132 13: R6=invP(id=0) R10=fp0 fp-8=mmmmmmmm 13: (b7) r0 = 0 14: R0_w=invP0 R6=invP(id=0) R10=fp0 fp-8=mmmmmmmm 14: (95) exit from 8 to 13: safe processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 0 OK All three commits, that is b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support"), 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it"), and the afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") suffer the same cause and their *_OR_NULL type pendants must be rejected in adjust_ptr_min_max_vals(). Make the test more robust by reusing reg_type_may_be_null() helper such that we catch all *_OR_NULL types we have today and in future. Note that pointer arithmetic on PTR_TO_BTF_ID, PTR_TO_RDONLY_BUF, and PTR_TO_RDWR_BUF is generally allowed. 
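Stripped of the BPF specifics, the bypass is ordinary pointer arithmetic defeating a later NULL test. A plain C sketch of the idea (illustrative only, and formally undefined behaviour in C, which is exactly why the verifier has to reject the arithmetic on the *_or_null type instead of relying on the check):

#include <stdlib.h>

/* stands in for bpf_ringbuf_reserve() failing and returning NULL */
static char *maybe_reserve(int fail)
{
        return fail ? NULL : malloc(8);
}

int main(void)
{
        char *p = maybe_reserve(1);

        p += 1;                 /* if p was NULL, p now compares unequal to NULL */
        if (p == NULL)          /* the intended guard is silently skipped */
                return 0;
        /* A store through p here would be a near-NULL write; the verifier
         * must therefore refuse the "p += 1" on the _or_null type itself. */
        return 0;
}
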
Fixes: b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4215c2ff6aeb..ced1f02c43f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6031,16 +6031,16 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: +reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; default: + if (reg_type_may_be_null(ptr_reg->type)) + goto reject; break; } -- Gitee From 955b7af56fdac61120fe4de675b15707efe8e31e Mon Sep 17 00:00:00 2001 From: Jamie Hill-Daniel Date: Tue, 18 Jan 2022 08:06:04 +0100 Subject: [PATCH 048/113] vfs: fs_context: fix up param length parsing in legacy_parse_param stable inclusion from stable-5.10.93 commit eadde287a62e66b2f9e62d007c59a8f50d4b8413 category: bugfix issue: #I4SHEN CVE: CVE-2022-0185 Signed-off-by: Yu Changchun --------------------------------------- commit 722d94847de29310e8aa03fcbdb41fc92c521756 upstream. The "PAGE_SIZE - 2 - size" calculation in legacy_parse_param() is an unsigned type so a large value of "size" results in a high positive value instead of a negative value as expected. Fix this by getting rid of the subtraction. Signed-off-by: Jamie Hill-Daniel Signed-off-by: William Liu Tested-by: Salvatore Bonaccorso Tested-by: Thadeu Lima de Souza Cascardo Acked-by: Dan Carpenter Acked-by: Al Viro Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- fs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index b7e43a780a62..24ce12f0db32 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -548,7 +548,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) param->key); } - if (len > PAGE_SIZE - 2 - size) + if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && -- Gitee From 02b500e166faaa8119fe90288dad447b703b525b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 27 Jan 2022 18:34:19 +1000 Subject: [PATCH 049/113] drm/vmwgfx: Fix stale file descriptors on failed usercopy stable inclusion from stable-5.10.95 commit ae2b20f27732fe92055d9e7b350abc5cdf3e2414 category: bugfix issue: #I4SHEN CVE: CVE-2022-22942 Signed-off-by: Yu Changchun --------------------------------------- commit a0f90c8815706981c483a652a6aefca51a5e191c upstream. A failing usercopy of the fence_rep object will lead to a stale entry in the file descriptor table as put_unused_fd() won't release it. This enables userland to refer to a dangling 'file' object through that still valid file descriptor, leading to all kinds of use-after-free exploitation scenarios. Fix this by deferring the call to fd_install() until after the usercopy has succeeded. 
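The rule being enforced is a general one: the descriptor becomes reachable by userspace the moment fd_install() runs and cannot be taken back, so every step that may still fail has to happen first. A simplified sketch of the intended ordering with the standard fd helpers (rep, user_rep and file are placeholders for the ioctl reply, its userspace pointer and the exported sync_file's file; this is not the vmwgfx code itself):

int fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
        return fd;

rep.fd = fd;
if (copy_to_user(user_rep, &rep, sizeof(rep))) {
        put_unused_fd(fd);      /* nothing published yet, safe to drop */
        fput(file);
        return -EFAULT;
}

fd_install(fd, file);           /* publish only after the usercopy succeeded */
return 0;
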
Fixes: c906965dee22 ("drm/vmwgfx: Add export fence to file descriptor support") Signed-off-by: Mathias Krause Signed-off-by: Zack Rusin Signed-off-by: Dave Airlie Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 5 ++-- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 33 +++++++++++++------------ drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 1523b51a7284..ad208a5f4ebe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1088,15 +1088,14 @@ extern int vmw_execbuf_fence_commands(struct drm_file *file_priv, struct vmw_private *dev_priv, struct vmw_fence_obj **p_fence, uint32_t *p_handle); -extern void vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, +extern int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, - struct sync_file *sync_file); + int32_t out_fence_fd); bool vmw_cmd_describe(const void *buf, u32 *size, char const **cmd); /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 83e1b54eb864..739cbc77d886 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3816,17 +3816,17 @@ int vmw_execbuf_fence_commands(struct drm_file *file_priv, * Also if copying fails, user-space will be unable to signal the fence object * so we wait for it immediately, and then unreference the user-space reference. */ -void +int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, struct sync_file *sync_file) + int32_t out_fence_fd) { struct drm_vmw_fence_rep fence_rep; if (user_fence_rep == NULL) - return; + return 0; memset(&fence_rep, 0, sizeof(fence_rep)); @@ -3854,20 +3854,14 @@ vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, * handle. */ if (unlikely(ret != 0) && (fence_rep.error == 0)) { - if (sync_file) - fput(sync_file->file); - - if (fence_rep.fd != -1) { - put_unused_fd(fence_rep.fd); - fence_rep.fd = -1; - } - ttm_ref_object_base_unref(vmw_fp->tfile, fence_handle, TTM_REF_USAGE); VMW_DEBUG_USER("Fence copy error. Syncing.\n"); (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); } + + return ret ? 
-EFAULT : 0; } /** @@ -4209,16 +4203,23 @@ int vmw_execbuf_process(struct drm_file *file_priv, (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); + } + } + + ret = vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, + user_fence_rep, fence, handle, out_fence_fd); + + if (sync_file) { + if (ret) { + /* usercopy of fence failed, put the file object */ + fput(sync_file->file); + put_unused_fd(out_fence_fd); } else { /* Link the fence with the FD created earlier */ fd_install(out_fence_fd, sync_file->file); } } - vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, - user_fence_rep, fence, handle, out_fence_fd, - sync_file); - /* Don't unreference when handing fence out */ if (unlikely(out_fence != NULL)) { *out_fence = fence; @@ -4236,7 +4237,7 @@ int vmw_execbuf_process(struct drm_file *file_priv, */ vmw_validation_unref_lists(&val_ctx); - return 0; + return ret; out_unlock_binding: mutex_unlock(&dev_priv->binding_mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 0f8d29397157..8bc41ec97d71 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -1171,7 +1171,7 @@ int vmw_fence_event_ioctl(struct drm_device *dev, void *data, } vmw_execbuf_copy_fence_user(dev_priv, vmw_fp, 0, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); vmw_fence_obj_unreference(&fence); return 0; out_no_create: diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 312ed0881a99..e58112997c88 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -2479,7 +2479,7 @@ void vmw_kms_helper_validation_finish(struct vmw_private *dev_priv, if (file_priv) vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); if (out_fence) *out_fence = fence; else -- Gitee From 61e372b699f4349caef1e54ee80558471e8bc98b Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 28 Jan 2022 15:06:43 +0100 Subject: [PATCH 050/113] KVM: s390: Return error on SIDA memop on normal guest stable inclusion from stable-5.10.100 commit b62267b8b06e9b8bb429ae8f962ee431e6535d60 category: bugfix issue: #I4SHEN CVE: CVE-2022-0516 Signed-off-by: Yu Changchun --------------------------------------- commit 2c212e1baedcd782b2535a3f86bc491977677c0e upstream. Refuse SIDA memops on guests which are not protected. For normal guests, the secure instruction data address designation, which determines the location we access, is not under control of KVM. Fixes: 19e122776886 (KVM: S390: protvirt: Introduce instruction data area bounce buffer) Signed-off-by: Janis Schoetterl-Glausch Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 00f03f363c9b..516c0f1829af 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4649,6 +4649,8 @@ static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, return -EINVAL; if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: -- Gitee From 4749d06f83ceaadf3ad0cfbf6f10f398f0fd7e72 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 20 Jan 2022 11:04:01 -0600 Subject: [PATCH 051/113] cgroup-v1: Require capabilities to set release_agent stable inclusion from stable-5.10.97 commit 1fc3444cda9a78c65b769e3fa93455e09ff7a0d3 category: bugfix issue: #I4TM31 CVE: CVE-2022-0492 Signed-off-by: Yu Changchun --------------------------------------- commit 24f6008564183aa120d07c03d9289519c2fe02af upstream. The cgroup release_agent is called with call_usermodehelper. The function call_usermodehelper starts the release_agent with a full set fo capabilities. Therefore require capabilities when setting the release_agaent. Reported-by: Tabitha Sable Tested-by: Tabitha Sable Fixes: 81a6a5cdd2c5 ("Task Control Groups: automatic userspace notification of idle cgroups") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1805c682ccc3..9f5221653f80 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -548,6 +548,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -961,6 +969,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; -- Gitee From 0413fe9272a58007ea228705772792bb5055a5d5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:02 -0500 Subject: [PATCH 052/113] NFSv4: Handle case where the lookup of a directory fails stable inclusion from stable-5.10.96 commit ce8c552b88ca25d775ecd0a0fbef4e0e03de9ed2 category: bugfix issue: #I4TM31 CVE: CVE-2022-24448 Signed-off-by: Yu Changchun --------------------------------------- commit ac795161c93699d600db16c1a8cc23a65a1eceaf upstream. If the application sets the O_DIRECTORY flag, and tries to open a regular file, nfs_atomic_open() will punt to doing a regular lookup. If the server then returns a regular file, we will happily return a file descriptor with uninitialised open state. The fix is to return the expected ENOTDIR error in these cases. 
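Seen from userspace, the contract being restored is simply that O_DIRECTORY on a regular file must fail with ENOTDIR rather than hand back a descriptor. A small illustrative test program (the path is an arbitrary regular file on the NFS mount, adjust as needed):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd = open("./some_regular_file", O_RDONLY | O_DIRECTORY);

        if (fd < 0) {
                printf("open: %s (ENOTDIR expected)\n", strerror(errno));
                return 0;
        }
        /* Before this fix an NFSv4 client could reach this point with a
         * descriptor carrying uninitialised open state. */
        close(fd);
        return 1;
}
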
Reported-by: Lyu Tao Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- fs/nfs/dir.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c837675cd395..6cba7437cfdc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1778,6 +1778,19 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) + res = ERR_PTR(-ENOTDIR); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } + } if (switched) { d_lookup_done(dentry); if (!res) -- Gitee From 29667a2593e94437997d0b1a2921f903436e4c76 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:03 -0500 Subject: [PATCH 053/113] NFSv4: nfs_atomic_open() can race when looking up a non-regular file stable inclusion from stable-5.10.96 commit 87880e3803ced144ed5957d91a0facb52d4399d2 category: bugfix issue: #I4TM31 CVE: CVE-2022-24448 Signed-off-by: Yu Changchun --------------------------------------- commit 1751fc1db36f6f411709e143d5393f92d12137a9 upstream. If the file type changes back to being a regular file on the server between the failed OPEN and our LOOKUP, then we need to re-run the OPEN. Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- fs/nfs/dir.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6cba7437cfdc..0844de0290fd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1783,12 +1783,17 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) { dput(res); res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); } } if (switched) { -- Gitee From 3591af0053c8b2a32804dab15d01fc3369611993 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:27:47 +0800 Subject: [PATCH 054/113] sched: Introduce frame-based related thread group ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Support frame-based related thread group scheduling. 
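The group's central bookkeeping is per-frame timing: frame_time is derived from the frame rate as NSEC_PER_SEC / rate (see set_frame_rate() in the diff below), for example:

/* frame_time = div_u64(NSEC_PER_SEC, frame_rate) */
 60 Hz -> 16666666 ns  (~16.67 ms per frame)
 90 Hz -> 11111111 ns  (~11.11 ms per frame)
120 Hz ->  8333333 ns  (~8.33 ms per frame)
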
Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/frame_rtg.h | 26 ++++ include/linux/sched/rtg.h | 1 + kernel/sched/rtg/Kconfig | 9 ++ kernel/sched/rtg/Makefile | 1 + kernel/sched/rtg/frame_rtg.c | 219 ++++++++++++++++++++++++++++++++ kernel/sched/rtg/frame_rtg.h | 53 ++++++++ 6 files changed, 309 insertions(+) create mode 100644 include/linux/sched/frame_rtg.h create mode 100644 kernel/sched/rtg/frame_rtg.c create mode 100644 kernel/sched/rtg/frame_rtg.h diff --git a/include/linux/sched/frame_rtg.h b/include/linux/sched/frame_rtg.h new file mode 100644 index 000000000000..44387dc5654a --- /dev/null +++ b/include/linux/sched/frame_rtg.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Frame declaration + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#ifndef __SCHED_FRAME_RTG_H +#define __SCHED_FRAME_RTG_H + +#ifdef CONFIG_SCHED_RTG_FRAME + +#define MAX_TID_NUM 5 + +struct frame_info { + rwlock_t lock; + struct related_thread_group *rtg; + struct task_struct *thread[MAX_TID_NUM]; + int thread_num; + unsigned int frame_rate; // frame rate + u64 frame_time; +}; + +struct frame_info *rtg_frame_info(int id); +#endif +#endif diff --git a/include/linux/sched/rtg.h b/include/linux/sched/rtg.h index 735b8ccae745..ec738f49fd1e 100644 --- a/include/linux/sched/rtg.h +++ b/include/linux/sched/rtg.h @@ -47,6 +47,7 @@ struct related_thread_group { unsigned long freq_update_interval; /* in nanoseconds */ u64 last_util_update_time; u64 last_freq_update_time; + void *private_data; }; struct rtg_class { diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig index 3e5acad17ac5..837c0341c514 100644 --- a/kernel/sched/rtg/Kconfig +++ b/kernel/sched/rtg/Kconfig @@ -22,4 +22,13 @@ config SCHED_RTG_CGROUP If set, support for adding the tasks which belong to co-located cgroup to DEFAULT_CGROUP_COLOC RTG. +config SCHED_RTG_FRAME + bool "Frame-based Related Thread Group" + depends on SCHED_RTG + default n + help + Support frame-based related thread group scheduling. + If set, you can set the task to RTG and kernel will + statistic the load per frame. + endmenu diff --git a/kernel/sched/rtg/Makefile b/kernel/sched/rtg/Makefile index a911575b0734..13795817f087 100644 --- a/kernel/sched/rtg/Makefile +++ b/kernel/sched/rtg/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SCHED_RTG) += rtg.o +obj-$(CONFIG_SCHED_RTG_FRAME) += frame_rtg.o diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c new file mode 100644 index 000000000000..1377140da6d7 --- /dev/null +++ b/kernel/sched/rtg/frame_rtg.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Frame-based load tracking for rt_frame and RTG + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. 
+ */ + +#include "frame_rtg.h" +#include "rtg.h" + +#include + +static struct multi_frame_id_manager g_id_manager = { + .id_map = {0}, + .offset = 0, + .lock = __RW_LOCK_UNLOCKED(g_id_manager.lock) +}; + +static struct frame_info g_multi_frame_info[MULTI_FRAME_NUM]; + +bool is_frame_rtg(int id) +{ + return (id >= MULTI_FRAME_ID) && + (id < (MULTI_FRAME_ID + MULTI_FRAME_NUM)); +} + +static struct related_thread_group *frame_rtg(int id) +{ + if (!is_frame_rtg(id)) + return NULL; + + return lookup_related_thread_group(id); +} + +struct frame_info *rtg_frame_info(int id) +{ + if (!is_frame_rtg(id)) + return NULL; + + return rtg_active_multi_frame_info(id); +} + +static int alloc_rtg_id(void) +{ + unsigned int id_offset; + int id; + + write_lock(&g_id_manager.lock); + id_offset = find_next_zero_bit(g_id_manager.id_map, MULTI_FRAME_NUM, + g_id_manager.offset); + if (id_offset >= MULTI_FRAME_NUM) { + id_offset = find_first_zero_bit(g_id_manager.id_map, + MULTI_FRAME_NUM); + if (id_offset >= MULTI_FRAME_NUM) { + write_unlock(&g_id_manager.lock); + return -EINVAL; + } + } + + set_bit(id_offset, g_id_manager.id_map); + g_id_manager.offset = id_offset; + id = id_offset + MULTI_FRAME_ID; + write_unlock(&g_id_manager.lock); + pr_debug("[FRAME_RTG] %s id_offset=%u, id=%d\n", __func__, id_offset, id); + + return id; +} + +static void free_rtg_id(int id) +{ + unsigned int id_offset = id - MULTI_FRAME_ID; + + if (id_offset >= MULTI_FRAME_NUM) { + pr_err("[FRAME_RTG] %s id_offset is invalid, id=%d, id_offset=%u.\n", + __func__, id, id_offset); + return; + } + + pr_debug("[FRAME_RTG] %s id=%d id_offset=%u\n", __func__, id, id_offset); + write_lock(&g_id_manager.lock); + clear_bit(id_offset, g_id_manager.id_map); + write_unlock(&g_id_manager.lock); +} + +int set_frame_rate(struct frame_info *frame_info, int rate) +{ + int id; + + if ((rate < MIN_FRAME_RATE) || (rate > MAX_FRAME_RATE)) { + pr_err("[FRAME_RTG]: %s invalid QOS(rate) value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->frame_rate = (unsigned int)rate; + frame_info->frame_time = frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); + id = frame_info->rtg->id; + + return 0; +} + +int alloc_multi_frame_info(void) +{ + struct frame_info *frame_info = NULL; + int id; + + id = alloc_rtg_id(); + if (id < 0) + return id; + + frame_info = rtg_frame_info(id); + if (!frame_info) { + free_rtg_id(id); + return -EINVAL; + } + + set_frame_rate(frame_info, DEFAULT_FRAME_RATE); + + return id; +} + +void release_multi_frame_info(int id) +{ + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) { + pr_err("[FRAME_RTG] %s frame(id=%d) not found.\n", __func__, id); + return; + } + + read_lock(&g_id_manager.lock); + if (!test_bit(id - MULTI_FRAME_ID, g_id_manager.id_map)) { + read_unlock(&g_id_manager.lock); + return; + } + read_unlock(&g_id_manager.lock); + + pr_debug("[FRAME_RTG] %s release frame(id=%d).\n", __func__, id); + free_rtg_id(id); +} + +void clear_multi_frame_info(void) +{ + write_lock(&g_id_manager.lock); + bitmap_zero(g_id_manager.id_map, MULTI_FRAME_NUM); + g_id_manager.offset = 0; + write_unlock(&g_id_manager.lock); +} + +struct frame_info *rtg_active_multi_frame_info(int id) +{ + struct frame_info *frame_info = NULL; + + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) + return NULL; + + read_lock(&g_id_manager.lock); + if (test_bit(id - MULTI_FRAME_ID, g_id_manager.id_map)) + frame_info = &g_multi_frame_info[id - MULTI_FRAME_ID]; + 
read_unlock(&g_id_manager.lock); + if (!frame_info) + pr_debug("[FRAME_RTG] %s frame %d has been released\n", + __func__, id); + + return frame_info; +} + +struct frame_info *rtg_multi_frame_info(int id) +{ + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) + return NULL; + + return &g_multi_frame_info[id - MULTI_FRAME_ID]; +} + +static int _init_frame_info(struct frame_info *frame_info, int id) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + memset(frame_info, 0, sizeof(struct frame_info)); + rwlock_init(&frame_info->lock); + + write_lock(&frame_info->lock); + frame_info->frame_rate = DEFAULT_FRAME_RATE; + frame_info->frame_time = div_u64(NSEC_PER_SEC, frame_info->frame_rate); + frame_info->thread_num = 0; + + grp = frame_rtg(id); + if (unlikely(!grp)) { + write_unlock(&frame_info->lock); + return -EINVAL; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + grp->private_data = frame_info; + raw_spin_unlock_irqrestore(&grp->lock, flags); + + frame_info->rtg = grp; + write_unlock(&frame_info->lock); + + return 0; +} + +static int __init init_frame_info(void) +{ + int ret = 0; + int id; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + if (ret != 0) + break; + ret = _init_frame_info(rtg_multi_frame_info(id), id); + } + + return ret; +} +late_initcall(init_frame_info); diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h new file mode 100644 index 000000000000..56d95c9d45ec --- /dev/null +++ b/kernel/sched/rtg/frame_rtg.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Frame declaration + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#ifndef __FRAME_RTG_H +#define __FRAME_RTG_H + +#include +#include +#include +#include + +#define MULTI_FRAME_ID (DEFAULT_CGROUP_COLOC_ID + 1) +#define MULTI_FRAME_NUM (MAX_NUM_CGROUP_COLOC_ID - DEFAULT_CGROUP_COLOC_ID - 1) + +#define FRAME_START (1 << 0) +#define FRAME_END (1 << 1) +#define FRAME_INVALID (1 << 2) + +#define DEFAULT_FRAME_RATE 60 +#define MIN_FRAME_RATE 1 +#define MAX_FRAME_RATE 120 + +/* MARGIN value : [-100, 100] */ +#define DEFAULT_VLOAD_MARGIN 16 +#define MIN_VLOAD_MARGIN (-100) +#define MAX_VLOAD_MARGIN 0xffff + +#define FRAME_MAX_VLOAD SCHED_CAPACITY_SCALE +#define FRAME_MAX_LOAD SCHED_CAPACITY_SCALE +#define FRAME_UTIL_INVALID_FACTOR 4 +#define FRAME_DEFAULT_MIN_UTIL 0 +#define FRAME_DEFAULT_MAX_UTIL SCHED_CAPACITY_SCALE +#define FRAME_DEFAULT_MIN_PREV_UTIL 0 +#define FRAME_DEFAULT_MAX_PREV_UTIL SCHED_CAPACITY_SCALE + +struct multi_frame_id_manager { + DECLARE_BITMAP(id_map, MULTI_FRAME_NUM); + unsigned int offset; + rwlock_t lock; +}; + +bool is_frame_rtg(int id); +int set_frame_rate(struct frame_info *frame_info, int rate); +int alloc_multi_frame_info(void); +struct frame_info *rtg_active_multi_frame_info(int id); +struct frame_info *rtg_multi_frame_info(int id); +void release_multi_frame_info(int id); +void clear_multi_frame_info(void); +#endif -- Gitee From 87ac739f168abe4427bea4b53841c6e4d21478e9 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:32:53 +0800 Subject: [PATCH 055/113] sched: Add trace point for frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Add rtg_frame_sched trace point. 
Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/trace/events/rtg.h | 28 ++++++++++++++++++++++++++++ kernel/sched/rtg/frame_rtg.c | 2 ++ 2 files changed, 30 insertions(+) diff --git a/include/trace/events/rtg.h b/include/trace/events/rtg.h index 12422d2c3ee2..b885e328de12 100644 --- a/include/trace/events/rtg.h +++ b/include/trace/events/rtg.h @@ -7,6 +7,7 @@ #include #include +#include struct rq; @@ -111,6 +112,33 @@ TRACE_EVENT(sched_rtg_valid_normalized_util, __entry->id, __entry->nr_running, __get_bitmask(cpus), __entry->valid) ); + +#ifdef CONFIG_SCHED_RTG_FRAME +TRACE_EVENT(rtg_frame_sched, + + TP_PROTO(int rtgid, const char *s, s64 value), + + TP_ARGS(rtgid, s, value), + TP_STRUCT__entry( + __field(int, rtgid) + __field(struct frame_info *, frame) + __field(pid_t, pid) + __string(str, s) + __field(s64, value) + ), + + TP_fast_assign( + __assign_str(str, s); + __entry->rtgid = rtgid != -1 ? rtgid : (current->grp ? current->grp->id : 0); + __entry->frame = rtg_frame_info(rtgid); + __entry->pid = __entry->frame ? ((__entry->frame->thread[0]) ? + ((__entry->frame->thread[0])->pid) : + current->tgid) : current->tgid; + __entry->value = value; + ), + TP_printk("C|%d|%s_%d|%lld", __entry->pid, __get_str(str), __entry->rtgid, __entry->value) +); +#endif #endif /* _TRACE_RTG_H */ /* This part must be outside protection */ diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index 1377140da6d7..b29c0fc7ffd7 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -9,6 +9,7 @@ #include "rtg.h" #include +#include static struct multi_frame_id_manager g_id_manager = { .id_map = {0}, @@ -98,6 +99,7 @@ int set_frame_rate(struct frame_info *frame_info, int rate) frame_info->frame_rate = (unsigned int)rate; frame_info->frame_time = frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_QOS", rate); return 0; } -- Gitee From 10d04a4accd3c34319a7d47c03e9206c64016bca Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:34:14 +0800 Subject: [PATCH 056/113] sched: Support adding tasks to frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Add interfaces for frame RTG: 1. add threads to frame RTG; 2. change the priority of frame RTG. 
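A sketch of how a kernel-side caller might drive the new interface; the tids and the RT priority are placeholders, while rtg_frame_info() comes from the earlier frame RTG patch and update_frame_thread_info() from the diff below:

/* Illustrative fragment only, not part of the patch. */
int ui_tid = 1234, render_tid = 1235;
struct frame_info *frame = rtg_frame_info(MULTI_FRAME_ID);
struct frame_thread_info threads = {
        .prio = 2,                              /* mapped to an RT priority internally */
        .thread = { ui_tid, render_tid },       /* unused slots stay 0 */
        .thread_num = 2,
};

if (frame)
        update_frame_thread_info(frame, &threads);
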
Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/frame_rtg.h | 1 + kernel/sched/rtg/frame_rtg.c | 185 ++++++++++++++++++++++++++++++++ kernel/sched/rtg/frame_rtg.h | 22 ++++ 3 files changed, 208 insertions(+) diff --git a/include/linux/sched/frame_rtg.h b/include/linux/sched/frame_rtg.h index 44387dc5654a..322f59a09c2b 100644 --- a/include/linux/sched/frame_rtg.h +++ b/include/linux/sched/frame_rtg.h @@ -15,6 +15,7 @@ struct frame_info { rwlock_t lock; struct related_thread_group *rtg; + int prio; struct task_struct *thread[MAX_TID_NUM]; int thread_num; unsigned int frame_rate; // frame rate diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index b29c0fc7ffd7..19429fdc5f93 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -10,6 +10,8 @@ #include #include +#include <../kernel/sched/sched.h> +#include static struct multi_frame_id_manager g_id_manager = { .id_map = {0}, @@ -176,6 +178,188 @@ struct frame_info *rtg_multi_frame_info(int id) return &g_multi_frame_info[id - MULTI_FRAME_ID]; } +static void do_update_frame_task_prio(struct frame_info *frame_info, + struct task_struct *task, int prio) +{ + int policy = SCHED_NORMAL; + struct sched_param sp = {0}; + + policy = SCHED_FIFO | SCHED_RESET_ON_FORK; + sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; + sched_setscheduler_nocheck(task, policy, &sp); +} + +static void update_frame_task_prio(struct frame_info *frame_info, int prio) +{ + int i; + struct task_struct *thread = NULL; + + for (i = 0; i < MAX_TID_NUM; i++) { + thread = frame_info->thread[i]; + if (thread) + do_update_frame_task_prio(frame_info, thread, prio); + } +} + +void set_frame_prio(struct frame_info *frame_info, int prio) +{ + if (!frame_info) + return; + + write_lock(&frame_info->lock); + if (frame_info->prio == prio) + goto out; + + update_frame_task_prio(frame_info, prio); + frame_info->prio = prio; +out: + write_unlock(&frame_info->lock); +} + +static int do_set_rtg_sched(struct task_struct *task, bool is_rtg, + int grp_id, int prio) +{ + int err; + int policy = SCHED_NORMAL; + int grpid = DEFAULT_RTG_GRP_ID; + bool is_rt_task = (prio != NOT_RT_PRIO); + struct sched_param sp = {0}; + + if (is_rtg) { + if (is_rt_task) { + policy = SCHED_FIFO | SCHED_RESET_ON_FORK; + sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; + } + grpid = grp_id; + } + err = sched_setscheduler_nocheck(task, policy, &sp); + if (err < 0) { + pr_err("[FRAME_RTG]: %s task:%d setscheduler err:%d\n", + __func__, task->pid, err); + return err; + } + err = sched_set_group_id(task, grpid); + if (err < 0) { + pr_err("[FRAME_RTG]: %s task:%d set_group_id err:%d\n", + __func__, task->pid, err); + if (is_rtg) { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + sched_setscheduler_nocheck(task, policy, &sp); + } + } + + return err; +} + +static int set_rtg_sched(struct task_struct *task, bool is_rtg, + int grp_id, int prio) +{ + int err = -1; + bool is_rt_task = (prio != NOT_RT_PRIO); + + if (!task) + return err; + + if (is_rt_task && is_rtg && ((prio < 0) || + (prio > MAX_USER_RT_PRIO - 1))) + return err; + /* + * if CONFIG_HW_FUTEX_PI is set, task->prio and task->sched_class + * may be modified by rtmutex. So we use task->policy instead. 
+ */ + if (is_rtg && (!fair_policy(task->policy) || (task->flags & PF_EXITING))) + return err; + + if (in_interrupt()) { + pr_err("[FRAME_RTG]: %s is in interrupt\n", __func__); + return err; + } + + return do_set_rtg_sched(task, is_rtg, grp_id, prio); +} + +static bool set_frame_rtg_thread(int grp_id, struct task_struct *task, + bool is_rtg, int prio) +{ + int depth; + + if (!task) + return false; + depth = task->rtg_depth; + if (is_rtg) + task->rtg_depth = STATIC_RTG_DEPTH; + else + task->rtg_depth = 0; + + if (set_rtg_sched(task, is_rtg, grp_id, prio) < 0) { + task->rtg_depth = depth; + return false; + } + + return true; +} + +struct task_struct *update_frame_thread(struct frame_info *frame_info, + int old_prio, int prio, int pid, + struct task_struct *old_task) +{ + struct task_struct *task = NULL; + int new_prio = prio; + bool update_ret = false; + + if (pid > 0) { + if (old_task && (pid == old_task->pid) && (old_prio == new_prio)) + return old_task; + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + } + set_frame_rtg_thread(frame_info->rtg->id, old_task, false, NOT_RT_PRIO); + update_ret = set_frame_rtg_thread(frame_info->rtg->id, task, true, new_prio); + if (old_task) + put_task_struct(old_task); + if (!update_ret) + return NULL; + + return task; +} + +void update_frame_thread_info(struct frame_info *frame_info, + struct frame_thread_info *frame_thread_info) +{ + int i; + int old_prio; + int prio; + int thread_num; + int real_thread; + + if (!frame_info || !frame_thread_info || + frame_thread_info->thread_num < 0) + return; + + prio = frame_thread_info->prio; + thread_num = frame_thread_info->thread_num; + if (thread_num > MAX_TID_NUM) + thread_num = MAX_TID_NUM; + + write_lock(&frame_info->lock); + old_prio = frame_info->prio; + real_thread = 0; + for (i = 0; i < thread_num; i++) { + frame_info->thread[i] = update_frame_thread(frame_info, old_prio, prio, + frame_thread_info->thread[i], + frame_info->thread[i]); + if (frame_info->thread[i] && (frame_thread_info->thread[i] > 0)) + real_thread++; + } + frame_info->prio = prio; + frame_info->thread_num = real_thread; + write_unlock(&frame_info->lock); +} + static int _init_frame_info(struct frame_info *frame_info, int id) { struct related_thread_group *grp = NULL; @@ -188,6 +372,7 @@ static int _init_frame_info(struct frame_info *frame_info, int id) frame_info->frame_rate = DEFAULT_FRAME_RATE; frame_info->frame_time = div_u64(NSEC_PER_SEC, frame_info->frame_rate); frame_info->thread_num = 0; + frame_info->prio = NOT_RT_PRIO; grp = frame_rtg(id); if (unlikely(!grp)) { diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h index 56d95c9d45ec..a75c5bd85ad0 100644 --- a/kernel/sched/rtg/frame_rtg.h +++ b/kernel/sched/rtg/frame_rtg.h @@ -16,6 +16,9 @@ #define MULTI_FRAME_ID (DEFAULT_CGROUP_COLOC_ID + 1) #define MULTI_FRAME_NUM (MAX_NUM_CGROUP_COLOC_ID - DEFAULT_CGROUP_COLOC_ID - 1) +#define NOT_RT_PRIO (-1) +#define STATIC_RTG_DEPTH (-1) + #define FRAME_START (1 << 0) #define FRAME_END (1 << 1) #define FRAME_INVALID (1 << 2) @@ -37,6 +40,19 @@ #define FRAME_DEFAULT_MIN_PREV_UTIL 0 #define FRAME_DEFAULT_MAX_PREV_UTIL SCHED_CAPACITY_SCALE +enum rtg_type { + VIP = 0, + TOP_TASK_KEY, + NORMAL_TASK, + RTG_TYPE_MAX, +}; + +struct frame_thread_info { + int prio; + int thread[MAX_TID_NUM]; + int thread_num; +}; + struct multi_frame_id_manager { DECLARE_BITMAP(id_map, MULTI_FRAME_NUM); unsigned int offset; @@ -50,4 +66,10 @@ struct frame_info 
*rtg_active_multi_frame_info(int id); struct frame_info *rtg_multi_frame_info(int id); void release_multi_frame_info(int id); void clear_multi_frame_info(void); +void set_frame_prio(struct frame_info *frame_info, int prio); +struct task_struct *update_frame_thread(struct frame_info *frame_info, + int old_prio, int prio, int pid, + struct task_struct *old_task); +void update_frame_thread_info(struct frame_info *frame_info, + struct frame_thread_info *frame_thread_info); #endif -- Gitee From 1b8c3ed80f78a856e5608407a667489b4dc2f338 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:36:30 +0800 Subject: [PATCH 057/113] sched: Support limiting the number of real-time threads in group ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Limit the number of real-time threads in group when CONFIG_SCHED_RTG_RT_THREAD_LIMIT is enabled. Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/frame_rtg.h | 2 + kernel/sched/rtg/Kconfig | 6 ++ kernel/sched/rtg/frame_rtg.c | 163 +++++++++++++++++++++++++++++++- kernel/sched/rtg/frame_rtg.h | 11 +++ kernel/sched/rtg/rtg.c | 8 ++ 5 files changed, 186 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/frame_rtg.h b/include/linux/sched/frame_rtg.h index 322f59a09c2b..19dcf1398576 100644 --- a/include/linux/sched/frame_rtg.h +++ b/include/linux/sched/frame_rtg.h @@ -20,6 +20,8 @@ struct frame_info { int thread_num; unsigned int frame_rate; // frame rate u64 frame_time; + atomic_t curr_rt_thread_num; + atomic_t max_rt_thread_num; }; struct frame_info *rtg_frame_info(int id); diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig index 837c0341c514..1cb0c4298b09 100644 --- a/kernel/sched/rtg/Kconfig +++ b/kernel/sched/rtg/Kconfig @@ -31,4 +31,10 @@ config SCHED_RTG_FRAME If set, you can set the task to RTG and kernel will statistic the load per frame. +config SCHED_RTG_RT_THREAD_LIMIT + bool "Limit the number of RT threads in groups" + depends on SCHED_RTG_FRAME + default n + help + If set, limit the number of RT threads in frame RTG. 
endmenu diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index 19429fdc5f93..00f5e04e05b2 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -21,6 +21,89 @@ static struct multi_frame_id_manager g_id_manager = { static struct frame_info g_multi_frame_info[MULTI_FRAME_NUM]; +static bool is_rtg_rt_task(struct task_struct *task) +{ + bool ret = false; + + if (!task) + return ret; + + ret = ((task->prio < MAX_RT_PRIO) && + (task->rtg_depth == STATIC_RTG_DEPTH)); + + return ret; +} + +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT +static atomic_t g_rtg_rt_thread_num = ATOMIC_INIT(0); + +static unsigned int _get_rtg_rt_thread_num(struct related_thread_group *grp) +{ + unsigned int rtg_rt_thread_num = 0; + struct task_struct *p = NULL; + + if (list_empty(&grp->tasks)) + goto out; + + list_for_each_entry(p, &grp->tasks, grp_list) { + if (is_rtg_rt_task(p)) + ++rtg_rt_thread_num; + } + +out: + return rtg_rt_thread_num; +} + +static unsigned int get_rtg_rt_thread_num(void) +{ + struct related_thread_group *grp = NULL; + unsigned int total_rtg_rt_thread_num = 0; + unsigned long flag; + unsigned int i; + + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + grp = lookup_related_thread_group(i); + if (grp == NULL) + continue; + raw_spin_lock_irqsave(&grp->lock, flag); + total_rtg_rt_thread_num += _get_rtg_rt_thread_num(grp); + raw_spin_unlock_irqrestore(&grp->lock, flag); + } + + return total_rtg_rt_thread_num; +} + +static void inc_rtg_rt_thread_num(void) +{ + atomic_inc(&g_rtg_rt_thread_num); +} + +static void dec_rtg_rt_thread_num(void) +{ + atomic_dec_if_positive(&g_rtg_rt_thread_num); +} + +static int test_and_read_rtg_rt_thread_num(void) +{ + if (atomic_read(&g_rtg_rt_thread_num) >= RTG_MAX_RT_THREAD_NUM) + atomic_set(&g_rtg_rt_thread_num, get_rtg_rt_thread_num()); + + return atomic_read(&g_rtg_rt_thread_num); +} + +int read_rtg_rt_thread_num(void) +{ + return atomic_read(&g_rtg_rt_thread_num); +} +#else +static inline void inc_rtg_rt_thread_num(void) { } +static inline void dec_rtg_rt_thread_num(void) { } +static inline int test_and_read_rtg_rt_thread_num(void) +{ + return 0; +} +#endif + bool is_frame_rtg(int id) { return (id >= MULTI_FRAME_ID) && @@ -122,6 +205,8 @@ int alloc_multi_frame_info(void) } set_frame_rate(frame_info, DEFAULT_FRAME_RATE); + atomic_set(&frame_info->curr_rt_thread_num, 0); + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); return id; } @@ -183,10 +268,43 @@ static void do_update_frame_task_prio(struct frame_info *frame_info, { int policy = SCHED_NORMAL; struct sched_param sp = {0}; + bool is_rt_task = (prio != NOT_RT_PRIO); + bool need_dec_flag = false; + bool need_inc_flag = false; + int err; - policy = SCHED_FIFO | SCHED_RESET_ON_FORK; - sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; - sched_setscheduler_nocheck(task, policy, &sp); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + /* change policy to RT */ + if (is_rt_task && (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num))) { + /* change policy from CFS to RT */ + if (!is_rtg_rt_task(task)) { + if (test_and_read_rtg_rt_thread_num() >= RTG_MAX_RT_THREAD_NUM) + goto out; + need_inc_flag = true; + } + /* change RT priority */ + policy = SCHED_FIFO | SCHED_RESET_ON_FORK; + sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; + atomic_inc(&frame_info->curr_rt_thread_num); + } else { + /* change policy from RT to CFS */ + if (!is_rt_task && 
is_rtg_rt_task(task)) + need_dec_flag = true; + } +out: + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + err = sched_setscheduler_nocheck(task, policy, &sp); + if (err == 0) { + if (need_dec_flag) + dec_rtg_rt_thread_num(); + else if (need_inc_flag) + inc_rtg_rt_thread_num(); + } } static void update_frame_task_prio(struct frame_info *frame_info, int prio) @@ -194,6 +312,9 @@ static void update_frame_task_prio(struct frame_info *frame_info, int prio) int i; struct task_struct *thread = NULL; + /* reset curr_rt_thread_num */ + atomic_set(&frame_info->curr_rt_thread_num, 0); + for (i = 0; i < MAX_TID_NUM; i++) { thread = frame_info->thread[i]; if (thread) @@ -227,9 +348,13 @@ static int do_set_rtg_sched(struct task_struct *task, bool is_rtg, if (is_rtg) { if (is_rt_task) { + if (test_and_read_rtg_rt_thread_num() >= RTG_MAX_RT_THREAD_NUM) + // rtg_rt_thread_num is inavailable, set policy to CFS + goto skip_setpolicy; policy = SCHED_FIFO | SCHED_RESET_ON_FORK; sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; } +skip_setpolicy: grpid = grp_id; } err = sched_setscheduler_nocheck(task, policy, &sp); @@ -248,6 +373,14 @@ static int do_set_rtg_sched(struct task_struct *task, bool is_rtg, sched_setscheduler_nocheck(task, policy, &sp); } } + if (err == 0) { + if (is_rtg) { + if (policy != SCHED_NORMAL) + inc_rtg_rt_thread_num(); + } else { + dec_rtg_rt_thread_num(); + } + } return err; } @@ -305,18 +438,37 @@ struct task_struct *update_frame_thread(struct frame_info *frame_info, struct task_struct *old_task) { struct task_struct *task = NULL; + bool is_rt_task = (prio != NOT_RT_PRIO); int new_prio = prio; bool update_ret = false; if (pid > 0) { - if (old_task && (pid == old_task->pid) && (old_prio == new_prio)) + if (old_task && (pid == old_task->pid) && (old_prio == new_prio)) { + if (is_rt_task && atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); return old_task; + } rcu_read_lock(); task = find_task_by_vpid(pid); if (task) get_task_struct(task); rcu_read_unlock(); } + if (task && is_rt_task) { + if (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + else + new_prio = NOT_RT_PRIO; + } + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + set_frame_rtg_thread(frame_info->rtg->id, old_task, false, NOT_RT_PRIO); update_ret = set_frame_rtg_thread(frame_info->rtg->id, task, true, new_prio); if (old_task) @@ -345,6 +497,8 @@ void update_frame_thread_info(struct frame_info *frame_info, if (thread_num > MAX_TID_NUM) thread_num = MAX_TID_NUM; + // reset curr_rt_thread_num + atomic_set(&frame_info->curr_rt_thread_num, 0); write_lock(&frame_info->lock); old_prio = frame_info->prio; real_thread = 0; @@ -373,6 +527,7 @@ static int _init_frame_info(struct frame_info *frame_info, int id) frame_info->frame_time = div_u64(NSEC_PER_SEC, frame_info->frame_rate); frame_info->thread_num = 0; frame_info->prio = NOT_RT_PRIO; + atomic_set(&(frame_info->curr_rt_thread_num), 0); grp = frame_rtg(id); if 
(unlikely(!grp)) { diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h index a75c5bd85ad0..d5f7bb17a4a6 100644 --- a/kernel/sched/rtg/frame_rtg.h +++ b/kernel/sched/rtg/frame_rtg.h @@ -40,6 +40,9 @@ #define FRAME_DEFAULT_MIN_PREV_UTIL 0 #define FRAME_DEFAULT_MAX_PREV_UTIL SCHED_CAPACITY_SCALE +#define DEFAULT_MAX_RT_THREAD 2 +#define RTG_MAX_RT_THREAD_NUM CONFIG_NR_CPUS + enum rtg_type { VIP = 0, TOP_TASK_KEY, @@ -72,4 +75,12 @@ struct task_struct *update_frame_thread(struct frame_info *frame_info, struct task_struct *old_task); void update_frame_thread_info(struct frame_info *frame_info, struct frame_thread_info *frame_thread_info); +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT +int read_rtg_rt_thread_num(void); +#else +static inline int read_rtg_rt_thread_num(void) +{ + return 0; +} +#endif #endif diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index dabadd54e59c..91e2c6abfa4e 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -14,6 +14,10 @@ #include "rtg.h" #include "../walt.h" +#ifdef CONFIG_SCHED_RTG_FRAME +#include "frame_rtg.h" +#endif + #define ADD_TASK 0 #define REM_TASK 1 @@ -1125,6 +1129,10 @@ static void print_rtg_info(struct seq_file *file, grp->util_invalid_interval / NSEC_PER_MSEC); seq_printf_rtg(file, "RTG_CLUSTER : %d\n", grp->preferred_cluster ? grp->preferred_cluster->id : -1); +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT + seq_printf_rtg(file, "RTG_RT_THREAD_NUM : %d/%d\n", + read_rtg_rt_thread_num(), RTG_MAX_RT_THREAD_NUM); +#endif } static char rtg_task_state_to_char(const struct task_struct *tsk) -- Gitee From 75a45686857ec291d6f54122294bf77cb76987a1 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:38:06 +0800 Subject: [PATCH 058/113] sched: Introduce frame-based load tracking ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- 1.Use rtg load tracking in struct frame_info and introduce frame vload to assist load calculation. 2.Fix bug for updating cpufreq when preferred_cluster changed Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/frame_rtg.h | 44 ++++ kernel/sched/cpufreq_schedutil.c | 21 +- kernel/sched/rtg/frame_rtg.c | 384 +++++++++++++++++++++++++++++-- kernel/sched/rtg/frame_rtg.h | 7 + 4 files changed, 441 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/frame_rtg.h b/include/linux/sched/frame_rtg.h index 19dcf1398576..af8a1dd820cb 100644 --- a/include/linux/sched/frame_rtg.h +++ b/include/linux/sched/frame_rtg.h @@ -13,6 +13,14 @@ #define MAX_TID_NUM 5 struct frame_info { + /* + * use rtg load tracking in frame_info + * rtg->curr_window_load -=> the workload of current frame + * rtg->prev_window_load -=> the workload of last frame + * rtg->curr_window_exec -=> the thread's runtime of current frame + * rtg->prev_window_exec -=> the thread's runtime of last frame + * rtg->prev_window_time -=> the actual time of the last frame + */ rwlock_t lock; struct related_thread_group *rtg; int prio; @@ -22,8 +30,44 @@ struct frame_info { u64 frame_time; atomic_t curr_rt_thread_num; atomic_t max_rt_thread_num; + atomic_t frame_sched_state; + atomic_t start_frame_freq; + atomic_t frame_state; + + /* + * frame_vload : the emergency level of current frame. + * max_vload_time : the timeline frame_load increase to FRAME_MAX_VLOAD + * it's always equal to 2 * frame_time / NSEC_PER_MSEC + * + * The closer to the deadline, the higher emergency of current + * frame, so the frame_vload is only related to frame time, + * and grown with time. 
+ */ + u64 frame_vload; + int vload_margin; + int max_vload_time; + + u64 frame_util; + unsigned long status; + unsigned long prev_fake_load_util; + unsigned long prev_frame_load_util; + unsigned long prev_frame_time; + unsigned long prev_frame_exec; + unsigned long prev_frame_load; + unsigned int frame_min_util; + unsigned int frame_max_util; + unsigned int prev_min_util; + unsigned int prev_max_util; + + bool margin_imme; + bool timestamp_skipped; }; struct frame_info *rtg_frame_info(int id); +static inline +struct related_thread_group *frame_info_rtg(const struct frame_info *frame_info) +{ + return frame_info->rtg; +} #endif #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 742ed2fe50de..5fbf2207c0b2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -456,6 +456,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, bool force_update = false; #ifdef CONFIG_SCHED_RTG + unsigned long irq_flag; + force_update = flags & SCHED_CPUFREQ_FORCE_UPDATE; #endif @@ -490,9 +492,17 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (sg_policy->policy->fast_switch_enabled) { sugov_fast_switch(sg_policy, time, next_f); } else { +#ifdef CONFIG_SCHED_RTG + raw_spin_lock_irqsave(&sg_policy->update_lock, irq_flag); +#else raw_spin_lock(&sg_policy->update_lock); +#endif sugov_deferred_update(sg_policy, time, next_f); +#ifdef CONFIG_SCHED_RTG + raw_spin_unlock_irqrestore(&sg_policy->update_lock, irq_flag); +#else raw_spin_unlock(&sg_policy->update_lock); +#endif } } @@ -532,11 +542,16 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int next_f; bool force_update = false; +#ifdef CONFIG_SCHED_RTG + unsigned long irq_flag; +#endif #ifdef CONFIG_SCHED_RTG force_update = flags & SCHED_CPUFREQ_FORCE_UPDATE; -#endif + raw_spin_lock_irqsave(&sg_policy->update_lock, irq_flag); +#else raw_spin_lock(&sg_policy->update_lock); +#endif sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -557,7 +572,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_deferred_update(sg_policy, time, next_f); } +#ifdef CONFIG_SCHED_RTG + raw_spin_unlock_irqrestore(&sg_policy->update_lock, irq_flag); +#else raw_spin_unlock(&sg_policy->update_lock); +#endif } static void sugov_work(struct kthread_work *work) diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index 00f5e04e05b2..e9ab0ef25f5b 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -183,8 +183,12 @@ int set_frame_rate(struct frame_info *frame_info, int rate) frame_info->frame_rate = (unsigned int)rate; frame_info->frame_time = frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); + frame_info->max_vload_time = + frame_info->frame_time / NSEC_PER_MSEC + + frame_info->vload_margin; id = frame_info->rtg->id; trace_rtg_frame_sched(id, "FRAME_QOS", rate); + trace_rtg_frame_sched(id, "FRAME_MAX_TIME", frame_info->max_vload_time); return 0; } @@ -445,7 +449,8 @@ struct task_struct *update_frame_thread(struct frame_info *frame_info, if (pid > 0) { if (old_task && (pid == old_task->pid) && (old_prio == new_prio)) { if (is_rt_task && atomic_read(&frame_info->curr_rt_thread_num) < - atomic_read(&frame_info->max_rt_thread_num)) + atomic_read(&frame_info->max_rt_thread_num) && + (atomic_read(&frame_info->frame_sched_state) == 1)) 
atomic_inc(&frame_info->curr_rt_thread_num); trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", atomic_read(&frame_info->curr_rt_thread_num)); @@ -457,20 +462,24 @@ struct task_struct *update_frame_thread(struct frame_info *frame_info, get_task_struct(task); rcu_read_unlock(); } - if (task && is_rt_task) { - if (atomic_read(&frame_info->curr_rt_thread_num) < - atomic_read(&frame_info->max_rt_thread_num)) - atomic_inc(&frame_info->curr_rt_thread_num); - else - new_prio = NOT_RT_PRIO; - } - trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", - atomic_read(&frame_info->curr_rt_thread_num)); - trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", - read_rtg_rt_thread_num()); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", + atomic_read(&frame_info->frame_sched_state)); + if (atomic_read(&frame_info->frame_sched_state) == 1) { + if (task && is_rt_task) { + if (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + else + new_prio = NOT_RT_PRIO; + } + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); - set_frame_rtg_thread(frame_info->rtg->id, old_task, false, NOT_RT_PRIO); - update_ret = set_frame_rtg_thread(frame_info->rtg->id, task, true, new_prio); + set_frame_rtg_thread(frame_info->rtg->id, old_task, false, NOT_RT_PRIO); + update_ret = set_frame_rtg_thread(frame_info->rtg->id, task, true, new_prio); + } if (old_task) put_task_struct(old_task); if (!update_ret) @@ -514,6 +523,340 @@ void update_frame_thread_info(struct frame_info *frame_info, write_unlock(&frame_info->lock); } +static void do_set_frame_sched_state(struct frame_info *frame_info, + struct task_struct *task, + bool enable, int prio) +{ + int new_prio = prio; + bool is_rt_task = (prio != NOT_RT_PRIO); + + if (enable && is_rt_task) { + if (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + else + new_prio = NOT_RT_PRIO; + } + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + set_frame_rtg_thread(frame_info->rtg->id, task, enable, new_prio); +} + +void set_frame_sched_state(struct frame_info *frame_info, bool enable) +{ + atomic_t *frame_sched_state = NULL; + int prio; + int i; + + if (!frame_info || !frame_info->rtg) + return; + + frame_sched_state = &(frame_info->frame_sched_state); + if (enable) { + if (atomic_read(frame_sched_state) == 1) + return; + atomic_set(frame_sched_state, 1); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 1); + + frame_info->prev_fake_load_util = 0; + frame_info->prev_frame_load_util = 0; + frame_info->frame_vload = 0; + frame_info_rtg_load(frame_info)->curr_window_load = 0; + } else { + if (atomic_read(frame_sched_state) == 0) + return; + atomic_set(frame_sched_state, 0); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 0); + + (void)sched_set_group_normalized_util(frame_info->rtg->id, + 0, RTG_FREQ_NORMAL_UPDATE); + trace_rtg_frame_sched(frame_info->rtg->id, "preferred_cluster", + INVALID_PREFERRED_CLUSTER); + frame_info->status = FRAME_END; + } + + /* reset curr_rt_thread_num */ + 
atomic_set(&frame_info->curr_rt_thread_num, 0); + write_lock(&frame_info->lock); + prio = frame_info->prio; + for (i = 0; i < MAX_TID_NUM; i++) { + if (frame_info->thread[i]) + do_set_frame_sched_state(frame_info, frame_info->thread[i], + enable, prio); + } + write_unlock(&frame_info->lock); + + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_STATUS", + frame_info->status); + trace_rtg_frame_sched(frame_info->rtg->id, "frame_status", + frame_info->status); +} + +static inline bool check_frame_util_invalid(const struct frame_info *frame_info, + u64 timeline) +{ + return ((frame_info_rtg(frame_info)->util_invalid_interval <= timeline) && + (frame_info_rtg_load(frame_info)->curr_window_exec * FRAME_UTIL_INVALID_FACTOR + <= timeline)); +} + +static u64 calc_prev_fake_load_util(const struct frame_info *frame_info) +{ + u64 prev_frame_load = frame_info->prev_frame_load; + u64 prev_frame_time = max_t(unsigned long, frame_info->prev_frame_time, + frame_info->frame_time); + u64 frame_util = 0; + + if (prev_frame_time > 0) + frame_util = div_u64((prev_frame_load << SCHED_CAPACITY_SHIFT), + prev_frame_time); + frame_util = clamp_t(unsigned long, frame_util, + frame_info->prev_min_util, + frame_info->prev_max_util); + + return frame_util; +} + +static u64 calc_prev_frame_load_util(const struct frame_info *frame_info) +{ + u64 prev_frame_load = frame_info->prev_frame_load; + u64 frame_time = frame_info->frame_time; + u64 frame_util = 0; + + if (prev_frame_load >= frame_time) + frame_util = FRAME_MAX_LOAD; + else + frame_util = div_u64((prev_frame_load << SCHED_CAPACITY_SHIFT), + frame_info->frame_time); + frame_util = clamp_t(unsigned long, frame_util, + frame_info->prev_min_util, + frame_info->prev_max_util); + + return frame_util; +} + +/* last frame load tracking */ +static void update_frame_prev_load(struct frame_info *frame_info, bool fake) +{ + /* last frame load tracking */ + frame_info->prev_frame_exec = + frame_info_rtg_load(frame_info)->prev_window_exec; + frame_info->prev_frame_time = + frame_info_rtg(frame_info)->prev_window_time; + frame_info->prev_frame_load = + frame_info_rtg_load(frame_info)->prev_window_load; + + if (fake) + frame_info->prev_fake_load_util = + calc_prev_fake_load_util(frame_info); + else + frame_info->prev_frame_load_util = + calc_prev_frame_load_util(frame_info); +} + +static void do_frame_end(struct frame_info *frame_info, bool fake) +{ + unsigned long prev_util; + int id = frame_info->rtg->id; + + frame_info->status = FRAME_END; + trace_rtg_frame_sched(id, "frame_status", frame_info->status); + + /* last frame load tracking */ + update_frame_prev_load(frame_info, fake); + + /* reset frame_info */ + frame_info->frame_vload = 0; + + /* reset frame_min_util */ + frame_info->frame_min_util = 0; + + if (fake) + prev_util = frame_info->prev_fake_load_util; + else + prev_util = frame_info->prev_frame_load_util; + + frame_info->frame_util = clamp_t(unsigned long, prev_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + + trace_rtg_frame_sched(id, "frame_last_task_time", + frame_info->prev_frame_exec); + trace_rtg_frame_sched(id, "frame_last_time", frame_info->prev_frame_time); + trace_rtg_frame_sched(id, "frame_last_load", frame_info->prev_frame_load); + trace_rtg_frame_sched(id, "frame_last_load_util", + frame_info->prev_frame_load_util); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); +} + +/* + * frame_load : calculate frame load using exec util + */ +static inline 
u64 calc_frame_exec(const struct frame_info *frame_info) +{ + if (frame_info->frame_time > 0) + return div_u64((frame_info_rtg_load(frame_info)->curr_window_exec << + SCHED_CAPACITY_SHIFT), frame_info->frame_time); + else + return 0; +} + +/* + * real_util: + * max(last_util, virtual_util, boost_util, phase_util, frame_min_util) + */ +static u64 calc_frame_util(const struct frame_info *frame_info, bool fake) +{ + unsigned long load_util; + + if (fake) + load_util = frame_info->prev_fake_load_util; + else + load_util = frame_info->prev_frame_load_util; + + load_util = max_t(unsigned long, load_util, frame_info->frame_vload); + load_util = clamp_t(unsigned long, load_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + + return load_util; +} + +/* + * frame_vload [0~1024] + * vtime: now - timestamp + * max_time: frame_info->frame_time + vload_margin + * load = F(vtime) + * = vtime ^ 2 - vtime * max_time + FRAME_MAX_VLOAD * vtime / max_time; + * = vtime * (vtime + FRAME_MAX_VLOAD / max_time - max_time); + * [0, 0] -=> [max_time, FRAME_MAX_VLOAD] + * + */ +static u64 calc_frame_vload(const struct frame_info *frame_info, u64 timeline) +{ + u64 vload; + int vtime = div_u64(timeline, NSEC_PER_MSEC); + int max_time = frame_info->max_vload_time; + int factor; + + if ((max_time <= 0) || (vtime > max_time)) + return FRAME_MAX_VLOAD; + + factor = vtime + FRAME_MAX_VLOAD / max_time; + /* margin maybe negative */ + if ((vtime <= 0) || (factor <= max_time)) + return 0; + + vload = (u64)vtime * (u64)(factor - max_time); + + return vload; +} + +static int update_frame_info_tick_inner(int id, struct frame_info *frame_info, + u64 timeline) +{ + switch (frame_info->status) { + case FRAME_INVALID: + case FRAME_END: + if (timeline >= frame_info->frame_time) { + /* + * fake FRAME_END here to rollover frame_window. + */ + sched_set_group_window_rollover(id); + do_frame_end(frame_info, true); + } else { + frame_info->frame_vload = calc_frame_exec(frame_info); + frame_info->frame_util = + calc_frame_util(frame_info, true); + } + + /* when not in boost, start tick timer */ + break; + case FRAME_START: + /* check frame_util invalid */ + if (!check_frame_util_invalid(frame_info, timeline)) { + /* frame_vload statistic */ + frame_info->frame_vload = calc_frame_vload(frame_info, timeline); + /* frame_util statistic */ + frame_info->frame_util = + calc_frame_util(frame_info, false); + } else { + frame_info->status = FRAME_INVALID; + trace_rtg_frame_sched(id, "FRAME_STATUS", + frame_info->status); + trace_rtg_frame_sched(id, "frame_status", + frame_info->status); + + /* + * trigger FRAME_END to rollover frame_window, + * we treat FRAME_INVALID as FRAME_END. 
+ */ + sched_set_group_window_rollover(id); + do_frame_end(frame_info, false); + } + break; + default: + return -EINVAL; + } + + return 0; +} + +static inline struct frame_info *rtg_frame_info_inner( + const struct related_thread_group *grp) +{ + return (struct frame_info *)grp->private_data; +} + +/* + * update CPUFREQ and PLACEMENT when frame task running (in tick) and migration + */ +static void update_frame_info_tick(struct related_thread_group *grp) +{ + u64 window_start; + u64 wallclock; + u64 timeline; + struct frame_info *frame_info = NULL; + int id = grp->id; + + rcu_read_lock(); + frame_info = rtg_frame_info_inner(grp); + window_start = grp->window_start; + rcu_read_unlock(); + if (unlikely(!frame_info)) + return; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return; + trace_rtg_frame_sched(id, "frame_status", frame_info->status); + + wallclock = ktime_get_ns(); + timeline = wallclock - window_start; + + trace_rtg_frame_sched(id, "update_curr_pid", current->pid); + trace_rtg_frame_sched(id, "frame_timeline", div_u64(timeline, NSEC_PER_MSEC)); + + if (update_frame_info_tick_inner(grp->id, frame_info, timeline) == -EINVAL) + return; + + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + + sched_set_group_normalized_util(grp->id, + frame_info->frame_util, RTG_FREQ_NORMAL_UPDATE); + + if (grp->preferred_cluster) + trace_rtg_frame_sched(id, "preferred_cluster", + grp->preferred_cluster->id); +} + +const struct rtg_class frame_rtg_class = { + .sched_update_rtg_tick = update_frame_info_tick, +}; + static int _init_frame_info(struct frame_info *frame_info, int id) { struct related_thread_group *grp = NULL; @@ -528,6 +871,18 @@ static int _init_frame_info(struct frame_info *frame_info, int id) frame_info->thread_num = 0; frame_info->prio = NOT_RT_PRIO; atomic_set(&(frame_info->curr_rt_thread_num), 0); + atomic_set(&(frame_info->frame_sched_state), 0); + frame_info->vload_margin = DEFAULT_VLOAD_MARGIN; + frame_info->max_vload_time = + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + + frame_info->vload_margin; + frame_info->frame_min_util = FRAME_DEFAULT_MIN_UTIL; + frame_info->frame_max_util = FRAME_DEFAULT_MAX_UTIL; + frame_info->prev_min_util = FRAME_DEFAULT_MIN_PREV_UTIL; + frame_info->prev_max_util = FRAME_DEFAULT_MAX_PREV_UTIL; + frame_info->margin_imme = false; + frame_info->timestamp_skipped = false; + frame_info->status = FRAME_END; grp = frame_rtg(id); if (unlikely(!grp)) { @@ -537,6 +892,7 @@ static int _init_frame_info(struct frame_info *frame_info, int id) raw_spin_lock_irqsave(&grp->lock, flags); grp->private_data = frame_info; + grp->rtg_class = &frame_rtg_class; raw_spin_unlock_irqrestore(&grp->lock, flags); frame_info->rtg = grp; diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h index d5f7bb17a4a6..96e0f6f1eb8e 100644 --- a/kernel/sched/rtg/frame_rtg.h +++ b/kernel/sched/rtg/frame_rtg.h @@ -42,6 +42,7 @@ #define DEFAULT_MAX_RT_THREAD 2 #define RTG_MAX_RT_THREAD_NUM CONFIG_NR_CPUS +#define INVALID_PREFERRED_CLUSTER 10 enum rtg_type { VIP = 0, @@ -83,4 +84,10 @@ static inline int read_rtg_rt_thread_num(void) return 0; } #endif +static inline +struct group_ravg *frame_info_rtg_load(const struct frame_info *frame_info) +{ + return &frame_info_rtg(frame_info)->ravg; +} +void set_frame_sched_state(struct frame_info *frame_info, bool enable); #endif -- Gitee From 14a14259a2587a5fa6b983618de5a36f9c49c4db Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 
2022 17:39:06 +0800 Subject: [PATCH 059/113] sched: Add utility functions for frame-aware scheduling ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Support setting min/max util and margin for frame RTG. Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/frame_rtg.h | 1 + kernel/sched/rtg/frame_rtg.c | 256 +++++++++++++++++++++++++++++++- kernel/sched/rtg/frame_rtg.h | 11 ++ 3 files changed, 267 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/frame_rtg.h b/include/linux/sched/frame_rtg.h index af8a1dd820cb..71ecf65c599e 100644 --- a/include/linux/sched/frame_rtg.h +++ b/include/linux/sched/frame_rtg.h @@ -58,6 +58,7 @@ struct frame_info { unsigned int frame_max_util; unsigned int prev_min_util; unsigned int prev_max_util; + unsigned int frame_boost_min_util; bool margin_imme; bool timestamp_skipped; diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index e9ab0ef25f5b..53e97b2a9c25 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -184,7 +184,7 @@ int set_frame_rate(struct frame_info *frame_info, int rate) frame_info->frame_rate = (unsigned int)rate; frame_info->frame_time = frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); frame_info->max_vload_time = - frame_info->frame_time / NSEC_PER_MSEC + + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + frame_info->vload_margin; id = frame_info->rtg->id; trace_rtg_frame_sched(id, "FRAME_QOS", rate); @@ -811,6 +811,12 @@ static inline struct frame_info *rtg_frame_info_inner( return (struct frame_info *)grp->private_data; } +static inline void frame_boost(struct frame_info *frame_info) +{ + if (frame_info->frame_util < frame_info->frame_boost_min_util) + frame_info->frame_util = frame_info->frame_boost_min_util; +} + /* * update CPUFREQ and PLACEMENT when frame task running (in tick) and migration */ @@ -842,6 +848,7 @@ static void update_frame_info_tick(struct related_thread_group *grp) if (update_frame_info_tick_inner(grp->id, frame_info, timeline) == -EINVAL) return; + frame_boost(frame_info); trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); @@ -857,6 +864,253 @@ const struct rtg_class frame_rtg_class = { .sched_update_rtg_tick = update_frame_info_tick, }; +int set_frame_margin(struct frame_info *frame_info, int margin) +{ + int id; + + if ((margin < MIN_VLOAD_MARGIN) || (margin > MAX_VLOAD_MARGIN)) { + pr_err("[FRAME_RTG]: %s invalid MARGIN value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->vload_margin = margin; + frame_info->max_vload_time = + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + + frame_info->vload_margin; + id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_MARGIN", margin); + trace_rtg_frame_sched(id, "FRAME_MAX_TIME", frame_info->max_vload_time); + + return 0; +} + +static void set_frame_start(struct frame_info *frame_info) +{ + int id = frame_info->rtg->id; + + if (likely(frame_info->status == FRAME_START)) { + /* + * START -=> START -=> ...... 
+ * FRMAE_START is + * the end of last frame + * the start of the current frame + */ + update_frame_prev_load(frame_info, false); + } else if ((frame_info->status == FRAME_END) || + (frame_info->status == FRAME_INVALID)) { + /* START -=> END -=> [START] + * FRAME_START is + * only the start of current frame + * we shoudn't tracking the last rtg-window + * [FRAME_END, FRAME_START] + * it's not an available frame window + */ + update_frame_prev_load(frame_info, true); + frame_info->status = FRAME_START; + } + trace_rtg_frame_sched(id, "FRAME_STATUS", frame_info->status); + trace_rtg_frame_sched(id, "frame_last_task_time", + frame_info->prev_frame_exec); + trace_rtg_frame_sched(id, "frame_last_time", frame_info->prev_frame_time); + trace_rtg_frame_sched(id, "frame_last_load", frame_info->prev_frame_load); + trace_rtg_frame_sched(id, "frame_last_load_util", + frame_info->prev_frame_load_util); + + /* new_frame_start */ + if (!frame_info->margin_imme) { + frame_info->frame_vload = 0; + frame_info->frame_util = clamp_t(unsigned long, + frame_info->prev_frame_load_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + } else { + frame_info->frame_vload = calc_frame_vload(frame_info, 0); + frame_info->frame_util = calc_frame_util(frame_info, false); + } + + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); +} + +static void set_frame_end(struct frame_info *frame_info) +{ + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_STATUS", FRAME_END); + do_frame_end(frame_info, false); +} + +static int update_frame_timestamp(unsigned long status, + struct frame_info *frame_info, struct related_thread_group *grp) +{ + int id = frame_info->rtg->id; + + /* SCHED_FRAME timestamp */ + switch (status) { + case FRAME_START: + /* collect frame_info when frame_end timestamp coming */ + set_frame_start(frame_info); + break; + case FRAME_END: + /* FRAME_END should only set and update freq once */ + if (unlikely(frame_info->status == FRAME_END)) + return 0; + set_frame_end(frame_info); + break; + default: + pr_err("[FRAME_RTG]: %s invalid timestamp(status)\n", + __func__); + return -EINVAL; + } + + frame_boost(frame_info); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + + /* update cpufreq force when frame_stop */ + sched_set_group_normalized_util(grp->id, + frame_info->frame_util, RTG_FREQ_FORCE_UPDATE); + if (grp->preferred_cluster) + trace_rtg_frame_sched(id, "preferred_cluster", + grp->preferred_cluster->id); + + return 0; +} + +static int set_frame_status(struct frame_info *frame_info, unsigned long status) +{ + struct related_thread_group *grp = NULL; + int id; + + if (!frame_info) + return -EINVAL; + + grp = frame_info->rtg; + if (unlikely(!grp)) + return -EINVAL; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return -EINVAL; + + if (!(status & FRAME_SETTIME) || + (status == (unsigned long)FRAME_SETTIME_PARAM)) { + pr_err("[FRAME_RTG]: %s invalid timetsamp(status)\n", + __func__); + return -EINVAL; + } + + if (status & FRAME_TIMESTAMP_SKIP_START) { + frame_info->timestamp_skipped = true; + status &= ~FRAME_TIMESTAMP_SKIP_START; + } else if (status & FRAME_TIMESTAMP_SKIP_END) { + frame_info->timestamp_skipped = false; + status &= ~FRAME_TIMESTAMP_SKIP_END; + } else if (frame_info->timestamp_skipped) { + /* + * skip the following timestamp until + * FRAME_TIMESTAMP_SKIPPED reset + */ + return 0; + } + id = grp->id; + trace_rtg_frame_sched(id, "FRAME_TIMESTAMP_SKIPPED", + frame_info->timestamp_skipped); + trace_rtg_frame_sched(id, "FRAME_MAX_UTIL", 
frame_info->frame_max_util); + + if (status & FRAME_USE_MARGIN_IMME) { + frame_info->margin_imme = true; + status &= ~FRAME_USE_MARGIN_IMME; + } else { + frame_info->margin_imme = false; + } + trace_rtg_frame_sched(id, "FRAME_MARGIN_IMME", frame_info->margin_imme); + trace_rtg_frame_sched(id, "FRAME_TIMESTAMP", status); + + return update_frame_timestamp(status, frame_info, grp); +} + +int set_frame_timestamp(struct frame_info *frame_info, unsigned long timestamp) +{ + int ret; + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return -EINVAL; + + ret = sched_set_group_window_rollover(frame_info->rtg->id); + if (!ret) + ret = set_frame_status(frame_info, timestamp); + + return ret; +} + +int set_frame_min_util(struct frame_info *frame_info, int min_util, bool is_boost) +{ + int id; + + if (unlikely((min_util < 0) || (min_util > SCHED_CAPACITY_SCALE))) { + pr_err("[FRAME_RTG]: %s invalid min_util value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + id = frame_info->rtg->id; + if (is_boost) { + frame_info->frame_boost_min_util = min_util; + trace_rtg_frame_sched(id, "FRAME_BOOST_MIN_UTIL", min_util); + } else { + frame_info->frame_min_util = min_util; + + frame_info->frame_util = calc_frame_util(frame_info, false); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + sched_set_group_normalized_util(id, + frame_info->frame_util, RTG_FREQ_FORCE_UPDATE); + } + + return 0; +} + +int set_frame_max_util(struct frame_info *frame_info, int max_util) +{ + int id; + + if ((max_util < 0) || (max_util > SCHED_CAPACITY_SCALE)) { + pr_err("[FRAME_RTG]: %s invalid max_util value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->frame_max_util = max_util; + id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_MAX_UTIL", frame_info->frame_max_util); + + return 0; +} + +struct frame_info *lookup_frame_info_by_grp_id(int grp_id) +{ + if (grp_id >= (MULTI_FRAME_ID + MULTI_FRAME_NUM) || (grp_id <= 0)) + return NULL; + if (grp_id >= MULTI_FRAME_ID) { + read_lock(&g_id_manager.lock); + if (!test_bit(grp_id - MULTI_FRAME_ID, g_id_manager.id_map)) { + read_unlock(&g_id_manager.lock); + return NULL; + } + read_unlock(&g_id_manager.lock); + return rtg_frame_info(grp_id); + } else + return rtg_frame_info(grp_id); +} + static int _init_frame_info(struct frame_info *frame_info, int id) { struct related_thread_group *grp = NULL; diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h index 96e0f6f1eb8e..5aea6cdc9ee8 100644 --- a/kernel/sched/rtg/frame_rtg.h +++ b/kernel/sched/rtg/frame_rtg.h @@ -22,6 +22,12 @@ #define FRAME_START (1 << 0) #define FRAME_END (1 << 1) #define FRAME_INVALID (1 << 2) +#define FRAME_USE_MARGIN_IMME (1 << 4) +#define FRAME_TIMESTAMP_SKIP_START (1 << 5) +#define FRAME_TIMESTAMP_SKIP_END (1 << 6) +#define FRAME_SETTIME (FRAME_START | FRAME_END | \ + FRAME_USE_MARGIN_IMME) +#define FRAME_SETTIME_PARAM (-1) #define DEFAULT_FRAME_RATE 60 #define MIN_FRAME_RATE 1 @@ -90,4 +96,9 @@ struct group_ravg *frame_info_rtg_load(const struct frame_info *frame_info) return &frame_info_rtg(frame_info)->ravg; } void set_frame_sched_state(struct frame_info *frame_info, bool enable); +int set_frame_margin(struct frame_info *frame_info, int margin); +int set_frame_timestamp(struct frame_info *frame_info, unsigned long timestamp); +int set_frame_max_util(struct frame_info *frame_info, int 
max_util); +int set_frame_min_util(struct frame_info *frame_info, int min_util, bool is_boost); +struct frame_info *lookup_frame_info_by_grp_id(int grp_id); #endif -- Gitee From b847bcfc1b883d1fdc7999ab4e49e32b9b9d86b6 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 17:45:11 +0800 Subject: [PATCH 060/113] sched: Introduce rtg ctrl interface ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Introduce the control interface for rtg Signed-off-by: Dai Li Signed-off-by: Li Ming --- include/linux/sched/rtg_ctrl.h | 37 +++++++++ kernel/sched/rtg/Makefile | 2 +- kernel/sched/rtg/rtg_ctrl.c | 142 +++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg_ctrl.h | 43 ++++++++++ 4 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/rtg_ctrl.h create mode 100644 kernel/sched/rtg/rtg_ctrl.c create mode 100644 kernel/sched/rtg/rtg_ctrl.h diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h new file mode 100644 index 000000000000..e8a611608a4c --- /dev/null +++ b/include/linux/sched/rtg_ctrl.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * rtg control interface + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#ifndef __SCHED_RTG_CTL_H +#define __SCHED_RTG_CTL_H + +#include + +#define SYSTEM_SERVER_UID 1000 +#define MIN_APP_UID 10000 +#define MAX_BOOST_DURATION_MS 5000 + +#define RTG_SCHED_IPC_MAGIC 0XAB + +#define CMD_ID_SET_ENABLE \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_ENABLE, struct rtg_enable_data) + +enum ioctl_abi_format { + IOCTL_ABI_ARM32, + IOCTL_ABI_AARCH64, +}; + +enum rtg_sched_cmdid { + SET_ENABLE = 1, + RTG_CTRL_MAX_NR, +}; + +struct rtg_enable_data { + int enable; + int len; + char *data; +}; +#endif diff --git a/kernel/sched/rtg/Makefile b/kernel/sched/rtg/Makefile index 13795817f087..4d55523d1f32 100644 --- a/kernel/sched/rtg/Makefile +++ b/kernel/sched/rtg/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SCHED_RTG) += rtg.o -obj-$(CONFIG_SCHED_RTG_FRAME) += frame_rtg.o +obj-$(CONFIG_SCHED_RTG_FRAME) += frame_rtg.o rtg_ctrl.o diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c new file mode 100644 index 000000000000..89d639031479 --- /dev/null +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * rtg control entry + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. 
+ */ + +#include "rtg.h" +#include "rtg_ctrl.h" + +#include +#include +#include +#include +#include + +atomic_t g_rtg_enable = ATOMIC_INIT(0); +typedef long (*rtg_ctrl_func)(int abi, void __user *arg); + +static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { + NULL, /* reserved */ + ctrl_set_enable, // 1 +}; + +static void rtg_enable(const struct rtg_enable_data *data) +{ + char temp[MAX_DATA_LEN]; + + if (atomic_read(&g_rtg_enable) == 1) { + pr_info("[SCHED_RTG] already enabled!\n"); + return; + } + if ((data->len <= 0) || (data->len >= MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] %s data len invalid\n", __func__); + return; + } + if (copy_from_user(&temp, (void __user *)data->data, data->len)) { + pr_err("[SCHED_RTG] %s copy user data failed\n", __func__); + return; + } + + atomic_set(&g_rtg_enable, 1); + pr_info("[SCHED_RTG] enabled!\n"); +} + +long ctrl_set_enable(int abi, void __user *uarg) +{ + struct rtg_enable_data rs_enable; + + if (copy_from_user(&rs_enable, uarg, sizeof(rs_enable))) { + pr_err("[SCHED_RTG] CMD_ID_SET_ENABLE copy data failed\n"); + return -INVALID_ARG; + } + rtg_enable(&rs_enable); + + return SUCC; +} + +static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *uarg = (void __user *)(uintptr_t)arg; + unsigned int func_id = _IOC_NR(cmd); + + if (uarg == NULL) { + pr_err("[SCHED_RTG] %s: invalid user uarg\n", __func__); + return -EINVAL; + } + + if ((cmd != CMD_ID_SET_ENABLE) && !atomic_read(&g_rtg_enable)) { + pr_err("[SCHED_RTG] Rtg not enabled yet.\n"); + return -RTG_DISABLED; + } + + if (_IOC_TYPE(cmd) != RTG_SCHED_IPC_MAGIC) { + pr_err("[SCHED_RTG] %s: RTG_SCHED_IPC_MAGIC fail, TYPE=%d\n", + __func__, _IOC_TYPE(cmd)); + return -INVALID_MAGIC; + } + if (func_id >= RTG_CTRL_MAX_NR) { + pr_err("[SCHED_RTG] %s: RTG_MAX_NR fail, _IOC_NR(cmd)=%d, MAX_NR=%d\n", + __func__, _IOC_NR(cmd), RTG_CTRL_MAX_NR); + return -INVALID_CMD; + } + + if (g_func_array[func_id] != NULL) + return (*g_func_array[func_id])(abi, uarg); + + return -EINVAL; +} + +static int proc_rtg_open(struct inode *inode, struct file *filp) +{ + return SUCC; +} + +static int proc_rtg_release(struct inode *inode, struct file *filp) +{ + return SUCC; +} + +static long proc_rtg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return do_proc_rtg_ioctl(IOCTL_ABI_AARCH64, file, cmd, arg); +} + +#ifdef CONFIG_COMPAT +static long proc_rtg_compat_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + return do_proc_rtg_ioctl(IOCTL_ABI_ARM32, file, cmd, + (unsigned long)(compat_ptr((compat_uptr_t)arg))); +} +#endif + +static const struct file_operations rtg_ctrl_fops = { + .open = proc_rtg_open, + .release = proc_rtg_release, + .unlocked_ioctl = proc_rtg_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_rtg_compat_ioctl, +#endif +}; + +static struct miscdevice rtg_ctrl_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sched_rtg_ctrl", + .fops = &rtg_ctrl_fops, + .mode = 0666, +}; + +static int __init rtg_ctrl_dev_init(void) +{ + return misc_register(&rtg_ctrl_device); +} + +static void __exit rtg_ctrl_dev_exit(void) +{ + misc_deregister(&rtg_ctrl_device); +} + +module_init(rtg_ctrl_dev_init); +module_exit(rtg_ctrl_dev_exit); diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h new file mode 100644 index 000000000000..2705c722f04f --- /dev/null +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * rtg control interface + * + * Copyright (c) 2022-2023 Huawei 
Technologies Co., Ltd. + */ + +#ifndef __RTG_CTL_H +#define __RTG_CTL_H + +#include +#include +#include + +#include "frame_rtg.h" + +/* set rtg */ +#define INVALID_VALUE 0xffff +#define DEFAULT_RT_PRIO 97 + +#define MAX_DATA_LEN 256 +#define DECIMAL 10 +#define DEFAULT_MAX_UTIL 1024 +#define MAX_SUBPROCESS_NUM 8 + +#define RTG_ID_INVALID (-1) +#define DEFAULT_MAX_RT_FRAME 3 +#define MAX_RT_THREAD (MAX_TID_NUM + 2) +#define INIT_VALUE (-1) +#define UPDATE_RTG_FRAME (1 << 0) +#define ADD_RTG_FRAME (1 << 1) +#define CLEAR_RTG_FRAME (1 << 2) + +/* rtg_ctrl func list */ +long ctrl_set_enable(int abi, void __user *uarg); +enum rtg_err_no { + SUCC = 0, + RTG_DISABLED = 1, + INVALID_ARG, + INVALID_MAGIC, + INVALID_CMD, +}; +#endif -- Gitee From db5b90b5b400bddbc3c50fcb79ad9a2404f68530 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 22:02:04 +0800 Subject: [PATCH 061/113] sched: Add frame RTG's enable config ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- add enable function and do some settings in it Signed-off-by: Dai Li --- kernel/sched/rtg/rtg_ctrl.c | 213 ++++++++++++++++++++++++++++++++++-- kernel/sched/rtg/rtg_ctrl.h | 18 ++- 2 files changed, 220 insertions(+), 11 deletions(-) diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index 89d639031479..a8b1691074ad 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -15,35 +15,119 @@ #include atomic_t g_rtg_enable = ATOMIC_INIT(0); +atomic_t g_enable_type = ATOMIC_INIT(ALL_ENABLE); // default: all enable +static atomic_t g_rt_frame_num = ATOMIC_INIT(0); +static int g_frame_max_util = DEFAULT_MAX_UTIL; typedef long (*rtg_ctrl_func)(int abi, void __user *arg); +static long ctrl_set_enable(int abi, void __user *uarg); + static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ ctrl_set_enable, // 1 }; -static void rtg_enable(const struct rtg_enable_data *data) +static int init_proc_state(const int *config, int len); +static void deinit_proc_state(void); + +static int set_enable_config(char *config_str) +{ + char *p = NULL; + char *tmp = NULL; + int value; + int config[RTG_CONFIG_NUM]; + int i; + int ret = 0; + + for (i = 0; i < RTG_CONFIG_NUM; i++) + config[i] = INVALID_VALUE; + /* eg: key1:value1;key2:value2;key3:value3 */ + for (p = strsep(&config_str, ";"); p != NULL; + p = strsep(&config_str, ";")) { + tmp = strsep(&p, ":"); + if ((tmp == NULL) || (p == NULL)) + continue; + if (kstrtoint((const char *)p, DECIMAL, &value)) + return -INVALID_ARG; + + if (!strcmp(tmp, "sched_cycle")) + config[RTG_FREQ_CYCLE] = value; + else if (!strcmp(tmp, "frame_max_util")) + config[RTG_FRAME_MAX_UTIL] = value; + else if (!strcmp(tmp, "invalid_interval")) + config[RTG_INVALID_INTERVAL] = value; + else if (!strcmp(tmp, "enable_type")) + atomic_set(&g_enable_type, value); + else + continue; + } + + for (i = 0; i < RTG_CONFIG_NUM; i++) + pr_info("[SCHED_RTG] config[%d] = %d\n", i, config[i]); + + ret = init_proc_state(config, RTG_CONFIG_NUM); + + return ret; +} + +static void rtg_enable(int abi, const struct rtg_enable_data *data) { char temp[MAX_DATA_LEN]; + int ret = -1; if (atomic_read(&g_rtg_enable) == 1) { pr_info("[SCHED_RTG] already enabled!\n"); return; } + if ((data->len <= 0) || (data->len >= MAX_DATA_LEN)) { pr_err("[SCHED_RTG] %s data len invalid\n", __func__); return; } - if (copy_from_user(&temp, (void __user *)data->data, data->len)) { + + switch (abi) { + case IOCTL_ABI_ARM32: + ret = copy_from_user(&temp, + (void __user 
*)compat_ptr((compat_uptr_t)data->data), data->len); + break; + case IOCTL_ABI_AARCH64: + ret = copy_from_user(&temp, (void __user *)data->data, data->len); + break; + default: + pr_err("[SCHED_RTG] abi format error\n"); + break; + } + if (ret) { pr_err("[SCHED_RTG] %s copy user data failed\n", __func__); return; } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + temp[data->len] = '\0'; + + if (set_enable_config(&temp) != SUCC) { + pr_err("[SCHED_RTG] %s failed!\n", __func__); + return; + } +#pragma GCC diagnostic pop + atomic_set(&g_rtg_enable, 1); pr_info("[SCHED_RTG] enabled!\n"); } -long ctrl_set_enable(int abi, void __user *uarg) +static void rtg_disable(void) +{ + if (atomic_read(&g_rtg_enable) == 0) { + pr_info("[SCHED_RTG] already disabled!\n"); + return; + } + pr_info("[SCHED_RTG] disabled!\n"); + atomic_set(&g_rtg_enable, 0); + deinit_proc_state(); +} + +static long ctrl_set_enable(int abi, void __user *uarg) { struct rtg_enable_data rs_enable; @@ -51,11 +135,34 @@ long ctrl_set_enable(int abi, void __user *uarg) pr_err("[SCHED_RTG] CMD_ID_SET_ENABLE copy data failed\n"); return -INVALID_ARG; } - rtg_enable(&rs_enable); + if (rs_enable.enable == 1) + rtg_enable(abi, &rs_enable); + else + rtg_disable(); return SUCC; } +static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) +{ + struct frame_thread_info frame_thread_info; + int i; + + if (!reset && frame_info) + frame_thread_info.prio = frame_info->prio; + else + frame_thread_info.prio = NOT_RT_PRIO; + for (i = 0; i < MAX_TID_NUM; i++) + frame_thread_info.thread[i] = -1; + frame_thread_info.thread_num = MAX_TID_NUM; + update_frame_thread_info(frame_info, &frame_thread_info); + if (reset) { + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); + atomic_set(&frame_info->frame_sched_state, 0); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 0); + } +} + static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsigned long arg) { void __user *uarg = (void __user *)(uintptr_t)arg; @@ -66,16 +173,17 @@ static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsi return -EINVAL; } - if ((cmd != CMD_ID_SET_ENABLE) && !atomic_read(&g_rtg_enable)) { - pr_err("[SCHED_RTG] Rtg not enabled yet.\n"); - return -RTG_DISABLED; - } - if (_IOC_TYPE(cmd) != RTG_SCHED_IPC_MAGIC) { pr_err("[SCHED_RTG] %s: RTG_SCHED_IPC_MAGIC fail, TYPE=%d\n", __func__, _IOC_TYPE(cmd)); return -INVALID_MAGIC; } + + if ((func_id != SET_ENABLE) && !atomic_read(&g_rtg_enable)) { + pr_err("[SCHED_RTG] CMD_ID %x error: Rtg not enabled yet.\n", cmd); + return -RTG_DISABLED; + } + if (func_id >= RTG_CTRL_MAX_NR) { pr_err("[SCHED_RTG] %s: RTG_MAX_NR fail, _IOC_NR(cmd)=%d, MAX_NR=%d\n", __func__, _IOC_NR(cmd), RTG_CTRL_MAX_NR); @@ -88,6 +196,93 @@ static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsi return -EINVAL; } +static void reset_frame_info(struct frame_info *frame_info) +{ + clear_rtg_frame_thread(frame_info, true); + atomic_set(&frame_info->frame_state, -1); + atomic_set(&frame_info->curr_rt_thread_num, 0); + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); +} + +static int do_init_proc_state(int rtgid, const int *config, int len) +{ + struct related_thread_group *grp = NULL; + struct frame_info *frame_info = NULL; + + grp = lookup_related_thread_group(rtgid); + if (unlikely(!grp)) + return -EINVAL; + + frame_info = (struct frame_info *)grp->private_data; + if (!frame_info) + return 
-EINVAL; + + reset_frame_info(frame_info); + + if ((config[RTG_FREQ_CYCLE] >= MIN_FREQ_CYCLE) && + (config[RTG_FREQ_CYCLE] <= MAX_FREQ_CYCLE)) + sched_set_group_freq_update_interval(rtgid, + (unsigned int)config[RTG_FREQ_CYCLE]); + else + sched_set_group_freq_update_interval(rtgid, + DEFAULT_FREQ_CYCLE); + + if (config[RTG_INVALID_INTERVAL] != INVALID_VALUE) + sched_set_group_util_invalid_interval(rtgid, + config[RTG_INVALID_INTERVAL]); + else + sched_set_group_util_invalid_interval(rtgid, + DEFAULT_INVALID_INTERVAL); + + set_frame_max_util(frame_info, g_frame_max_util); + + return SUCC; +} + +static int init_proc_state(const int *config, int len) +{ + int ret; + int id; + + if ((config == NULL) || (len != RTG_CONFIG_NUM)) + return -INVALID_ARG; + + if ((config[RTG_FRAME_MAX_UTIL] > 0) && + (config[RTG_FRAME_MAX_UTIL] < DEFAULT_MAX_UTIL)) + g_frame_max_util = config[RTG_FRAME_MAX_UTIL]; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + ret = do_init_proc_state(id, config, len); + if (ret) { + pr_err("[SCHED_RTG] init proc state for FRAME_ID=%d failed, ret=%d\n", + id, ret); + return ret; + } + } + atomic_set(&g_rt_frame_num, 0); + + return SUCC; +} + +static void deinit_proc_state(void) +{ + int id; + struct frame_info *frame_info = NULL; + struct related_thread_group *grp = NULL; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + grp = lookup_related_thread_group(id); + if (unlikely(!grp)) + return; + + frame_info = (struct frame_info *)grp->private_data; + if (frame_info) + reset_frame_info(frame_info); + } + clear_multi_frame_info(); + atomic_set(&g_rt_frame_num, 0); +} + static int proc_rtg_open(struct inode *inode, struct file *filp) { return SUCC; diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h index 2705c722f04f..b2c0101ca419 100644 --- a/kernel/sched/rtg/rtg_ctrl.h +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -31,8 +31,18 @@ #define ADD_RTG_FRAME (1 << 1) #define CLEAR_RTG_FRAME (1 << 2) -/* rtg_ctrl func list */ -long ctrl_set_enable(int abi, void __user *uarg); +#define DEFAULT_FREQ_CYCLE 4 +#define MIN_FREQ_CYCLE 1 +#define MAX_FREQ_CYCLE 16 +#define DEFAULT_INVALID_INTERVAL 50 + +enum rtg_config { + RTG_FREQ_CYCLE, + RTG_FRAME_MAX_UTIL, + RTG_INVALID_INTERVAL, + RTG_CONFIG_NUM, +}; + enum rtg_err_no { SUCC = 0, RTG_DISABLED = 1, @@ -40,4 +50,8 @@ enum rtg_err_no { INVALID_MAGIC, INVALID_CMD, }; +enum enable_type { + ALL_ENABLE = 1, + ENABLE_MAX +}; #endif -- Gitee From e935501de4b479a0d1e2fde610ea094e30f48989 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 22:49:20 +0800 Subject: [PATCH 062/113] sched: Add group control for frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Add, clear, destroy frame RTG interface Signed-off-by: Dai Li --- include/linux/sched/rtg_ctrl.h | 12 ++ kernel/sched/rtg/rtg_ctrl.c | 247 +++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg_ctrl.h | 21 +++ 3 files changed, 280 insertions(+) diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h index e8a611608a4c..5414b861f59c 100644 --- a/include/linux/sched/rtg_ctrl.h +++ b/include/linux/sched/rtg_ctrl.h @@ -18,6 +18,8 @@ #define CMD_ID_SET_ENABLE \ _IOWR(RTG_SCHED_IPC_MAGIC, SET_ENABLE, struct rtg_enable_data) +#define CMD_ID_SET_RTG \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_RTG, struct rtg_str_data) enum ioctl_abi_format { IOCTL_ABI_ARM32, @@ -26,9 +28,19 @@ enum ioctl_abi_format { enum rtg_sched_cmdid { SET_ENABLE = 1, + SET_RTG, 
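For reference, before any of the SET_* commands in this enum can be issued, userspace has to enable the interface with CMD_ID_SET_ENABLE, passing the key:value string that set_enable_config() parses (sched_cycle, frame_max_util, invalid_interval, enable_type). A minimal sketch follows; it assumes the rtg_ctrl.h header is visible to userspace and uses a hypothetical proc node path, since the node backing do_proc_rtg_ioctl() is not named in this excerpt, and the config values are placeholders within the ranges the series checks.

/* Illustrative sketch only, not part of the patch above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sched/rtg_ctrl.h>   /* assumed to be exported to userspace */

/* Hypothetical path; the actual proc node name is not shown in this excerpt. */
#define RTG_CTRL_NODE "/proc/self/sched_rtg_ctrl"

int main(void)
{
	/* key:value pairs understood by set_enable_config(); values are placeholders
	 * within the documented ranges (frame_max_util < 1024, 1 <= sched_cycle <= 16). */
	char cfg[] = "sched_cycle:4;frame_max_util:750;invalid_interval:50;enable_type:1";
	struct rtg_enable_data req = {
		.enable = 1,               /* 1 calls rtg_enable(), anything else rtg_disable() */
		.len = (int)strlen(cfg),   /* must stay below MAX_DATA_LEN (256) */
		.data = cfg,
	};
	int fd = open(RTG_CTRL_NODE, O_RDWR);

	if (fd < 0) {
		perror("open rtg ctrl node");
		return 1;
	}
	if (ioctl(fd, CMD_ID_SET_ENABLE, &req) < 0)
		perror("CMD_ID_SET_ENABLE");
	close(fd);
	return 0;
}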
RTG_CTRL_MAX_NR, }; +/* proc_state */ +enum grp_ctrl_cmd { + CMD_CREATE_RTG_GRP, + CMD_ADD_RTG_THREAD, + CMD_REMOVE_RTG_THREAD, + CMD_CLEAR_RTG_GRP, + CMD_DESTROY_RTG_GRP +}; + struct rtg_enable_data { int enable; int len; diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index a8b1691074ad..4d5c909aa788 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -18,13 +18,16 @@ atomic_t g_rtg_enable = ATOMIC_INIT(0); atomic_t g_enable_type = ATOMIC_INIT(ALL_ENABLE); // default: all enable static atomic_t g_rt_frame_num = ATOMIC_INIT(0); static int g_frame_max_util = DEFAULT_MAX_UTIL; +static int g_max_rt_frames = DEFAULT_MAX_RT_FRAME; typedef long (*rtg_ctrl_func)(int abi, void __user *arg); static long ctrl_set_enable(int abi, void __user *uarg); +static long ctrl_set_rtg(int abi, void __user *uarg); static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ ctrl_set_enable, // 1 + ctrl_set_rtg, }; static int init_proc_state(const int *config, int len); @@ -127,6 +130,70 @@ static void rtg_disable(void) deinit_proc_state(); } +static inline bool is_rt_type(int type) +{ + return (type >= VIP && type < NORMAL_TASK); +} + +static int do_update_rt_frame_num(struct frame_info *frame_info, int new_type) +{ + int old_type; + int ret = SUCC; + + read_lock(&frame_info->lock); + old_type = frame_info->prio - DEFAULT_RT_PRIO; + if (is_rt_type(new_type) == is_rt_type(old_type)) + goto out; + + if (is_rt_type(old_type)) { + if (atomic_read(&g_rt_frame_num) > 0) + atomic_dec(&g_rt_frame_num); + } else if (is_rt_type(new_type)) { + if (atomic_read(&g_rt_frame_num) < g_max_rt_frames) { + atomic_inc(&g_rt_frame_num); + } else { + pr_err("[SCHED_RTG]: %s g_max_rt_frames is %d\n", + __func__, g_max_rt_frames); + ret = -INVALID_ARG; + } + } +out: + read_unlock(&frame_info->lock); + + return ret; +} + +static int update_rt_frame_num(struct frame_info *frame_info, int new_type, int cmd) +{ + int ret = SUCC; + + switch (cmd) { + case UPDATE_RTG_FRAME: + ret = do_update_rt_frame_num(frame_info, new_type); + break; + case ADD_RTG_FRAME: + if (is_rt_type(new_type)) { + if (atomic_read(&g_rt_frame_num) >= g_max_rt_frames) { + pr_err("[SCHED_RTG] g_max_rt_frames is %d!\n", g_max_rt_frames); + ret = -INVALID_ARG; + } else { + atomic_inc(&g_rt_frame_num); + } + } + break; + case CLEAR_RTG_FRAME: + if ((atomic_read(&g_rt_frame_num) > 0) && is_rt_type(new_type)) + atomic_dec(&g_rt_frame_num); + break; + default: + return -INVALID_ARG; + } + trace_rtg_frame_sched(frame_info->rtg->id, "g_rt_frame_num", atomic_read(&g_rt_frame_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "g_max_rt_frames", g_max_rt_frames); + + return ret; +} + static long ctrl_set_enable(int abi, void __user *uarg) { struct rtg_enable_data rs_enable; @@ -163,6 +230,186 @@ static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) } } +static void copy_proc_from_rsdata(struct rtg_proc_data *proc_info, + const struct rtg_grp_data *rs_data) +{ + memset(proc_info, 0, sizeof(struct rtg_proc_data)); + proc_info->type = VIP; + proc_info->rtcnt = DEFAULT_MAX_RT_THREAD; + if ((rs_data->grp_type > 0) && (rs_data->grp_type < RTG_TYPE_MAX)) + proc_info->type = rs_data->grp_type; + if ((rs_data->rt_cnt > 0) && (rs_data->rt_cnt < DEFAULT_MAX_RT_THREAD)) + proc_info->rtcnt = rs_data->rt_cnt; +} + +static void init_frame_thread_info(struct frame_thread_info *frame_thread_info, + const struct rtg_proc_data *proc_info) +{ + int i; + int type = proc_info->type; + + frame_thread_info->prio = 
(type == NORMAL_TASK ? NOT_RT_PRIO : (type + DEFAULT_RT_PRIO)); + for (i = 0; i < MAX_TID_NUM; i++) + frame_thread_info->thread[i] = proc_info->thread[i]; + frame_thread_info->thread_num = MAX_TID_NUM; +} + +static int parse_create_rtg_grp(const struct rtg_grp_data *rs_data) +{ + struct rtg_proc_data proc_info; + struct frame_info *frame_info; + struct frame_thread_info frame_thread_info; + + copy_proc_from_rsdata(&proc_info, rs_data); + proc_info.rtgid = alloc_multi_frame_info(); + frame_info = rtg_frame_info(proc_info.rtgid); + if (!frame_info) { + pr_err("[SCHED_RTG] no free multi frame.\n"); + return -NO_FREE_MULTI_FRAME; + } + atomic_set(&frame_info->max_rt_thread_num, proc_info.rtcnt); + if (update_rt_frame_num(frame_info, rs_data->grp_type, ADD_RTG_FRAME)) { + release_multi_frame_info(proc_info.rtgid); + return -NO_RT_FRAME; + } + init_frame_thread_info(&frame_thread_info, &proc_info); + update_frame_thread_info(frame_info, &frame_thread_info); + atomic_set(&frame_info->frame_sched_state, 1); + pr_info("[SCHED_RTG] %s rtgid=%d, type=%d, prio=%d, threadnum=%d\n", + __func__, proc_info.rtgid, rs_data->grp_type, + frame_thread_info.prio, frame_thread_info.thread_num); + + return proc_info.rtgid; +} + +static int parse_add_rtg_thread(const struct rtg_grp_data *rs_data) +{ + struct rtg_proc_data proc_info; + struct frame_info *frame_info; + int add_index; + int add_num; + int prio; + int fail_num = 0; + int i; + + if ((rs_data->grp_id <= 0) || (rs_data->grp_id >= MAX_NUM_CGROUP_COLOC_ID)) + return -INVALID_ARG; + copy_proc_from_rsdata(&proc_info, rs_data); + frame_info = lookup_frame_info_by_grp_id(rs_data->grp_id); + if (!frame_info) { + pr_err("[SCHED_RTG] grp not created yet.\n"); + return -INVALID_ARG; + } + write_lock(&frame_info->lock); + add_num = rs_data->tid_num; + if ((frame_info->thread_num < 0) || (add_num < 0)) { + pr_err("[SCHED_RTG] Unexception err: frame_info num < 0.\n"); + write_unlock(&frame_info->lock); + return -INVALID_RTG_ID; + } + if (frame_info->thread_num + add_num > MAX_TID_NUM) { + pr_err("[SCHED_RTG] frame info thread up to max already.\n"); + write_unlock(&frame_info->lock); + return -INVALID_RTG_ID; + } + add_index = frame_info->thread_num; + prio = frame_info->prio; + for (i = 0; i < add_num; i++) { + frame_info->thread[add_index] = update_frame_thread(frame_info, prio, prio, + rs_data->tids[i], + frame_info->thread[add_index]); + if (frame_info->thread[add_index]) { + frame_info->thread_num++; + add_index = frame_info->thread_num; + } else { + fail_num++; + } + } + write_unlock(&frame_info->lock); + + return fail_num; +} + +static int parse_remove_thread(const struct rtg_grp_data *rs_data) +{ + pr_err("[SCHED_RTG] frame rtg not support remove single yet.\n"); + + return -INVALID_ARG; +} + +static int do_clear_or_destroy_grp(const struct rtg_grp_data *rs_data, bool destroy) +{ + struct frame_info *frame_info; + int type; + int id = rs_data->grp_id; + + if (!is_frame_rtg(id)) { + pr_err("[SCHED_RTG] Failed to destroy rtg group %d!\n", id); + return -INVALID_ARG; + } + + frame_info = rtg_frame_info(id); + if (!frame_info) { + pr_err("[SCHED_RTG] Failed to destroy rtg group %d: grp not exist.\n", id); + return -INVALID_ARG; + } + + type = frame_info->prio - DEFAULT_RT_PRIO; + if (destroy) { + clear_rtg_frame_thread(frame_info, true); + release_multi_frame_info(id); + update_rt_frame_num(frame_info, type, CLEAR_RTG_FRAME); + } else { + clear_rtg_frame_thread(frame_info, false); + } + pr_info("[SCHED_RTG] %s clear frame(id=%d)\n", __func__, id); + + return 
SUCC; +} + +static int parse_clear_grp(const struct rtg_grp_data *rs_data) +{ + return do_clear_or_destroy_grp(rs_data, false); +} + +static int parse_destroy_grp(const struct rtg_grp_data *rs_data) +{ + return do_clear_or_destroy_grp(rs_data, true); +} + +long ctrl_set_rtg(int abi, void __user *uarg) +{ + struct rtg_grp_data rs_data; + long ret; + + if (copy_from_user(&rs_data, uarg, sizeof(rs_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG copy data failed\n"); + return -INVALID_ARG; + } + + switch (rs_data.rtg_cmd) { + case CMD_CREATE_RTG_GRP: + ret = parse_create_rtg_grp(&rs_data); + break; + case CMD_ADD_RTG_THREAD: + ret = parse_add_rtg_thread(&rs_data); + break; + case CMD_REMOVE_RTG_THREAD: + ret = parse_remove_thread(&rs_data); + break; + case CMD_CLEAR_RTG_GRP: + ret = parse_clear_grp(&rs_data); + break; + case CMD_DESTROY_RTG_GRP: + ret = parse_destroy_grp(&rs_data); + break; + default: + return -INVALID_ARG; + } + + return ret; +} + static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsigned long arg) { void __user *uarg = (void __user *)(uintptr_t)arg; diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h index b2c0101ca419..3860a0c61669 100644 --- a/kernel/sched/rtg/rtg_ctrl.h +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -49,9 +49,30 @@ enum rtg_err_no { INVALID_ARG, INVALID_MAGIC, INVALID_CMD, + FRAME_ERR_PID = 100, + NO_FREE_MULTI_FRAME, + NOT_MULTI_FRAME, + INVALID_RTG_ID, + NO_RT_FRAME, }; enum enable_type { ALL_ENABLE = 1, ENABLE_MAX }; + +struct rtg_grp_data { + int rtg_cmd; + int grp_id; + int grp_type; + int rt_cnt; + int tid_num; + int tids[MAX_TID_NUM]; +}; + +struct rtg_proc_data { + int rtgid; + int type; + int thread[MAX_TID_NUM]; + int rtcnt; +}; #endif -- Gitee From bd993fe8756fece632d7abd7973b188abd8f38a1 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 22:54:15 +0800 Subject: [PATCH 063/113] sched: Add frame RTG attribute control interface ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Set some attribute for frame RTG Signed-off-by: Dai Li --- include/linux/sched/rtg_ctrl.h | 12 ++ kernel/sched/rtg/rtg_ctrl.c | 213 +++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h index 5414b861f59c..a1583e765234 100644 --- a/include/linux/sched/rtg_ctrl.h +++ b/include/linux/sched/rtg_ctrl.h @@ -20,6 +20,10 @@ _IOWR(RTG_SCHED_IPC_MAGIC, SET_ENABLE, struct rtg_enable_data) #define CMD_ID_SET_RTG \ _IOWR(RTG_SCHED_IPC_MAGIC, SET_RTG, struct rtg_str_data) +#define CMD_ID_SET_CONFIG \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_CONFIG, struct rtg_str_data) +#define CMD_ID_SET_RTG_ATTR \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_RTG_ATTR, struct rtg_str_data) enum ioctl_abi_format { IOCTL_ABI_ARM32, @@ -29,6 +33,8 @@ enum ioctl_abi_format { enum rtg_sched_cmdid { SET_ENABLE = 1, SET_RTG, + SET_CONFIG, + SET_RTG_ATTR, RTG_CTRL_MAX_NR, }; @@ -46,4 +52,10 @@ struct rtg_enable_data { int len; char *data; }; + +struct rtg_str_data { + int type; + int len; + char *data; +}; #endif diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index 4d5c909aa788..75494c4a2c7a 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -23,11 +23,15 @@ typedef long (*rtg_ctrl_func)(int abi, void __user *arg); static long ctrl_set_enable(int abi, void __user *uarg); static long ctrl_set_rtg(int abi, void __user *uarg); +static long ctrl_set_config(int abi, void __user *uarg); +static long 
ctrl_set_rtg_attr(int abi, void __user *uarg); static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ ctrl_set_enable, // 1 ctrl_set_rtg, + ctrl_set_config, + ctrl_set_rtg_attr, }; static int init_proc_state(const int *config, int len); @@ -88,6 +92,8 @@ static void rtg_enable(int abi, const struct rtg_enable_data *data) return; } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpointer-to-int-cast" switch (abi) { case IOCTL_ABI_ARM32: ret = copy_from_user(&temp, @@ -104,6 +110,7 @@ static void rtg_enable(int abi, const struct rtg_enable_data *data) pr_err("[SCHED_RTG] %s copy user data failed\n", __func__); return; } +#pragma GCC diagnostic pop #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wincompatible-pointer-types" @@ -210,6 +217,212 @@ static long ctrl_set_enable(int abi, void __user *uarg) return SUCC; } +static int parse_config(const struct rtg_str_data *rs_data) +{ + int len; + char *p = NULL; + char *tmp = NULL; + char *data = NULL; + int value; + + if (rs_data == NULL) + return -INVALID_ARG; + data = rs_data->data; + len = rs_data->len; + if ((data == NULL) || (strlen(data) != len)) //lint !e737 + return -INVALID_ARG; + /* + * eg: rtframe:4; + */ + for (p = strsep(&data, ";"); p != NULL; p = strsep(&data, ";")) { + tmp = strsep(&p, ":"); + if ((tmp == NULL) || (p == NULL)) + continue; + if (kstrtoint((const char *)p, DECIMAL, &value)) + return -INVALID_ARG; + if (!strcmp(tmp, "rtframe")) { + if (value > 0 && value <= MULTI_FRAME_NUM) { + g_max_rt_frames = value; + } else { + pr_err("[SCHED_RTG]%s invalid max_rt_frame:%d, MULTI_FRAME_NUM=%d\n", + __func__, value, MULTI_FRAME_NUM); + return -INVALID_ARG; + } + } + } + + return SUCC; +} + +static long ctrl_set_config(int abi, void __user *uarg) +{ + struct rtg_str_data rs; + char temp[MAX_DATA_LEN]; + long ret = SUCC; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&rs, uarg, sizeof(rs))) { + pr_err("[SCHED_RTG] CMD_ID_SET_CONFIG copy data failed\n"); + return -INVALID_ARG; + } + if ((rs.len <= 0) || (rs.len >= MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] CMD_ID_SET_CONFIG data len invalid\n"); + return -INVALID_ARG; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpointer-to-int-cast" + switch (abi) { + case IOCTL_ABI_ARM32: + ret = copy_from_user(&temp, + (void __user *)compat_ptr((compat_uptr_t)rs.data), rs.len); + break; + case IOCTL_ABI_AARCH64: + ret = copy_from_user(&temp, (void __user *)rs.data, rs.len); + break; + default: + pr_err("[SCHED_RTG] abi format error\n"); + return -INVALID_ARG; + } + if (ret) { + pr_err("[SCHED_RTG] CMD_ID_SET_CONFIG copy rs.data failed\n"); + return -INVALID_ARG; + } +#pragma GCC diagnostic pop + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + temp[rs.len] = '\0'; + rs.data = &temp; +#pragma GCC diagnostic pop + + return parse_config(&rs); +} + +static inline bool is_valid_type(int type) +{ + return (type >= VIP && type < RTG_TYPE_MAX); +} + +static int parse_rtg_attr(const struct rtg_str_data *rs_data) +{ + char *p = NULL; + char *tmp = NULL; + char *data = NULL; + int value; + struct frame_info *frame_info = NULL; + int rate = -1; + int type = -1; + + if (rs_data == NULL) { + pr_err("[SCHED_RTG] rtg attr: rs_data is null!\n"); + return -INVALID_ARG; + } + + data = rs_data->data; + if ((data == NULL) || (rs_data->len <= 0) || + (rs_data->len > MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] rtg attr: rs_data len err!\n"); + return -INVALID_ARG; + } + + // eg: 
rtgId:xx;rate:xx;type:xx; + for (p = strsep(&data, ";"); p != NULL; p = strsep(&data, ";")) { + tmp = strsep(&p, ":"); + if ((tmp == NULL) || (p == NULL)) + continue; + if (kstrtoint((const char *)p, DECIMAL, &value)) { + pr_err("[SCHED_RTG] rtg attr: rs_data format err!\n"); + return -INVALID_ARG; + } + if (!strcmp(tmp, "rtgId")) { + frame_info = rtg_frame_info(value); + } else if (!strcmp(tmp, "rate")) { + rate = value; + } else if (!strcmp(tmp, "type")) { + if (is_valid_type(value)) { + type = value; + } else { + pr_err("[SCHED_RTG] invalid type : %d\n", value); + return -INVALID_ARG; + } + } else { + pr_err("[SCHED_RTG] parse rtg attr failed!\n"); + return -INVALID_ARG; + } + } + + if (!frame_info) { + pr_err("[SCHED_RTG] rtg attr: invalid args!\n"); + return -INVALID_ARG; + } + + if (rate > 0) + set_frame_rate(frame_info, rate); + + if (is_valid_type(type)) { + if (update_rt_frame_num(frame_info, type, UPDATE_RTG_FRAME)) { + pr_err("[SCHED_RTG] set rtg attr failed!\n"); + return -INVALID_ARG; + } + + set_frame_prio(frame_info, (type == NORMAL_TASK ? + NOT_RT_PRIO : (type + DEFAULT_RT_PRIO))); + } + + return SUCC; +} + +static long ctrl_set_rtg_attr(int abi, void __user *uarg) +{ + struct rtg_str_data rs; + char temp[MAX_DATA_LEN]; + int ret; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&rs, uarg, sizeof(rs))) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR copy data failed\n"); + return -INVALID_ARG; + } + if ((rs.len <= 0) || (rs.len >= MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR data len invalid\n"); + return -INVALID_ARG; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpointer-to-int-cast" + switch (abi) { + case IOCTL_ABI_ARM32: + ret = copy_from_user(&temp, + (void __user *)compat_ptr((compat_uptr_t)rs.data), rs.len); + break; + case IOCTL_ABI_AARCH64: + ret = copy_from_user(&temp, (void __user *)rs.data, rs.len); + break; + default: + pr_err("[SCHED_RTG] abi format error\n"); + return -INVALID_ARG; + } +#pragma GCC diagnostic pop + + if (ret) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR copy rs.data failed with ret %d\n", ret); + return -INVALID_ARG; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + temp[rs.len] = '\0'; + rs.data = &temp; +#pragma GCC diagnostic pop + + return parse_rtg_attr(&rs); +} + static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) { struct frame_thread_info frame_thread_info; -- Gitee From f91530dfefba64820c73959c2f07a68606054179 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 22:59:20 +0800 Subject: [PATCH 064/113] sched: Add frame process control for frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Set frame begin, end and frame margin during frame-drawing Signed-off-by: Dai Li --- include/linux/sched/rtg_ctrl.h | 15 +++++ kernel/sched/rtg/rtg_ctrl.c | 114 +++++++++++++++++++++++++++++++++ kernel/sched/rtg/rtg_ctrl.h | 12 ++++ 3 files changed, 141 insertions(+) diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h index a1583e765234..60dcef7e3f73 100644 --- a/include/linux/sched/rtg_ctrl.h +++ b/include/linux/sched/rtg_ctrl.h @@ -24,6 +24,13 @@ _IOWR(RTG_SCHED_IPC_MAGIC, SET_CONFIG, struct rtg_str_data) #define CMD_ID_SET_RTG_ATTR \ _IOWR(RTG_SCHED_IPC_MAGIC, SET_RTG_ATTR, struct rtg_str_data) +#define CMD_ID_BEGIN_FRAME_FREQ \ + _IOWR(RTG_SCHED_IPC_MAGIC, BEGIN_FRAME_FREQ, struct proc_state_data) +#define CMD_ID_END_FRAME_FREQ \ 
+ _IOWR(RTG_SCHED_IPC_MAGIC, END_FRAME_FREQ, struct proc_state_data) +#define CMD_ID_END_SCENE \ + _IOWR(RTG_SCHED_IPC_MAGIC, END_SCENE, struct proc_state_data) +#define CMD_ID_SET_MIN_UTIL \ enum ioctl_abi_format { IOCTL_ABI_ARM32, @@ -35,6 +42,9 @@ enum rtg_sched_cmdid { SET_RTG, SET_CONFIG, SET_RTG_ATTR, + BEGIN_FRAME_FREQ = 5, + END_FRAME_FREQ, + END_SCENE, RTG_CTRL_MAX_NR, }; @@ -58,4 +68,9 @@ struct rtg_str_data { int len; char *data; }; + +struct proc_state_data { + int grp_id; + int state_param; +}; #endif diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index 75494c4a2c7a..869d7d1c2e44 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -25,6 +25,9 @@ static long ctrl_set_enable(int abi, void __user *uarg); static long ctrl_set_rtg(int abi, void __user *uarg); static long ctrl_set_config(int abi, void __user *uarg); static long ctrl_set_rtg_attr(int abi, void __user *uarg); +static long ctrl_begin_frame(int abi, void __user *uarg); +static long ctrl_end_frame(int abi, void __user *uarg); +static long ctrl_end_scene(int abi, void __user *uarg); static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ @@ -32,6 +35,9 @@ static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { ctrl_set_rtg, ctrl_set_config, ctrl_set_rtg_attr, + ctrl_begin_frame, // 5 + ctrl_end_frame, + ctrl_end_scene, }; static int init_proc_state(const int *config, int len); @@ -423,6 +429,114 @@ static long ctrl_set_rtg_attr(int abi, void __user *uarg) return parse_rtg_attr(&rs); } +static void start_frame_freq(struct frame_info *frame_info) +{ + if (!frame_info) + return; + + if (atomic_read(&frame_info->start_frame_freq) == 0) { + atomic_set(&frame_info->start_frame_freq, 1); + set_frame_sched_state(frame_info, true); + } +} + +static void set_frame(struct frame_info *frame_info, int margin) +{ + if (!frame_info) + return; + + atomic_set(&frame_info->frame_state, FRAME_DRAWING); + if (set_frame_margin(frame_info, margin) == SUCC) + set_frame_timestamp(frame_info, FRAME_START); +} + +static void reset_frame(struct frame_info *frame_info) +{ + if (!frame_info) + return; + + if (atomic_read(&frame_info->frame_state) == FRAME_END_STATE) { + pr_debug("[SCHED_RTG]: Frame state is already reset\n"); + return; + } + + atomic_set(&frame_info->frame_state, FRAME_END_STATE); + set_frame_timestamp(frame_info, FRAME_END); +} + +int update_frame_state(int grp_id, int margin, bool in_frame) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(grp_id); + if (!frame_info || !frame_info->rtg) + return -INVALID_RTG_ID; + + if (in_frame) { + start_frame_freq(frame_info); + set_frame(frame_info, margin); + trace_rtg_frame_sched(grp_id, "margin", margin); + } else { + reset_frame(frame_info); + } + + return SUCC; +} + +static long ctrl_frame_state(void __user *uarg, bool is_enter) +{ + struct proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_FRAME_FREQ copy data failed\n"); + return -INVALID_ARG; + } + + return update_frame_state(state_data.grp_id, state_data.state_param, is_enter); +} + +static long ctrl_begin_frame(int abi, void __user *uarg) +{ + return ctrl_frame_state(uarg, true); +} + +static long ctrl_end_frame(int abi, void __user *uarg) +{ + return ctrl_frame_state(uarg, false); +} + +static int stop_frame_freq(int gid) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(gid); 
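For reference, the per-frame commands added by this patch are meant to be driven once per drawn frame: CMD_ID_BEGIN_FRAME_FREQ marks FRAME_DRAWING and the FRAME_START timestamp with state_param used as the margin, CMD_ID_END_FRAME_FREQ resets to FRAME_END_STATE, and CMD_ID_END_SCENE stops the frame frequency boost. A minimal sketch, assuming the header is visible to userspace, a hypothetical proc node path, and placeholder values for the group id (normally the id returned by CMD_CREATE_RTG_GRP) and the margin:

/* Illustrative sketch only, not part of the patch above. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sched/rtg_ctrl.h>   /* assumed to be exported to userspace */

#define RTG_CTRL_NODE "/proc/self/sched_rtg_ctrl"   /* hypothetical node name */

static void draw_one_frame(int fd, int grp_id)
{
	struct proc_state_data begin = { .grp_id = grp_id, .state_param = 16 }; /* placeholder margin */
	struct proc_state_data end   = { .grp_id = grp_id, .state_param = 0 };

	ioctl(fd, CMD_ID_BEGIN_FRAME_FREQ, &begin); /* FRAME_DRAWING + FRAME_START timestamp */
	/* ... render the frame ... */
	ioctl(fd, CMD_ID_END_FRAME_FREQ, &end);     /* FRAME_END_STATE + FRAME_END timestamp */
}

int main(void)
{
	int fd = open(RTG_CTRL_NODE, O_RDWR);
	int grp_id = 10;   /* placeholder; use the id CMD_CREATE_RTG_GRP returned */

	if (fd < 0) {
		perror("open rtg ctrl node");
		return 1;
	}
	draw_one_frame(fd, grp_id);
	ioctl(fd, CMD_ID_END_SCENE, &grp_id);   /* ctrl_end_scene() reads a single int group id */
	close(fd);
	return 0;
}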
+ if (!frame_info) + return -INVALID_RTG_ID; + + atomic_set(&frame_info->start_frame_freq, 0); + set_frame_sched_state(frame_info, false); + + return 0; +} + +static long ctrl_end_scene(int abi, void __user *uarg) +{ + int rtg_id; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&rtg_id, uarg, sizeof(int))) { + pr_err("[SCHED_RTG] CMD_ID_END_SCENE copy data failed\n"); + return -INVALID_ARG; + } + + return stop_frame_freq(rtg_id); +} + static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) { struct frame_thread_info frame_thread_info; diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h index 3860a0c61669..df8ac420d156 100644 --- a/kernel/sched/rtg/rtg_ctrl.h +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -36,6 +36,18 @@ #define MAX_FREQ_CYCLE 16 #define DEFAULT_INVALID_INTERVAL 50 +/* proc_state */ +enum proc_state { + STATE_MIN = 0, + FRAME_DRAWING, + FRAME_RME_MAX = 19, + /* rme end */ + FRAME_END_STATE = FRAME_RME_MAX + 1, + + FRAME_CLICK = 100, + STATE_MAX, +}; + enum rtg_config { RTG_FREQ_CYCLE, RTG_FRAME_MAX_UTIL, -- Gitee From 20a2886840a42a1954339f2d0c892fbe6e2e0837 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 23:03:11 +0800 Subject: [PATCH 065/113] sched: Add margin and util set for frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- Set margin or min_util for frame RTG Signed-off-by: Dai Li --- include/linux/sched/rtg_ctrl.h | 5 +++ kernel/sched/rtg/rtg_ctrl.c | 60 ++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h index 60dcef7e3f73..6a7ec9a3e850 100644 --- a/include/linux/sched/rtg_ctrl.h +++ b/include/linux/sched/rtg_ctrl.h @@ -31,6 +31,9 @@ #define CMD_ID_END_SCENE \ _IOWR(RTG_SCHED_IPC_MAGIC, END_SCENE, struct proc_state_data) #define CMD_ID_SET_MIN_UTIL \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_MIN_UTIL, struct proc_state_data) +#define CMD_ID_SET_MARGIN \ + _IOWR(RTG_SCHED_IPC_MAGIC, SET_MARGIN, struct proc_state_data) enum ioctl_abi_format { IOCTL_ABI_ARM32, @@ -45,6 +48,8 @@ enum rtg_sched_cmdid { BEGIN_FRAME_FREQ = 5, END_FRAME_FREQ, END_SCENE, + SET_MIN_UTIL, + SET_MARGIN, RTG_CTRL_MAX_NR, }; diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index 869d7d1c2e44..e93b37fb8d87 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -28,6 +28,8 @@ static long ctrl_set_rtg_attr(int abi, void __user *uarg); static long ctrl_begin_frame(int abi, void __user *uarg); static long ctrl_end_frame(int abi, void __user *uarg); static long ctrl_end_scene(int abi, void __user *uarg); +static long ctrl_set_min_util(int abi, void __user *uarg); +static long ctrl_set_margin(int abi, void __user *uarg); static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ @@ -38,6 +40,8 @@ static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { ctrl_begin_frame, // 5 ctrl_end_frame, ctrl_end_scene, + ctrl_set_min_util, + ctrl_set_margin, }; static int init_proc_state(const int *config, int len); @@ -537,6 +541,62 @@ static long ctrl_end_scene(int abi, void __user *uarg) return stop_frame_freq(rtg_id); } +static int set_min_util(int gid, int min_util) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(gid); + if (!frame_info) + return -FRAME_ERR_PID; + + set_frame_min_util(frame_info, min_util, false); + + return SUCC; +} + +static long ctrl_set_min_util(int abi, void __user *uarg) +{ + struct 
proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_MIN_UTIL copy data failed\n"); + return -INVALID_ARG; + } + + return set_min_util(state_data.grp_id, state_data.state_param); +} + +static int set_margin(int grp_id, int margin) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(grp_id); + if (!frame_info) + return -FRAME_ERR_PID; + + set_frame_margin(frame_info, margin); + + return SUCC; +} + +static long ctrl_set_margin(int abi, void __user *uarg) +{ + struct proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_MARGIN copy data failed\n"); + return -INVALID_ARG; + } + + return set_margin(state_data.grp_id, state_data.state_param); +} + static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) { struct frame_thread_info frame_thread_info; -- Gitee From 55f9a347f9e8503e762c9cda8503e4abff4a5766 Mon Sep 17 00:00:00 2001 From: Dai Li Date: Mon, 21 Feb 2022 23:06:09 +0800 Subject: [PATCH 066/113] sched: Add search interface for frame RTG ohos inclusion category: feature issue: #I4U089 CVE: NA ------------------------------------------- 1.List frame RTG or it's threads 2.Offer a thread id to search it's rtg id 3.Check if rtg is enabled Signed-off-by: Dai Li --- include/linux/sched/rtg_ctrl.h | 12 +++++ kernel/sched/rtg/frame_rtg.c | 50 +++++++++++++++++++ kernel/sched/rtg/frame_rtg.h | 7 +++ kernel/sched/rtg/rtg_ctrl.c | 89 ++++++++++++++++++++++++++++++++++ 4 files changed, 158 insertions(+) diff --git a/include/linux/sched/rtg_ctrl.h b/include/linux/sched/rtg_ctrl.h index 6a7ec9a3e850..0e346ff49fe4 100644 --- a/include/linux/sched/rtg_ctrl.h +++ b/include/linux/sched/rtg_ctrl.h @@ -34,6 +34,14 @@ _IOWR(RTG_SCHED_IPC_MAGIC, SET_MIN_UTIL, struct proc_state_data) #define CMD_ID_SET_MARGIN \ _IOWR(RTG_SCHED_IPC_MAGIC, SET_MARGIN, struct proc_state_data) +#define CMD_ID_LIST_RTG \ + _IOWR(RTG_SCHED_IPC_MAGIC, LIST_RTG, struct rtg_info) +#define CMD_ID_LIST_RTG_THREAD \ + _IOWR(RTG_SCHED_IPC_MAGIC, LIST_RTG_THREAD, struct rtg_grp_data) +#define CMD_ID_SEARCH_RTG \ + _IOWR(RTG_SCHED_IPC_MAGIC, SEARCH_RTG, struct proc_state_data) +#define CMD_ID_GET_ENABLE \ + _IOWR(RTG_SCHED_IPC_MAGIC, GET_ENABLE, struct rtg_enable_data) enum ioctl_abi_format { IOCTL_ABI_ARM32, @@ -50,6 +58,10 @@ enum rtg_sched_cmdid { END_SCENE, SET_MIN_UTIL, SET_MARGIN, + LIST_RTG = 10, + LIST_RTG_THREAD, + SEARCH_RTG, + GET_ENABLE, RTG_CTRL_MAX_NR, }; diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index 53e97b2a9c25..89561c84774e 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -311,6 +311,56 @@ static void do_update_frame_task_prio(struct frame_info *frame_info, } } +int list_rtg_group(struct rtg_info *rs_data) +{ + int i; + int num = 0; + + read_lock(&g_id_manager.lock); + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + if (test_bit(i - MULTI_FRAME_ID, g_id_manager.id_map)) { + rs_data->rtgs[num] = i; + num++; + } + } + read_unlock(&g_id_manager.lock); + rs_data->rtg_num = num; + + return num; +} + +int search_rtg(int pid) +{ + struct rtg_info grp_info; + struct frame_info *frame_info = NULL; + int i = 0; + int j = 0; + + grp_info.rtg_num = 0; + read_lock(&g_id_manager.lock); + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + if 
(test_bit(i - MULTI_FRAME_ID, g_id_manager.id_map)) { + grp_info.rtgs[grp_info.rtg_num] = i; + grp_info.rtg_num++; + } + } + read_unlock(&g_id_manager.lock); + for (i = 0; i < grp_info.rtg_num; i++) { + frame_info = lookup_frame_info_by_grp_id(grp_info.rtgs[i]); + if (!frame_info) { + pr_err("[FRAME_RTG] unexpected grp %d find error.", i); + return -EINVAL; + } + + for (j = 0; j < frame_info->thread_num; j++) { + if (frame_info->thread[j] && frame_info->thread[j]->pid == pid) + return grp_info.rtgs[i]; + } + } + + return 0; +} + static void update_frame_task_prio(struct frame_info *frame_info, int prio) { int i; diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h index 5aea6cdc9ee8..049bd8865249 100644 --- a/kernel/sched/rtg/frame_rtg.h +++ b/kernel/sched/rtg/frame_rtg.h @@ -69,6 +69,11 @@ struct multi_frame_id_manager { rwlock_t lock; }; +struct rtg_info { + int rtg_num; + int rtgs[MULTI_FRAME_NUM]; +}; + bool is_frame_rtg(int id); int set_frame_rate(struct frame_info *frame_info, int rate); int alloc_multi_frame_info(void); @@ -101,4 +106,6 @@ int set_frame_timestamp(struct frame_info *frame_info, unsigned long timestamp); int set_frame_max_util(struct frame_info *frame_info, int max_util); int set_frame_min_util(struct frame_info *frame_info, int min_util, bool is_boost); struct frame_info *lookup_frame_info_by_grp_id(int grp_id); +int list_rtg_group(struct rtg_info *rs_data); +int search_rtg(int pid); #endif diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index e93b37fb8d87..0e87dc7162c4 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -30,6 +30,10 @@ static long ctrl_end_frame(int abi, void __user *uarg); static long ctrl_end_scene(int abi, void __user *uarg); static long ctrl_set_min_util(int abi, void __user *uarg); static long ctrl_set_margin(int abi, void __user *uarg); +static long ctrl_list_rtg(int abi, void __user *uarg); +static long ctrl_list_rtg_thread(int abi, void __user *uarg); +static long ctrl_search_rtg(int abi, void __user *uarg); +static long ctrl_get_enable(int abi, void __user *uarg); static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { NULL, /* reserved */ @@ -42,11 +46,20 @@ static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { ctrl_end_scene, ctrl_set_min_util, ctrl_set_margin, + ctrl_list_rtg, // 10 + ctrl_list_rtg_thread, + ctrl_search_rtg, + ctrl_get_enable }; static int init_proc_state(const int *config, int len); static void deinit_proc_state(void); +int get_enable_type(void) +{ + return atomic_read(&g_enable_type); +} + static int set_enable_config(char *config_str) { char *p = NULL; @@ -227,6 +240,11 @@ static long ctrl_set_enable(int abi, void __user *uarg) return SUCC; } +static long ctrl_get_enable(int abi, void __user *uarg) +{ + return get_enable_type(); +} + static int parse_config(const struct rtg_str_data *rs_data) { int len; @@ -797,6 +815,77 @@ long ctrl_set_rtg(int abi, void __user *uarg) return ret; } +static long ctrl_list_rtg(int abi, void __user *uarg) +{ + struct rtg_info rs_data; + long ret; + + if (copy_from_user(&rs_data, uarg, sizeof(rs_data))) { + pr_err("[SCHED_RTG] CMD_ID_LIST_RTG copy data failed\n"); + return -INVALID_ARG; + } + ret = list_rtg_group(&rs_data); + if (copy_to_user(uarg, &rs_data, sizeof(rs_data))) { + pr_err("[SCHED_RTG]] CMD_ID_LIST_RTG send data failed\n"); + return -INVALID_ARG; + } + + return ret; +} + +static int list_rtg_thread(struct rtg_grp_data *rs_data) +{ + int num = 0; + int grp_id = rs_data->grp_id; + struct frame_info 
*frame_info = NULL; + int i; + + frame_info = lookup_frame_info_by_grp_id(grp_id); + if (!frame_info) { + pr_err("[SCHED_RTG] Look up for grp %d failed!\n", grp_id); + return -INVALID_ARG; + } + for (i = 0; i < frame_info->thread_num; i++) { + if (frame_info->thread[i]) { + rs_data->tids[num] = frame_info->thread[i]->pid; + num++; + } + } + rs_data->tid_num = num; + + return num; +} + +static long ctrl_list_rtg_thread(int abi, void __user *uarg) +{ + struct rtg_grp_data rs_data; + long ret; + + if (copy_from_user(&rs_data, uarg, sizeof(rs_data))) { + pr_err("[SCHED_RTG] CMD_ID_LIST_RTG_THREAD copy data failed\n"); + return -INVALID_ARG; + } + ret = list_rtg_thread(&rs_data); + if (copy_to_user(uarg, &rs_data, sizeof(rs_data))) { + pr_err("[SCHED_RTG]] CMD_ID_LIST_RTG_THREAD send data failed\n"); + return -INVALID_ARG; + } + + return ret; +} + +static long ctrl_search_rtg(int abi, void __user *uarg) +{ + struct proc_state_data search_data; + + if (copy_from_user(&search_data, uarg, sizeof(search_data))) { + pr_err("[SCHED_RTG] CMD_ID_SEARCH_RTG copy data failed\n"); + return -INVALID_ARG; + } + + return search_rtg(search_data.state_param); +} + static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsigned long arg) { void __user *uarg = (void __user *)(uintptr_t)arg; -- Gitee From 6d4e7dacb3693783dd4be6d8cadc9b9c8282ea0e Mon Sep 17 00:00:00 2001 From: CY Fan Date: Fri, 25 Feb 2022 11:50:13 +0800 Subject: [PATCH 067/113] hyperhold: fix panic when setting group to readonly ohos inclusion category: bugfix issue: #I4V8DK CVE: NA ----------------- Added the judgment of whether wbgrp is enabled. Signed-off-by: CY Fan --- drivers/block/zram/zram_group/group_writeback.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c index 9ccbadbf4020..abc8a5c4c663 100644 --- a/drivers/block/zram/zram_group/group_writeback.c +++ b/drivers/block/zram/zram_group/group_writeback.c @@ -457,6 +457,8 @@ u64 write_group_objs(struct zram *zram, u16 gid, u64 req_size) if (!CHECK(zram->zgrp, "zram group is not enable!\n")) return 0; + if (!CHECK(zram->zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) return 0; -- Gitee From 1da88e1abe6064106efd395e4b54138f7d52d244 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 15 Feb 2022 04:20:12 +0800 Subject: [PATCH 068/113] sched: optimization for Enery Aware Scheduling(EAS) ohos inclusion category: feature issue: #I4TNS2 CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- EAS scheduler optimization Allow running cfs task migration in clock interrupt Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Vikram Mulukutla Signed-off-by: Srinath Sridharan --- init/Kconfig | 6 ++ kernel/sched/core.c | 35 +++++++++++ kernel/sched/fair.c | 141 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 7 +++ 4 files changed, 188 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 1d248e9c5a89..db7449e779c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -860,6 +860,12 @@ config UCLAMP_BUCKETS_COUNT source "kernel/sched/rtg/Kconfig" +config SCHED_EAS + bool "EAS scheduler optimization" + default n + help + Check and migrate the CFS process to a more suitable CPU in the tick. 
+ endmenu # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8e506f6efc73..471b2129ea84 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4134,6 +4134,11 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); + +#ifdef CONFIG_SCHED_EAS + if (curr->sched_class->check_for_migration) + curr->sched_class->check_for_migration(rq, curr); +#endif #endif } @@ -7025,6 +7030,32 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, attach_tasks_core(&tasks, rq); } +#ifdef CONFIG_SCHED_EAS +static void clear_eas_migration_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_reserved(cpu); + if (rq->push_task) { + struct task_struct *push_task = NULL; + + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + push_task = rq->push_task; + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + if (push_task) + put_task_struct(push_task); + } +} +#else +static inline void clear_eas_migration_request(int cpu) {} +#endif + #ifdef CONFIG_CPU_ISOLATION_OPT int do_isolation_work_cpu_stop(void *data) { @@ -7058,6 +7089,7 @@ int do_isolation_work_cpu_stop(void *data) set_rq_online(rq); rq_unlock(rq, &rf); + clear_eas_migration_request(cpu); local_irq_enable(); return 0; } @@ -7425,6 +7457,7 @@ int sched_cpu_starting(unsigned int cpu) { sched_rq_cpu_starting(cpu); sched_tick_start(cpu); + clear_eas_migration_request(cpu); return 0; } @@ -7447,6 +7480,8 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); + clear_eas_migration_request(cpu); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b8d6c1dfc30..8cc83bb0016b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10110,9 +10110,13 @@ static int active_load_balance_cpu_stop(void *data) int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; + struct sched_domain *sd = NULL; struct task_struct *p = NULL; struct rq_flags rf; +#ifdef CONFIG_SCHED_EAS + struct task_struct *push_task; + int push_task_detached = 0; +#endif rq_lock_irq(busiest_rq, &rf); /* @@ -10139,6 +10143,32 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); +#ifdef CONFIG_SCHED_EAS + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; + if (push_task) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + .flags = 0, + .loop = 0, + }; + if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && + task_cpu(push_task) == busiest_cpu && + cpu_online(target_cpu)) { + update_rq_clock(busiest_rq); + detach_task(push_task, &env); + push_task_detached = 1; + } + goto out_unlock; + } +#endif + /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -10178,8 +10208,23 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + +#ifdef CONFIG_SCHED_EAS + push_task = busiest_rq->push_task; + if (push_task) + busiest_rq->push_task = NULL; +#endif rq_unlock(busiest_rq, &rf); +#ifdef CONFIG_SCHED_EAS + if (push_task) { + if (push_task_detached) + attach_one_task(target_rq, push_task); + + put_task_struct(push_task); + } +#endif + if (p) attach_one_task(target_rq, p); @@ -10979,6 +11024,97 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } +#ifdef CONFIG_SCHED_EAS +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + if (cpu_of(rq) == new_cpu) + return rc; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + return rc; +} + +DEFINE_RAW_SPINLOCK(migration_lock); +static void check_for_migration_fair(struct rq *rq, struct task_struct *p) +{ + int active_balance; + int new_cpu = -1; + int prev_cpu = task_cpu(p); + int ret; + +#ifdef CONFIG_SCHED_RTG + bool need_down_migrate = false; + struct cpumask *rtg_target = find_rtg_target(p); + + if (rtg_target && + (capacity_orig_of(prev_cpu) > + capacity_orig_of(cpumask_first(rtg_target)))) + need_down_migrate = true; +#endif + + if (rq->misfit_task_load) { + if (rq->curr->state != TASK_RUNNING || + rq->curr->nr_cpus_allowed == 1) + return; + + raw_spin_lock(&migration_lock); +#ifdef CONFIG_SCHED_RTG + if (rtg_target) { + new_cpu = find_rtg_cpu(p); + + if (new_cpu != -1 && need_down_migrate && + cpumask_test_cpu(new_cpu, rtg_target) && + idle_cpu(new_cpu)) + goto do_active_balance; + + if (new_cpu != -1 && + capacity_orig_of(new_cpu) > capacity_orig_of(prev_cpu)) + goto do_active_balance; + + goto out_unlock; + } +#endif + rcu_read_lock(); + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + rcu_read_unlock(); + + if (new_cpu == -1 || + capacity_orig_of(new_cpu) <= capacity_orig_of(prev_cpu)) + goto out_unlock; +#ifdef CONFIG_SCHED_RTG +do_active_balance: +#endif + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) { + mark_reserved(new_cpu); + raw_spin_unlock(&migration_lock); + ret = stop_one_cpu_nowait(prev_cpu, + active_load_balance_cpu_stop, rq, + &rq->active_balance_work); + if (!ret) + clear_reserved(new_cpu); + else + wake_up_if_idle(new_cpu); + return; + } +out_unlock: + raw_spin_unlock(&migration_lock); + } +} +#endif /* CONFIG_SCHED_EAS */ #endif /* CONFIG_SMP */ /* @@ -11530,6 +11666,9 @@ const struct sched_class fair_sched_class #ifdef CONFIG_SCHED_WALT .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, #endif +#ifdef CONFIG_SCHED_EAS + .check_for_migration = check_for_migration_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d79744dcc048..1a4f1806eb78 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1047,6 +1047,9 @@ struct rq { /* For active balancing */ int active_balance; int push_cpu; +#ifdef CONFIG_SCHED_EAS + struct task_struct *push_task; +#endif struct cpu_stop_work active_balance_work; /* CPU of this runqueue: */ @@ -1925,6 +1928,9 @@ struct sched_class { void (*fixup_walt_sched_stats)(struct rq *rq, struct 
task_struct *p, u16 updated_demand_scaled); #endif +#ifdef CONFIG_SCHED_EAS + void (*check_for_migration)(struct rq *rq, struct task_struct *p); +#endif } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -2768,6 +2774,7 @@ extern bool task_fits_max(struct task_struct *p, int cpu); extern unsigned long capacity_spare_without(int cpu, struct task_struct *p); extern int update_preferred_cluster(struct related_thread_group *grp, struct task_struct *p, u32 old_load, bool from_tick); +extern struct cpumask *find_rtg_target(struct task_struct *p); #endif #ifdef CONFIG_SCHED_WALT -- Gitee From 31cd27f8a963a56f4462bfd19c17fbd0f59e62f0 Mon Sep 17 00:00:00 2001 From: Hu Zhaodong Date: Tue, 15 Feb 2022 05:05:17 +0800 Subject: [PATCH 069/113] sched: RT active load balancing optimization ohos inclusion category: feature issue: #I4TNS2 CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- allow migrating running rt task in clock interrupt Signed-off-by: gaochao --- include/linux/sched/sysctl.h | 4 ++ init/Kconfig | 7 +++ kernel/sched/rt.c | 90 ++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 7 +++ kernel/sysctl.c | 9 ++++ 5 files changed, 117 insertions(+) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 210909cd4141..acec3b1fd469 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,10 @@ sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_RT_ACTIVE_LB +extern unsigned int sysctl_sched_enable_rt_active_lb; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/init/Kconfig b/init/Kconfig index db7449e779c6..57554d795040 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -866,6 +866,13 @@ config SCHED_EAS help Check and migrate the CFS process to a more suitable CPU in the tick. +config SCHED_RT_ACTIVE_LB + bool "RT Capacity Aware Misfit Task" + depends on SCHED_EAS + default n + help + Check and migrate the RT process to a more suitable CPU in the tick. 
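For reference, the sysctl_sched_enable_rt_active_lb switch that this patch adds to kern_table gates check_for_migration_rt() and defaults to 1. Because the entry sits in kern_table it appears under /proc/sys/kernel/; a minimal sketch of toggling it at runtime:

/* Illustrative sketch only, not part of the patch above. */
#include <stdio.h>

int main(void)
{
	const char *node = "/proc/sys/kernel/sched_enable_rt_active_lb";
	FILE *f = fopen(node, "w");

	if (!f) {
		perror(node);
		return 1;
	}
	/* Defaults to 1; writing 0 disables the tick-time RT misfit migration check. */
	fputs("0\n", f);
	fclose(f);
	return 0;
}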
+ endmenu # diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6c1475950441..7fd02d8d2b90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -17,6 +17,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +#ifdef CONFIG_SCHED_RT_ACTIVE_LB +unsigned int sysctl_sched_enable_rt_active_lb = 1; +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -2443,6 +2447,89 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } +#ifdef CONFIG_SCHED_RT_ACTIVE_LB +static int rt_active_load_balance_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *next_task = busiest_rq->rt_push_task; + struct rq *lowest_rq = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&busiest_rq->lock, flags); + busiest_rq->rt_active_balance = 0; + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, busiest_rq); + if (!lowest_rq) + goto out; + + if (capacity_orig_of(cpu_of(lowest_rq)) <= capacity_orig_of(task_cpu(next_task))) + goto unlock; + + deactivate_task(busiest_rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + + resched_curr(lowest_rq); +unlock: + double_unlock_balance(busiest_rq, lowest_rq); +out: + put_task_struct(next_task); + raw_spin_unlock_irqrestore(&busiest_rq->lock, flags); + + return 0; +} + +static void check_for_migration_rt(struct rq *rq, struct task_struct *p) +{ + bool need_actvie_lb = false; + bool misfit_task = false; + int cpu = task_cpu(p); + unsigned long cpu_orig_cap; +#ifdef CONFIG_SCHED_RTG + struct cpumask *rtg_target = NULL; +#endif + + if (!sysctl_sched_enable_rt_active_lb) + return; + + if (p->nr_cpus_allowed == 1) + return; + + cpu_orig_cap = capacity_orig_of(cpu); + /* cpu has max capacity, no need to do balance */ + if (cpu_orig_cap == rq->rd->max_cpu_capacity) + return; + +#ifdef CONFIG_SCHED_RTG + rtg_target = find_rtg_target(p); + if (rtg_target) + misfit_task = capacity_orig_of(cpumask_first(rtg_target)) > + cpu_orig_cap; + else + misfit_task = !rt_task_fits_capacity(p, cpu); +#else + misfit_task = !rt_task_fits_capacity(p, cpu); +#endif + + if (misfit_task) { + raw_spin_lock(&rq->lock); + if (!rq->active_balance && !rq->rt_active_balance) { + rq->rt_active_balance = 1; + rq->rt_push_task = p; + get_task_struct(p); + need_actvie_lb = true; + } + raw_spin_unlock(&rq->lock); + + if (need_actvie_lb) + stop_one_cpu_nowait(task_cpu(p), + rt_active_load_balance_cpu_stop, + rq, &rq->rt_active_balance_work); + } +} +#endif + static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2491,6 +2578,9 @@ const struct sched_class rt_sched_class #ifdef CONFIG_SCHED_WALT .fixup_walt_sched_stats = fixup_walt_sched_stats_common, #endif +#ifdef CONFIG_SCHED_RT_ACTIVE_LB + .check_for_migration = check_for_migration_rt, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a4f1806eb78..09ad491bed45 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1052,6 +1052,13 @@ struct rq { #endif struct cpu_stop_work active_balance_work; + /* For rt active balancing */ +#ifdef CONFIG_SCHED_RT_ACTIVE_LB + int rt_active_balance; + struct task_struct *rt_push_task; + struct cpu_stop_work rt_active_balance_work; +#endif + /* CPU of this runqueue: */ int cpu; int online; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 
f13b9e456f50..d5fef7aba276 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_RT_ACTIVE_LB + { + .procname = "sched_enable_rt_active_lb", + .data = &sysctl_sched_enable_rt_active_lb, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_WALT { .procname = "sched_use_walt_cpu_util", -- Gitee From 024de1def44970604dd5d2b365d9ce8e07fbe93a Mon Sep 17 00:00:00 2001 From: Hu Zhaodong Date: Tue, 15 Feb 2022 05:30:10 +0800 Subject: [PATCH 070/113] sched: RT capacity-aware scheduling optimization ohos inclusion category: feature issue: #I4TNS2 CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- RT task detects capacity during CPU selection Signed-off-by: gaochao --- include/linux/sched/sysctl.h | 3 + include/trace/events/eas_sched.h | 76 +++++++++++++ include/trace/events/sched.h | 4 + init/Kconfig | 7 ++ kernel/sched/fair.c | 8 ++ kernel/sched/rt.c | 177 +++++++++++++++++++++++++++++++ kernel/sched/sched.h | 17 +++ kernel/sched/topology.c | 14 +++ kernel/sysctl.c | 9 ++ 9 files changed, 315 insertions(+) create mode 100644 include/trace/events/eas_sched.h diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index acec3b1fd469..a08551ebd23d 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,9 @@ sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_RT_CAS +extern unsigned int sysctl_sched_enable_rt_cas; +#endif #ifdef CONFIG_SCHED_RT_ACTIVE_LB extern unsigned int sysctl_sched_enable_rt_active_lb; #endif diff --git a/include/trace/events/eas_sched.h b/include/trace/events/eas_sched.h new file mode 100644 index 000000000000..bd24c9ef5b6e --- /dev/null +++ b/include/trace/events/eas_sched.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_SCHED_RT_CAS +TRACE_EVENT(sched_find_cas_cpu_each, + + TP_PROTO(struct task_struct *task, int cpu, int target_cpu, + int isolated, int idle, unsigned long task_util, + unsigned long cpu_util, int cpu_cap), + + TP_ARGS(task, cpu, target_cpu, isolated, idle, task_util, cpu_util, cpu_cap), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, prio) + __field(int, cpu) + __field(int, target_cpu) + __field(int, isolated) + __field(unsigned long, idle) + __field(unsigned long, task_util) + __field(unsigned long, cpu_util) + __field(unsigned long, cpu_cap) + ), + + TP_fast_assign( + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + __entry->prio = task->prio; + __entry->cpu = cpu; + __entry->target_cpu = target_cpu; + __entry->isolated = isolated; + __entry->idle = idle; + __entry->task_util = task_util; + __entry->cpu_util = cpu_util; + __entry->cpu_cap = cpu_cap; + ), + + TP_printk("comm=%s pid=%d prio=%d cpu=%d target_cpu=%d isolated=%d idle=%d task_util=%lu cpu_util=%lu cpu_cap=%lu", + __entry->comm, __entry->pid, __entry->prio, + __entry->cpu, __entry->target_cpu, __entry->isolated, + __entry->idle, __entry->task_util, + __entry->cpu_util, __entry->cpu_cap) +); + +TRACE_EVENT(sched_find_cas_cpu, + + TP_PROTO(struct task_struct *task, struct cpumask *lowest_mask, + unsigned long tutil, int prev_cpu, int target_cpu), + + TP_ARGS(task, lowest_mask, tutil, prev_cpu, target_cpu), + + 
TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, prio) + __bitmask(lowest, num_possible_cpus()) + __field(unsigned long, tutil) + __field(int, prev_cpu) + __field(int, target_cpu) + ), + + TP_fast_assign( + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + __entry->prio = task->prio; + __assign_bitmask(lowest, cpumask_bits(lowest_mask), num_possible_cpus()); + __entry->tutil = tutil; + __entry->prev_cpu = prev_cpu; + __entry->target_cpu = target_cpu; + ), + + TP_printk("comm=%s pid=%d prio=%d lowest_mask=%s tutil=%lu perfer_idle=%u prev=%d target=%d ", + __entry->comm, __entry->pid, __entry->prio, + __get_bitmask(lowest), __entry->tutil, + __entry->prev_cpu, __entry->target_cpu) +); +#endif /* CONFIG_SCHED_RT_CAS */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 27b6ed3c9e58..dd5fff2bb1b2 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -10,6 +10,10 @@ #include #include +#ifdef CONFIG_SCHED_RT_CAS +#include "eas_sched.h" +#endif + /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ diff --git a/init/Kconfig b/init/Kconfig index 57554d795040..ded631516e22 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -866,6 +866,13 @@ config SCHED_EAS help Check and migrate the CFS process to a more suitable CPU in the tick. +config SCHED_RT_CAS + bool "rt-cas optimization" + depends on SCHED_EAS + default n + help + RT task detects capacity during CPU selection + config SCHED_RT_ACTIVE_LB bool "RT Capacity Aware Misfit Task" depends on SCHED_EAS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cc83bb0016b..2c0781ce163f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3953,14 +3953,22 @@ static inline unsigned long task_util_est(struct task_struct *p) } #ifdef CONFIG_UCLAMP_TASK +#ifdef CONFIG_SCHED_RT_CAS +unsigned long uclamp_task_util(struct task_struct *p) +#else static inline unsigned long uclamp_task_util(struct task_struct *p) +#endif { return clamp(task_util_est(p), uclamp_eff_value(p, UCLAMP_MIN), uclamp_eff_value(p, UCLAMP_MAX)); } #else +#ifdef CONFIG_SCHED_RT_CAS +unsigned long uclamp_task_util(struct task_struct *p) +#else static inline unsigned long uclamp_task_util(struct task_struct *p) +#endif { return task_util_est(p); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7fd02d8d2b90..d5c00fa02a9d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -17,6 +17,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +#ifdef CONFIG_SCHED_RT_CAS +unsigned int sysctl_sched_enable_rt_cas = 1; +#endif + #ifdef CONFIG_SCHED_RT_ACTIVE_LB unsigned int sysctl_sched_enable_rt_active_lb = 1; #endif @@ -1709,6 +1713,170 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; } +#ifdef CONFIG_SCHED_RT_CAS +static int find_cas_cpu(struct sched_domain *sd, + struct task_struct *task, struct cpumask *lowest_mask) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + struct sched_group *sg = NULL; + struct sched_group *sg_target = NULL; + struct sched_group *sg_backup = NULL; + struct cpumask search_cpu, backup_search_cpu; + int cpu = -1; + int target_cpu = -1; + unsigned long cpu_capacity; + unsigned long boosted_tutil = uclamp_task_util(task); + unsigned long target_capacity = ULONG_MAX; + unsigned long util; + unsigned long target_cpu_util = ULONG_MAX; + int prev_cpu = task_cpu(task); +#ifdef 
CONFIG_SCHED_RTG + struct cpumask *rtg_target = NULL; +#endif + bool boosted = uclamp_boosted(task); + + if (!sysctl_sched_enable_rt_cas) + return -1; + + rcu_read_lock(); + +#ifdef CONFIG_SCHED_RTG + rtg_target = find_rtg_target(task); +#endif + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0)); + if (!sd) { + rcu_read_unlock(); + return -1; + } + + sg = sd->groups; + do { + if (!cpumask_intersects(lowest_mask, sched_group_span(sg))) + continue; + + if (boosted) { + if (cpumask_test_cpu(rd->max_cap_orig_cpu, + sched_group_span(sg))) { + sg_target = sg; + break; + } + } + + cpu = group_first_cpu(sg); +#ifdef CONFIG_SCHED_RTG + /* honor the rtg tasks */ + if (rtg_target) { + if (cpumask_test_cpu(cpu, rtg_target)) { + sg_target = sg; + break; + } + + /* active LB or big_task favor cpus with more capacity */ + if (task->state == TASK_RUNNING || boosted) { + if (capacity_orig_of(cpu) > + capacity_orig_of(cpumask_any(rtg_target))) { + sg_target = sg; + break; + } + + sg_backup = sg; + continue; + } + } +#endif + /* + * 1. add margin to support task migration + * 2. if task_util is high then all cpus, make sure the + * sg_backup with the most powerful cpus is selected + */ + if (!rt_task_fits_capacity(task, cpu)) { + sg_backup = sg; + continue; + } + + /* support task boost */ + cpu_capacity = capacity_orig_of(cpu); + if (boosted_tutil > cpu_capacity) { + sg_backup = sg; + continue; + } + + /* sg_target: select the sg with smaller capacity */ + if (cpu_capacity < target_capacity) { + target_capacity = cpu_capacity; + sg_target = sg; + } + } while (sg = sg->next, sg != sd->groups); + + if (!sg_target) + sg_target = sg_backup; + + if (sg_target) { + cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target)); + cpumask_copy(&backup_search_cpu, lowest_mask); + cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu); + } else { + cpumask_copy(&search_cpu, lowest_mask); + cpumask_clear(&backup_search_cpu); + } + +retry: + cpu = cpumask_first(&search_cpu); + do { + trace_sched_find_cas_cpu_each(task, cpu, target_cpu, + cpu_isolated(cpu), + idle_cpu(cpu), boosted_tutil, cpu_util(cpu), + capacity_orig_of(cpu)); + + if (cpu_isolated(cpu)) + continue; + + if (!cpumask_test_cpu(cpu, task->cpus_ptr)) + continue; + + /* find best cpu with smallest max_capacity */ + if (target_cpu != -1 && + capacity_orig_of(cpu) > capacity_orig_of(target_cpu)) + continue; + + util = cpu_util(cpu); + + /* Find the least loaded CPU */ + if (util > target_cpu_util) + continue; + + /* + * If the preivous CPU has same load, keep it as + * target_cpu + */ + if (target_cpu_util == util && target_cpu == prev_cpu) + continue; + + /* + * If candidate CPU is the previous CPU, select it. + * If all above conditions are same, select the least + * cumulative window demand CPU. 
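+	 * Note: cumulative window demand is not actually evaluated below;
+	 * on an equal-utilization tie the previous CPU is kept only when
+	 * it is already the current target, otherwise the later-scanned
+	 * candidate replaces it.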
+ */ + target_cpu_util = util; + target_cpu = cpu; + } while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids); + + if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) { + goto done; + } else if (!cpumask_empty(&backup_search_cpu)) { + cpumask_copy(&search_cpu, &backup_search_cpu); + cpumask_clear(&backup_search_cpu); + goto retry; + } + +done: + trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu); + rcu_read_unlock(); + return target_cpu; +} +#endif + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) @@ -1718,6 +1886,9 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); int ret; +#ifdef CONFIG_SCHED_RT_CAS + int cas_cpu; +#endif /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1744,6 +1915,12 @@ static int find_lowest_rq(struct task_struct *task) if (!ret) return -1; /* No targets found */ +#ifdef CONFIG_SCHED_RT_CAS + cas_cpu = find_cas_cpu(sd, task, lowest_mask); + if (cas_cpu != -1) + return cas_cpu; +#endif + /* * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 09ad491bed45..e4c65d96185e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -87,6 +87,10 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_RT_CAS +extern unsigned long uclamp_task_util(struct task_struct *p); +#endif + #ifdef CONFIG_SCHED_WALT extern unsigned int sched_ravg_window; extern unsigned int walt_cpu_util_freq_divisor; @@ -893,6 +897,9 @@ struct root_domain { * CPUs of the rd. Protected by RCU. */ struct perf_domain __rcu *pd; +#ifdef CONFIG_SCHED_RT_CAS + int max_cap_orig_cpu; +#endif }; extern void init_defrootdomain(void); @@ -2582,6 +2589,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } +static inline bool uclamp_boosted(struct task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} + /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace performs @@ -2602,6 +2614,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return util; } +static inline bool uclamp_boosted(struct task_struct *p) +{ + return false; +} + static inline bool uclamp_is_used(void) { return false; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b30b62f0d683..9191e5daaa3c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -522,6 +522,10 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_cpudl; + +#ifdef CONFIG_SCHED_RT_CAS + rd->max_cap_orig_cpu = -1; +#endif return 0; free_cpudl: @@ -2121,9 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { +#ifdef CONFIG_SCHED_RT_CAS + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu); +#endif + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); +#ifdef CONFIG_SCHED_RT_CAS + if (max_cpu < 0 || arch_scale_cpu_capacity(i) > + arch_scale_cpu_capacity(max_cpu)) + WRITE_ONCE(d.rd->max_cap_orig_cpu, i); +#endif + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) WRITE_ONCE(d.rd->max_cpu_capacity, 
rq->cpu_capacity_orig); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5fef7aba276..e34d6937594c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_RT_CAS + { + .procname = "sched_enable_rt_cas", + .data = &sysctl_sched_enable_rt_cas, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_RT_ACTIVE_LB { .procname = "sched_enable_rt_active_lb", -- Gitee From ed993f9532771f8c83d6fca46aad5cfd27a9889b Mon Sep 17 00:00:00 2001 From: waterwin Date: Sat, 19 Feb 2022 03:34:55 +0000 Subject: [PATCH 071/113] hmdfs: Not support symlink in hmdfs ohos inclusion category: feature issue: #I4UDA7 CVE: NA ---------------------------------------------- delete hmdfs symlink Signed-off-by: waterwin --- fs/hmdfs/Kconfig | 8 - fs/hmdfs/Makefile | 2 - fs/hmdfs/comm/fault_inject.c | 134 ----------------- fs/hmdfs/comm/fault_inject.h | 88 ----------- fs/hmdfs/comm/socket_adapter.c | 4 - fs/hmdfs/hmdfs.h | 2 - fs/hmdfs/hmdfs_dentryfile.c | 36 +---- fs/hmdfs/hmdfs_merge_view.h | 2 - fs/hmdfs/hmdfs_server.c | 257 +++------------------------------ fs/hmdfs/hmdfs_trace.h | 6 - fs/hmdfs/inode_local.c | 171 +--------------------- fs/hmdfs/inode_merge.c | 145 ++----------------- fs/hmdfs/inode_remote.c | 20 ++- fs/hmdfs/main.c | 4 - fs/hmdfs/stash.c | 2 +- 15 files changed, 60 insertions(+), 821 deletions(-) delete mode 100644 fs/hmdfs/comm/fault_inject.c delete mode 100644 fs/hmdfs/comm/fault_inject.h diff --git a/fs/hmdfs/Kconfig b/fs/hmdfs/Kconfig index 379606a6f466..1bb5c2347630 100644 --- a/fs/hmdfs/Kconfig +++ b/fs/hmdfs/Kconfig @@ -38,11 +38,3 @@ config HMDFS_FS_DEBUG it works. If unsure, say N. - -config HMDFS_FS_FAULT_INJECT - bool "HMDFS fault inject" - depends on HMDFS_FS - help - HMDFS provide fault inject for test. - - If unsure, say N. diff --git a/fs/hmdfs/Makefile b/fs/hmdfs/Makefile index 6f38c843664e..48a64acc8331 100644 --- a/fs/hmdfs/Makefile +++ b/fs/hmdfs/Makefile @@ -12,5 +12,3 @@ hmdfs-y += comm/connection.o comm/socket_adapter.o comm/transport.o hmdfs-$(CONFIG_HMDFS_FS_ENCRYPTION) += comm/crypto.o hmdfs-$(CONFIG_HMDFS_FS_PERMISSION) += authority/authentication.o hmdfs-$(CONFIG_HMDFS_FS_PERMISSION) += authority/config.o - -hmdfs-$(CONFIG_FS_FAULT_INJECTION) += comm/fault_inject.o diff --git a/fs/hmdfs/comm/fault_inject.c b/fs/hmdfs/comm/fault_inject.c deleted file mode 100644 index 11779b53b0ea..000000000000 --- a/fs/hmdfs/comm/fault_inject.c +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * fs/hmdfs/comm/fault_inject.c - * - * Copyright (c) 2020-2021 Huawei Device Co., Ltd. 
- */ - -#include "hmdfs.h" -#include "fault_inject.h" -#include "connection.h" - -static DECLARE_FAULT_ATTR(fail_default_attr); -static struct dentry *hmdfs_debugfs_root; - -void __init hmdfs_create_debugfs_root(void) -{ - hmdfs_debugfs_root = debugfs_create_dir("hmdfs", NULL); - if (!hmdfs_debugfs_root) - hmdfs_warning("failed to create debugfs directory"); -} - -void hmdfs_destroy_debugfs_root(void) -{ - debugfs_remove_recursive(hmdfs_debugfs_root); - hmdfs_debugfs_root = NULL; -} - -void hmdfs_fault_inject_init(struct hmdfs_fault_inject *fault_inject, - const char *name) -{ - struct dentry *dir = NULL; - struct dentry *parent = NULL; - struct fault_attr *attr = &fault_inject->attr; - - if (!hmdfs_debugfs_root) - return; - - parent = debugfs_create_dir(name, hmdfs_debugfs_root); - if (!parent) { - hmdfs_warning("failed to create %s debugfs directory", name); - return; - } - - *attr = fail_default_attr; - dir = fault_create_debugfs_attr("fault_inject", parent, attr); - if (IS_ERR(dir)) { - hmdfs_warning("hmdfs: failed to create debugfs attr"); - debugfs_remove_recursive(parent); - return; - } - fault_inject->parent = parent; - debugfs_create_ulong("op_mask", 0600, dir, &fault_inject->op_mask); - debugfs_create_ulong("fail_send_message", 0600, dir, - &fault_inject->fail_send_message); - debugfs_create_ulong("fake_fid_ver", 0600, dir, - &fault_inject->fake_fid_ver); - debugfs_create_bool("fail_req", 0600, dir, &fault_inject->fail_req); -} - -void hmdfs_fault_inject_fini(struct hmdfs_fault_inject *fault_inject) -{ - debugfs_remove_recursive(fault_inject->parent); -} - -bool hmdfs_should_fail_sendmsg(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, - struct hmdfs_send_data *msg, int *err) -{ - struct hmdfs_head_cmd *head = (struct hmdfs_head_cmd *)msg->head; - unsigned long type = fault_inject->fail_send_message; - - if (!test_bit(head->operations.command, &fault_inject->op_mask)) - return false; - - if (type != T_MSG_FAIL && type != T_MSG_DISCARD) - return false; - - if (!should_fail(&fault_inject->attr, 1)) - return false; - - if (type == T_MSG_FAIL) - *err = -EINVAL; - else if (type == T_MSG_DISCARD) - *err = 0; - - hmdfs_err( - "fault injection err %d, %s message, device_id %llu, msg_id %u, cmd %d", - *err, (type == T_MSG_FAIL) ? "fail" : "discard", con->device_id, - le32_to_cpu(head->msg_id), head->operations.command); - return true; -} - -bool hmdfs_should_fail_req(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, - int *err) -{ - if (!test_bit(cmd->operations.command, &fault_inject->op_mask)) - return false; - - if (!fault_inject->fail_req) - return false; - - if (!should_fail(&fault_inject->attr, 1)) - return false; - - *err = -EIO; - hmdfs_err("fault injection err %d, device_id %llu, msg_id %u, cmd %d", - *err, con->device_id, le32_to_cpu(cmd->msg_id), - cmd->operations.command); - return true; -} - -bool hmdfs_should_fake_fid_ver(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, - struct hmdfs_head_cmd *cmd, - enum CHANGE_FID_VER_TYPE fake_type) -{ - unsigned long type = fault_inject->fake_fid_ver; - - if (!test_bit(cmd->operations.command, &fault_inject->op_mask)) - return false; - - if (type != fake_type) - return false; - - if (!should_fail(&fault_inject->attr, 1)) - return false; - - hmdfs_err( - "fault injection to change fid ver by %s cookie, device_id %llu, msg_id %u, cmd %d", - (type == T_BOOT_COOKIE) ? 
"boot" : "con", con->device_id, - le32_to_cpu(cmd->msg_id), cmd->operations.command); - return true; -} diff --git a/fs/hmdfs/comm/fault_inject.h b/fs/hmdfs/comm/fault_inject.h deleted file mode 100644 index be8876ab0328..000000000000 --- a/fs/hmdfs/comm/fault_inject.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/hmdfs/comm/fault_inject.h - * - * Copyright (c) 2020-2021 Huawei Device Co., Ltd. - */ - -#ifndef HMDFS_FAULT_INJECT_H -#define HMDFS_FAULT_INJECT_H - -#include -#include "protocol.h" - -struct hmdfs_fault_inject { -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - struct fault_attr attr; - struct dentry *parent; - unsigned long op_mask; - unsigned long fail_send_message; - unsigned long fake_fid_ver; - bool fail_req; -#endif -}; - -enum FAIL_MESSAGE_TYPE { - T_MSG_FAIL = 1, - T_MSG_DISCARD = 2, -}; - -enum CHANGE_FID_VER_TYPE { - T_BOOT_COOKIE = 1, - T_CON_COOKIE = 2, -}; - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -void __init hmdfs_create_debugfs_root(void); -void hmdfs_destroy_debugfs_root(void); - -void hmdfs_fault_inject_init(struct hmdfs_fault_inject *fault_inject, - const char *name); -void hmdfs_fault_inject_fini(struct hmdfs_fault_inject *fault_inject); -bool hmdfs_should_fail_sendmsg(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, - struct hmdfs_send_data *msg, int *err); -bool hmdfs_should_fail_req(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, - int *err); -bool hmdfs_should_fake_fid_ver(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, - struct hmdfs_head_cmd *cmd, - enum CHANGE_FID_VER_TYPE fake_type); -#else -static inline void __init hmdfs_create_debugfs_root(void) {} -static inline void hmdfs_destroy_debugfs_root(void) {} - -static inline void -hmdfs_fault_inject_init(struct hmdfs_fault_inject *fault_inject, - const char *name) -{ -} -static inline void -hmdfs_fault_inject_fini(struct hmdfs_fault_inject *fault_inject) -{ -} -static inline bool -hmdfs_should_fail_sendmsg(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, struct hmdfs_send_data *msg, - int *err) -{ - return false; -} -static inline bool -hmdfs_should_fail_req(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, - int *err) -{ - return false; -} -static inline bool -hmdfs_should_fake_fid_ver(struct hmdfs_fault_inject *fault_inject, - struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, - enum CHANGE_FID_VER_TYPE fake_type) -{ - return false; -} -#endif - -#endif // HMDFS_FAULT_INJECT_H diff --git a/fs/hmdfs/comm/socket_adapter.c b/fs/hmdfs/comm/socket_adapter.c index 769b6d28ebce..eff3d3e1c044 100644 --- a/fs/hmdfs/comm/socket_adapter.c +++ b/fs/hmdfs/comm/socket_adapter.c @@ -142,10 +142,6 @@ int hmdfs_sendmessage(struct hmdfs_peer *node, struct hmdfs_send_data *msg) goto out; } - if (hmdfs_should_fail_sendmsg(&node->sbi->fault_inject, node, msg, - &ret)) - goto out; - old_cred = hmdfs_override_creds(node->sbi->system_cred); do { diff --git a/fs/hmdfs/hmdfs.h b/fs/hmdfs/hmdfs.h index 4228bb64c43e..0c5cce32e30a 100644 --- a/fs/hmdfs/hmdfs.h +++ b/fs/hmdfs/hmdfs.h @@ -16,7 +16,6 @@ #include #include "comm/protocol.h" -#include "comm/fault_inject.h" #if KERNEL_VERSION(4, 15, 0) < LINUX_VERSION_CODE #define hmdfs_time_t timespec64 @@ -185,7 +184,6 @@ struct hmdfs_sb_info { /* To bridge the userspace utils */ struct kfifo notify_fifo; spinlock_t notify_fifo_lock; - struct hmdfs_fault_inject fault_inject; /* For reboot detect */ uint64_t 
boot_cookie; diff --git a/fs/hmdfs/hmdfs_dentryfile.c b/fs/hmdfs/hmdfs_dentryfile.c index e034cb8071f0..bb9faff3e027 100644 --- a/fs/hmdfs/hmdfs_dentryfile.c +++ b/fs/hmdfs/hmdfs_dentryfile.c @@ -489,9 +489,8 @@ int read_dentry(struct hmdfs_sb_info *sbi, char *file_name, else if (S_ISREG(le16_to_cpu( dentry_group->nsl[j].i_mode))) file_type = DT_REG; - else if (S_ISLNK(le16_to_cpu( - dentry_group->nsl[j].i_mode))) - file_type = DT_LNK; + else + continue; pos = hmdfs_set_pos(0, i, j); is_continue = dir_emit( @@ -684,25 +683,14 @@ void update_dentry(struct hmdfs_dentry_group *d, struct dentry *child_dentry, struct inode *inode, __u32 name_hash, unsigned int bit_pos) { struct hmdfs_dentry *de; - struct hmdfs_dentry_info *gdi = hmdfs_d(child_dentry); const struct qstr name = child_dentry->d_name; int slots = get_dentry_slots(name.len); int i; unsigned long ino; __u32 igen; - /* - * If the dentry's inode is symlink, it must be lower inode, - * and we should use the upper ino and generation to fill - * the dentryfile. - */ - if (!gdi && S_ISLNK(d_inode(child_dentry)->i_mode)) { - ino = d_inode(child_dentry)->i_ino; - igen = d_inode(child_dentry)->i_generation; - } else { - ino = inode->i_ino; - igen = inode->i_generation; - } + ino = inode->i_ino; + igen = inode->i_generation; de = &d->nsl[bit_pos]; de->hash = cpu_to_le32(name_hash); @@ -713,21 +701,7 @@ void update_dentry(struct hmdfs_dentry_group *d, struct dentry *child_dentry, de->i_size = cpu_to_le64(inode->i_size); de->i_ino = cpu_to_le64(generate_u64_ino(ino, igen)); de->i_flag = 0; - - /* - * If the dentry has fsdata, we just assume it must be - * hmdfs filesystem's dentry. - * Only client may update it's info in dentryfile when rename - * the remote file. - * Since the symlink mtime and size is from server's lower - * inode, we should just use it and only set S_IFLNK in mode. 
- */ - if (gdi && hm_islnk(gdi->file_type)) - de->i_mode = cpu_to_le16(S_IFLNK); - else if (!gdi && S_ISLNK(d_inode(child_dentry)->i_mode)) - de->i_mode = d_inode(child_dentry)->i_mode; - else - de->i_mode = cpu_to_le16(inode->i_mode); + de->i_mode = cpu_to_le16(inode->i_mode); for (i = 0; i < slots; i++) { __set_bit_le(bit_pos + i, d->bitmap); diff --git a/fs/hmdfs/hmdfs_merge_view.h b/fs/hmdfs/hmdfs_merge_view.h index 01064b3d98df..ad9eff5cb5c3 100644 --- a/fs/hmdfs/hmdfs_merge_view.h +++ b/fs/hmdfs/hmdfs_merge_view.h @@ -34,7 +34,6 @@ struct hmdfs_dentry_comrade { enum FILE_CMD_MERGE { F_MKDIR_MERGE = 0, F_CREATE_MERGE = 1, - F_SYMLINK_MERGE = 2, }; struct hmdfs_recursive_para { @@ -140,7 +139,6 @@ struct dentry *hmdfs_get_fst_lo_d(struct dentry *dentry); extern const struct inode_operations hmdfs_file_iops_merge; extern const struct file_operations hmdfs_file_fops_merge; -extern const struct inode_operations hmdfs_symlink_iops_merge; extern const struct inode_operations hmdfs_dir_iops_merge; extern const struct file_operations hmdfs_dir_fops_merge; extern const struct dentry_operations hmdfs_dops_merge; diff --git a/fs/hmdfs/hmdfs_server.c b/fs/hmdfs/hmdfs_server.c index ea3697f33128..ccf8170b9b4d 100644 --- a/fs/hmdfs/hmdfs_server.c +++ b/fs/hmdfs/hmdfs_server.c @@ -14,7 +14,6 @@ #include #include "authority/authentication.h" -#include "comm/fault_inject.h" #include "hmdfs.h" #include "hmdfs_dentryfile.h" #include "hmdfs_trace.h" @@ -73,39 +72,6 @@ void remove_file_from_conn(struct hmdfs_peer *conn, __u32 file_id) spin_unlock(lock); } -struct file *hmdfs_open_photokit_path(struct hmdfs_sb_info *sbi, - const char *path) -{ - struct file *file; - int err; - const char *root_name = sbi->local_dst; - char *real_path; - int path_len; - - path_len = strlen(root_name) + strlen(path) + 2; - if (path_len >= PATH_MAX) { - err = -EINVAL; - return ERR_PTR(err); - } - real_path = kzalloc(path_len, GFP_KERNEL); - if (!real_path) { - err = -ENOMEM; - return ERR_PTR(err); - } - - sprintf(real_path, "%s/%s", root_name, path); - file = filp_open(real_path, O_RDWR | O_LARGEFILE, 0644); - if (IS_ERR(file)) { - hmdfs_info("filp_open failed: %ld", PTR_ERR(file)); - } else { - hmdfs_info("get file with magic %lu", - file->f_inode->i_sb->s_magic); - } - - kfree(real_path); - return file; -} - struct file *hmdfs_open_path(struct hmdfs_sb_info *sbi, const char *path) { struct path root_path; @@ -212,38 +178,6 @@ void __init hmdfs_server_add_node_evt_cb(void) hmdfs_node_add_evt_cb(server_cb, ARRAY_SIZE(server_cb)); } -static int hmdfs_get_inode_by_name(struct hmdfs_peer *con, const char *filename, - uint64_t *ino) -{ - int ret = 0; - struct path root_path; - struct path dst_path; - struct inode *inode = NULL; - - ret = kern_path(con->sbi->local_dst, 0, &root_path); - if (ret) { - hmdfs_err("kern_path failed err = %d", ret); - return ret; - } - - ret = vfs_path_lookup(root_path.dentry, root_path.mnt, filename, 0, - &dst_path); - if (ret) { - path_put(&root_path); - return ret; - } - - inode = d_inode(dst_path.dentry); - if (con->sbi->sb == inode->i_sb) - inode = hmdfs_i(inode)->lower_inode; - *ino = generate_u64_ino(inode->i_ino, inode->i_generation); - - path_put(&dst_path); - path_put(&root_path); - - return 0; -} - static const char *datasl_str[] = { "s0", "s1", "s2", "s3", "s4" }; @@ -349,17 +283,14 @@ static struct file *hmdfs_open_file(struct hmdfs_peer *con, return ERR_PTR(-EACCES); } - if (hm_islnk(file_type)) - file = hmdfs_open_photokit_path(con->sbi, filename); - else { - if (hm_isshare(file_type)) { 
- err = hmdfs_check_share_access_permission(con->sbi, - filename, con->cid, &item); - if (err) - return ERR_PTR(err); - } - file = hmdfs_open_path(con->sbi, filename); + if (hm_isshare(file_type)) { + err = hmdfs_check_share_access_permission(con->sbi, + filename, con->cid, &item); + if (err) + return ERR_PTR(err); } + file = hmdfs_open_path(con->sbi, filename); + if (IS_ERR(file)) return file; @@ -413,14 +344,6 @@ static uint64_t hmdfs_server_pack_fid_ver(struct hmdfs_peer *con, uint64_t boot_cookie = con->sbi->boot_cookie; uint16_t con_cookie = con->fid_cookie; - if (hmdfs_should_fake_fid_ver(&con->sbi->fault_inject, con, - cmd, T_BOOT_COOKIE)) - boot_cookie = hmdfs_gen_boot_cookie(); - - if (hmdfs_should_fake_fid_ver(&con->sbi->fault_inject, con, - cmd, T_CON_COOKIE)) - con_cookie++; - return (boot_cookie | (con_cookie & ((1 << HMDFS_FID_VER_BOOT_COOKIE_SHIFT) - 1))); } @@ -520,15 +443,8 @@ static int hmdfs_get_open_info(struct hmdfs_peer *con, uint8_t file_type, info->stat_valid = true; } - /* if open a link file, get ino from link inode */ - if (hm_islnk(file_type)) { - ret = hmdfs_get_inode_by_name(con, filename, &info->real_ino); - if (ret) - return ret; - } else { - info->real_ino = generate_u64_ino(info->inode->i_ino, + info->real_ino = generate_u64_ino(info->inode->i_ino, info->inode->i_generation); - } return 0; } @@ -543,8 +459,6 @@ void hmdfs_server_open(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, int ret = 0; trace_hmdfs_server_open_enter(con, recv); - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &ret)) - goto out_err; resp = kzalloc(sizeread, GFP_KERNEL); info = kmalloc(sizeof(*info), GFP_KERNEL); @@ -584,7 +498,6 @@ void hmdfs_server_open(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, err_free: kfree(resp); kfree(info); -out_err: trace_hmdfs_server_open_exit(con, NULL, NULL, ret); hmdfs_send_err_response(con, cmd, ret); } @@ -605,9 +518,7 @@ static int hmdfs_check_and_create(struct path *path_parent, } else { if (is_excl) err = -EEXIST; - /* if inode aready exist, see if it's symlink */ - else if (S_ISREG(d_inode(dentry)->i_mode) && - hm_islnk(hmdfs_d(dentry)->file_type)) + else if (S_ISLNK(d_inode(dentry)->i_mode)) err = -EINVAL; else if (S_ISDIR(d_inode(dentry)->i_mode)) err = -EISDIR; @@ -741,9 +652,6 @@ void hmdfs_server_atomic_open(struct hmdfs_peer *con, struct atomic_open_response *resp = NULL; struct hmdfs_open_info *info = NULL; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - info = kmalloc(sizeof(*info), GFP_KERNEL); resp = kzalloc(sizeof(*resp), GFP_KERNEL); if (!resp || !info) { @@ -860,14 +768,10 @@ void hmdfs_server_fsync(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto out; } - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &ret)) - goto out_put_file; - ret = vfs_fsync_range(file, start, end, datasync); if (ret) hmdfs_err("fsync fail, ret %d", ret); -out_put_file: hmdfs_close_path(file); out: hmdfs_send_err_response(con, cmd, ret); @@ -897,9 +801,6 @@ void hmdfs_server_readpage(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto fail; } - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &ret)) - goto fail_put_file; - read_len = (size_t)le32_to_cpu(readpage_recv->size); if (read_len == 0) goto fail_put_file; @@ -1052,9 +953,6 @@ void hmdfs_server_readpages_open(struct hmdfs_peer *con, size_t resp_len = 0; struct hmdfs_open_info *info = NULL; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &ret)) - goto fail; - info = kmalloc(sizeof(*info), 
GFP_KERNEL); if (!info) { ret = -ENOMEM; @@ -1283,9 +1181,6 @@ void hmdfs_server_readdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto send_err; } - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto err_lookup_path; - if (le32_to_cpu(readdir_recv->verify_cache)) { if (hmdfs_client_cache_validate(con->sbi, readdir_recv, &lo_p)) goto out_response; @@ -1417,9 +1312,6 @@ void hmdfs_server_rmdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, char *name = NULL; struct rmdir_request *rmdir_recv = data; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - path = rmdir_recv->path; name = rmdir_recv->path + le32_to_cpu(rmdir_recv->path_len) + 1; err = kern_path(con->sbi->local_dst, 0, &root_path); @@ -1427,7 +1319,7 @@ void hmdfs_server_rmdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, err = hmdfs_root_rmdir(con->device_id, &root_path, path, name); path_put(&root_path); } -out: + hmdfs_send_err_response(con, cmd, err); } @@ -1440,9 +1332,6 @@ void hmdfs_server_unlink(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, char *name = NULL; struct unlink_request *unlink_recv = data; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - path = unlink_recv->path; name = unlink_recv->path + le32_to_cpu(unlink_recv->path_len) + 1; err = kern_path(con->sbi->local_dst, 0, &root_path); @@ -1450,7 +1339,7 @@ void hmdfs_server_unlink(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, err = hmdfs_root_unlink(con->device_id, &root_path, path, name); path_put(&root_path); } -out: + hmdfs_send_err_response(con, cmd, err); } @@ -1469,9 +1358,6 @@ void hmdfs_server_rename(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, char *name_new = NULL; struct rename_request *recv = data; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - old_path_len = le32_to_cpu(recv->old_path_len); new_path_len = le32_to_cpu(recv->new_path_len); old_name_len = le32_to_cpu(recv->old_name_len); @@ -1486,50 +1372,8 @@ void hmdfs_server_rename(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, err = hmdfs_root_rename(con->sbi, con->device_id, path_old, name_old, path_new, name_new, flags); -out: - hmdfs_send_err_response(con, cmd, err); -} - -static int hmdfs_lookup_symlink(struct path *link_path, const char *path_fmt, - ...) -{ - int ret; - va_list args; - char *path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path) - return -ENOMEM; - - va_start(args, path_fmt); - ret = vsnprintf(path, PATH_MAX, path_fmt, args); - va_end(args); - - if (ret >= PATH_MAX) { - ret = -ENAMETOOLONG; - goto out; - } - - /* - * Todo: when rebuild dentryfile, there maybe deadlock - * because iterate_dir already hold the parent - * lock, but now, we didn't know the symlink - * src's parent. 
- */ - ret = kern_path(path, LOOKUP_FOLLOW, link_path); - if (ret) { - hmdfs_err("kern_path failed err = %d", ret); - goto out; - } - - if (!S_ISREG(d_inode(link_path->dentry)->i_mode)) { - hmdfs_err("path is dir symlink"); - path_put(link_path); - ret = -EOPNOTSUPP; - goto out; - } -out: - kfree(path); - return ret; + hmdfs_send_err_response(con, cmd, err); } static int hmdfs_filldir_real(struct dir_context *ctx, const char *name, @@ -1572,25 +1416,6 @@ static int hmdfs_filldir_real(struct dir_context *ctx, const char *name, if (d_type == DT_REG || d_type == DT_DIR) { create_dentry(child, d_inode(child), gc->file, gc->sbi); gc->num++; - } else if (d_type == DT_LNK) { - struct path link_path; - - res = hmdfs_lookup_symlink(&link_path, "%s/%s/%s", - gc->sbi->local_src, gc->dir, - namestr); - if (!res) { - create_dentry(child, d_inode(link_path.dentry), - gc->file, gc->sbi); - path_put(&link_path); - gc->num++; - } else if (res == -ENOENT) { - /* - * If source file do not exist, use the info from link - * inode. - */ - create_dentry(child, d_inode(child), gc->file, gc->sbi); - gc->num++; - } } dput(child); @@ -1702,16 +1527,12 @@ void hmdfs_server_writepage(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto out; } - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out_put_file; - pos = (loff_t)le64_to_cpu(writepage_recv->index) << HMDFS_PAGE_OFFSET; count = le32_to_cpu(writepage_recv->count); ret = kernel_write(file, writepage_recv->buf, count, &pos); if (ret != count) err = -EIO; -out_put_file: hmdfs_close_path(file); out: hmdfs_send_err_response(con, cmd, err); @@ -1721,27 +1542,6 @@ void hmdfs_server_writepage(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, hmdfs_server_check_writeback(hswb); } -static int hmdfs_lookup_linkpath(struct hmdfs_sb_info *sbi, - const char *path_name, struct path *dst_path) -{ - struct path link_path; - int err; - - err = hmdfs_lookup_symlink(&link_path, "%s/%s", sbi->local_dst, - path_name); - if (err) - return err; - - if (d_inode(link_path.dentry)->i_sb != sbi->sb) { - path_put(dst_path); - *dst_path = link_path; - } else { - path_put(&link_path); - } - - return 0; -} - static struct inode *hmdfs_verify_path(struct dentry *dentry, char *recv_buf, struct super_block *sb) { @@ -1791,9 +1591,6 @@ void hmdfs_server_setattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, struct iattr attr; __u32 valid = le32_to_cpu(recv->valid); - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - err = kern_path(con->sbi->local_dst, 0, &root_path); if (err) { hmdfs_err("kern_path failed err = %d", err); @@ -1811,14 +1608,9 @@ void hmdfs_server_setattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto out_put_dst; } - /* We need to follow if symlink was found */ if (S_ISLNK(inode->i_mode)) { - err = hmdfs_lookup_linkpath(con->sbi, recv->buf, &dst_path); - /* if source file doesn't exist, use link inode */ - if (err == -ENOENT) - err = 0; - else if (err) - goto out_put_dst; + err = -EPERM; + goto out_put_dst; } dentry = dst_path.dentry; @@ -1884,9 +1676,6 @@ void hmdfs_server_getattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, unsigned int recv_flags = le32_to_cpu(recv->lookup_flags); unsigned int lookup_flags = 0; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto err; - err = hmdfs_convert_lookup_flags(recv_flags, &lookup_flags); if (err) goto err; @@ -1901,7 +1690,7 @@ void hmdfs_server_getattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, hmdfs_err("kern_path 
failed err = %d", err); goto err_free_resp; } - //TODO: local_dst -->local_src + err = vfs_path_lookup(root_path.dentry, root_path.mnt, recv->buf, lookup_flags, &dst_path); if (err) @@ -1912,12 +1701,10 @@ void hmdfs_server_getattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, err = -ENOENT; goto out_put_dst; } - /* We need to follow if symlink was found */ + if (S_ISLNK(inode->i_mode)) { - err = hmdfs_lookup_linkpath(con->sbi, recv->buf, &dst_path); - /* if source file doesn't exist, use link inode */ - if (err && err != -ENOENT) - goto out_put_dst; + err = -EPERM; + goto out_put_dst; } err = vfs_getattr(&dst_path, &ks, STATX_BASIC_STATS | STATX_BTIME, 0); @@ -1979,9 +1766,6 @@ void hmdfs_server_statfs(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, struct kstatfs *st = NULL; int err = 0; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &err)) - goto out; - st = kzalloc(sizeof(*st), GFP_KERNEL); if (!st) { err = -ENOMEM; @@ -2038,11 +1822,6 @@ void hmdfs_server_syncfs(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, */ int ret = 0; - if (hmdfs_should_fail_req(&con->sbi->fault_inject, con, cmd, &ret)) { - hmdfs_send_err_response(con, cmd, ret); - return; - } - hmdfs_send_err_response(con, cmd, ret); } diff --git a/fs/hmdfs/hmdfs_trace.h b/fs/hmdfs/hmdfs_trace.h index 51ecdb9abbc4..d3c5262b5f46 100644 --- a/fs/hmdfs/hmdfs_trace.h +++ b/fs/hmdfs/hmdfs_trace.h @@ -205,12 +205,6 @@ define_hmdfs_lookup_op_end_event(hmdfs_create_merge); define_hmdfs_lookup_op_end_event(hmdfs_lookup_share); define_hmdfs_lookup_op_end_event(hmdfs_lookup_share_end); -define_hmdfs_lookup_op_end_event(hmdfs_symlink_merge); -define_hmdfs_lookup_op_end_event(hmdfs_symlink_local); - -define_hmdfs_lookup_op_end_event(hmdfs_get_link_merge); -define_hmdfs_lookup_op_end_event(hmdfs_get_link_local); - TRACE_EVENT(hmdfs_show_comrade, TP_PROTO(struct dentry *d, struct dentry *lo_d, uint64_t devid), diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index 561b45dbb465..04388c808c97 100644 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -22,11 +22,6 @@ extern struct kmem_cache *hmdfs_dentry_cachep; -static const char *const symlink_tgt_white_list[] = { - "/storage/", - "/sdcard/", -}; - struct hmdfs_name_data { struct dir_context ctx; const struct qstr *to_find; @@ -61,11 +56,6 @@ int init_hmdfs_dentry_info(struct hmdfs_sb_info *sbi, struct dentry *dentry, return 0; } -static inline void set_symlink_flag(struct hmdfs_dentry_info *gdi) -{ - gdi->file_type = HM_SYMLINK; -} - static inline void set_sharefile_flag(struct hmdfs_dentry_info *gdi) { gdi->file_type = HM_SHARE; @@ -86,6 +76,7 @@ static inline void check_and_fixup_share_ops(struct inode *inode, struct inode *fill_inode_local(struct super_block *sb, struct inode *lower_inode, const char *name) { + int ret = 0; struct inode *inode; struct hmdfs_inode_info *info; @@ -113,8 +104,6 @@ struct inode *fill_inode_local(struct super_block *sb, else if (S_ISREG(lower_inode->i_mode)) inode->i_mode = (lower_inode->i_mode & S_IFMT) | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - else if (S_ISLNK(lower_inode->i_mode)) - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; #ifdef CONFIG_HMDFS_FS_PERMISSION inode->i_uid = lower_inode->i_uid; @@ -136,15 +125,18 @@ struct inode *fill_inode_local(struct super_block *sb, } else if (S_ISREG(lower_inode->i_mode)) { inode->i_op = &hmdfs_file_iops_local; inode->i_fop = &hmdfs_file_fops_local; - } else if (S_ISLNK(lower_inode->i_mode)) { - inode->i_op = &hmdfs_symlink_iops_local; - 
inode->i_fop = &hmdfs_file_fops_local; + } else { + ret = -EIO; + goto bad_inode; } fsstack_copy_inode_size(inode, lower_inode); check_and_fixup_share_ops(inode, name); unlock_new_inode(inode); return inode; +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); } /* hmdfs_convert_lookup_flags - covert hmdfs lookup flags to vfs lookup flags @@ -270,8 +262,7 @@ struct dentry *hmdfs_lookup_local(struct inode *parent_inode, child_inode = fill_inode_local(parent_inode->i_sb, d_inode(lower_path.dentry), child_dentry->d_name.name); - if (S_ISLNK(d_inode(lower_path.dentry)->i_mode)) - set_symlink_flag(gdi); + if (IS_ERR(child_inode)) { err = PTR_ERR(child_inode); ret = ERR_PTR(err); @@ -702,145 +693,6 @@ int hmdfs_rename_local(struct inode *old_dir, struct dentry *old_dentry, return err; } -static bool symname_is_allowed(const char *symname) -{ - size_t symname_len = strlen(symname); - const char *prefix = NULL; - int i, total; - - /** - * Adjacent dots are prohibited. - * Note that vfs has escaped back slashes yet. - */ - for (i = 0; i < symname_len - 1; ++i) - if (symname[i] == '.' && symname[i + 1] == '.') - goto out_fail; - - /** - * Check if the symname is included in the whitelist - * Note that we skipped cmping strlen because symname is end with '\0' - */ - total = sizeof(symlink_tgt_white_list) / - sizeof(*symlink_tgt_white_list); - for (i = 0; i < total; ++i) { - prefix = symlink_tgt_white_list[i]; - if (!strncmp(symname, prefix, strlen(prefix))) - goto out_succ; - } - -out_fail: - hmdfs_err("Prohibited link path"); - return false; -out_succ: - return true; -} - -int hmdfs_symlink_local(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - int err; - struct dentry *lower_dentry = NULL; - struct dentry *lower_parent_dentry = NULL; - struct path lower_path; - struct inode *child_inode = NULL; - struct inode *lower_dir_inode = hmdfs_i(dir)->lower_inode; - struct hmdfs_dentry_info *gdi = hmdfs_d(dentry); - kuid_t tmp_uid; -#ifdef CONFIG_HMDFS_FS_PERMISSION - const struct cred *saved_cred = NULL; - struct fs_struct *saved_fs = NULL, *copied_fs = NULL; - __u16 child_perm; -#endif - - if (unlikely(!symname_is_allowed(symname))) { - err = -EPERM; - goto path_err; - } - -#ifdef CONFIG_HMDFS_FS_PERMISSION - saved_cred = hmdfs_override_file_fsids(dir, &child_perm); - if (!saved_cred) { - err = -ENOMEM; - goto path_err; - } - - saved_fs = current->fs; - copied_fs = hmdfs_override_fsstruct(saved_fs); - if (!copied_fs) { - err = -ENOMEM; - goto revert_fsids; - } -#endif - hmdfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_parent_dentry = lock_parent(lower_dentry); - tmp_uid = hmdfs_override_inode_uid(lower_dir_inode); - err = vfs_symlink(lower_dir_inode, lower_dentry, symname); - hmdfs_revert_inode_uid(lower_dir_inode, tmp_uid); - unlock_dir(lower_parent_dentry); - if (err) - goto out_err; - set_symlink_flag(gdi); -#ifdef CONFIG_HMDFS_FS_PERMISSION - err = hmdfs_persist_perm(lower_dentry, &child_perm); -#endif - child_inode = fill_inode_local(dir->i_sb, d_inode(lower_dentry), - dentry->d_name.name); - if (IS_ERR(child_inode)) { - err = PTR_ERR(child_inode); - goto out_err; - } - d_add(dentry, child_inode); - fsstack_copy_attr_times(dir, lower_dir_inode); - fsstack_copy_inode_size(dir, lower_dir_inode); - -out_err: - hmdfs_put_lower_path(&lower_path); -#ifdef CONFIG_HMDFS_FS_PERMISSION - hmdfs_revert_fsstruct(saved_fs, copied_fs); -revert_fsids: - hmdfs_revert_fsids(saved_cred); -#endif -path_err: - trace_hmdfs_symlink_local(dir, dentry, err); - 
return err; -} - -static const char *hmdfs_get_link_local(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - const char *link = NULL; - struct dentry *lower_dentry = NULL; - struct inode *lower_inode = NULL; - struct path lower_path; - - if (!dentry) { - hmdfs_err("dentry NULL"); - link = ERR_PTR(-ECHILD); - goto link_out; - } - - hmdfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_inode = d_inode(lower_dentry); - if (!lower_inode->i_op || !lower_inode->i_op->get_link) { - hmdfs_err("The lower inode doesn't support get_link i_op"); - link = ERR_PTR(-EINVAL); - goto out; - } - - link = lower_inode->i_op->get_link(lower_dentry, lower_inode, done); - if (IS_ERR_OR_NULL(link)) - goto out; - fsstack_copy_attr_atime(inode, lower_inode); -out: - hmdfs_put_lower_path(&lower_path); - trace_hmdfs_get_link_local(inode, dentry, PTR_ERR_OR_ZERO(link)); -link_out: - return link; -} - static int hmdfs_setattr_local(struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); @@ -1037,19 +889,12 @@ struct dentry *hmdfs_lookup_share(struct inode *parent_inode, return ret; } -const struct inode_operations hmdfs_symlink_iops_local = { - .get_link = hmdfs_get_link_local, - .permission = hmdfs_permission, - .setattr = hmdfs_setattr_local, -}; - const struct inode_operations hmdfs_dir_inode_ops_local = { .lookup = hmdfs_lookup_local, .mkdir = hmdfs_mkdir_local, .create = hmdfs_create_local, .rmdir = hmdfs_rmdir_local, .unlink = hmdfs_unlink_local, - .symlink = hmdfs_symlink_local, .rename = hmdfs_rename_local, .permission = hmdfs_permission, .setattr = hmdfs_setattr_local, diff --git a/fs/hmdfs/inode_merge.c b/fs/hmdfs/inode_merge.c index f84f57d5e85c..8315eba05cfc 100644 --- a/fs/hmdfs/inode_merge.c +++ b/fs/hmdfs/inode_merge.c @@ -100,6 +100,7 @@ static struct inode *fill_inode_merge(struct super_block *sb, struct dentry *child_dentry, struct dentry *lo_d_dentry) { + int ret = 0; struct dentry *fst_lo_d = NULL; struct hmdfs_inode_info *info = NULL; struct inode *inode = NULL; @@ -138,32 +139,29 @@ static struct inode *fill_inode_merge(struct super_block *sb, update_inode_attr(inode, child_dentry); mode = d_inode(fst_lo_d)->i_mode; - /* remote symlink need to treat as regfile, - * the specific operation is performed by device_view. - * local symlink is managed by merge_view. - */ - if (hm_islnk(hmdfs_d(fst_lo_d)->file_type) && - hmdfs_d(fst_lo_d)->device_id == 0) { - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - inode->i_op = &hmdfs_symlink_iops_merge; - inode->i_fop = &hmdfs_file_fops_merge; - set_nlink(inode, 1); - } else if (S_ISREG(mode)) { // Reguler file 0660 + + if (S_ISREG(mode)) { inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; inode->i_op = &hmdfs_file_iops_merge; inode->i_fop = &hmdfs_file_fops_merge; set_nlink(inode, 1); - } else if (S_ISDIR(mode)) { // Directory 0771 + } else if (S_ISDIR(mode)) { inode->i_mode = S_IFDIR | S_IRWXU | S_IRWXG | S_IXOTH; inode->i_op = &hmdfs_dir_iops_merge; inode->i_fop = &hmdfs_dir_fops_merge; set_nlink(inode, get_num_comrades(child_dentry) + 2); + } else { + ret = -EIO; + goto bad_inode; } unlock_new_inode(inode); out: dput(fst_lo_d); return inode; +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); } struct hmdfs_dentry_comrade *alloc_comrade(struct dentry *lo_d, int dev_id) @@ -596,10 +594,9 @@ struct dentry *hmdfs_lookup_merge(struct inode *parent_inode, /* * Internal flags like LOOKUP_CREATE should not pass to device view. 
* LOOKUP_REVAL is needed because dentry cache in hmdfs might be stale - * after rename in lower fs. LOOKUP_FOLLOW is not needed because - * get_link is defined for symlink inode in merge_view. - * LOOKUP_DIRECTORY is not needed because merge_view can do the - * judgement that whether result is directory or not. + * after rename in lower fs. LOOKUP_DIRECTORY is not needed because + * merge_view can do the judgement that whether result is directory or + * not. */ flags = flags & LOOKUP_REVAL; @@ -771,32 +768,6 @@ int do_create_merge(struct inode *parent_inode, struct dentry *child_dentry, return ret; } -int do_symlink_merge(struct inode *parent_inode, struct dentry *child_dentry, - const char *symname, struct inode *lower_parent_inode, - struct dentry *lo_d_child) -{ - int ret = 0; - struct super_block *sb = parent_inode->i_sb; - struct inode *child_inode = NULL; - - ret = vfs_symlink(lower_parent_inode, lo_d_child, symname); - if (ret) - goto out; - - child_inode = - fill_inode_merge(sb, parent_inode, child_dentry, lo_d_child); - if (IS_ERR(child_inode)) { - ret = PTR_ERR(child_inode); - goto out; - } - - d_add(child_dentry, child_inode); - fsstack_copy_attr_times(parent_inode, lower_parent_inode); - fsstack_copy_inode_size(parent_inode, lower_parent_inode); -out: - return ret; -} - int hmdfs_do_ops_merge(struct inode *i_parent, struct dentry *d_child, struct dentry *lo_d_child, struct path path, struct hmdfs_recursive_para *rec_op_para) @@ -816,12 +787,6 @@ int hmdfs_do_ops_merge(struct inode *i_parent, struct dentry *d_child, rec_op_para->want_excl, d_inode(path.dentry), lo_d_child); break; - case F_SYMLINK_MERGE: - ret = do_symlink_merge(i_parent, d_child, - rec_op_para->name, - d_inode(path.dentry), - lo_d_child); - break; default: ret = -EINVAL; break; @@ -1036,16 +1001,11 @@ int do_rmdir_merge(struct inode *dir, struct dentry *dentry) struct dentry *lo_d_dir = NULL; struct inode *lo_i_dir = NULL; - //TODO: 当前只删本地,因不会影响到图库场景 - //TODO:图库重启清除软连接?或者什么场景会删除 - //TODO: remove 调用同时删除空目录以及非空目录,结果不一致 - //TODO: 如果校验会不会有并发问题?就算锁,也只能锁自己 mutex_lock(&dim->comrade_list_lock); list_for_each_entry(comrade, &(dim->comrade_list), list) { lo_d = comrade->lo_d; lo_d_dir = lock_parent(lo_d); lo_i_dir = d_inode(lo_d_dir); - //TODO: 部分成功,lo_d确认 ret = vfs_rmdir(lo_i_dir, lo_d); unlock_dir(lo_d_dir); if (ret) @@ -1085,7 +1045,7 @@ int do_unlink_merge(struct inode *dir, struct dentry *dentry) struct dentry *lo_d = NULL; struct dentry *lo_d_dir = NULL; struct inode *lo_i_dir = NULL; - // TODO:文件场景 list_first_entry + mutex_lock(&dim->comrade_list_lock); list_for_each_entry(comrade, &(dim->comrade_list), list) { lo_d = comrade->lo_d; @@ -1121,34 +1081,6 @@ int hmdfs_unlink_merge(struct inode *dir, struct dentry *dentry) return ret; } -int hmdfs_symlink_merge(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - int ret = 0; - struct hmdfs_recursive_para *rec_op_para = NULL; - - if (hmdfs_file_type(dentry->d_name.name) != HMDFS_TYPE_COMMON) { - ret = -EACCES; - goto out; - } - - rec_op_para = kmalloc(sizeof(*rec_op_para), GFP_KERNEL); - if (!rec_op_para) { - ret = -ENOMEM; - goto out; - } - hmdfs_init_recursive_para(rec_op_para, F_SYMLINK_MERGE, 0, false, - symname); - ret = create_lo_d_child(dir, dentry, false, rec_op_para); - -out: - trace_hmdfs_symlink_merge(dir, dentry, ret); - if (ret) - d_drop(dentry); - kfree(rec_op_para); - return ret; -} - int do_rename_merge(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1166,11 
+1098,6 @@ int do_rename_merge(struct inode *old_dir, struct dentry *old_dentry, char *abs_path_buf = kmalloc(PATH_MAX, GFP_KERNEL); char *path_name = NULL; - /* TODO: Will WPS rename a temporary file to another directory? - * could flags with replace bit result in rename ops - * cross_devices? - * currently does not support replace flags. - */ if (flags & ~RENAME_NOREPLACE) { ret = -EINVAL; goto out; @@ -1302,56 +1229,12 @@ int hmdfs_rename_merge(struct inode *old_dir, struct dentry *old_dentry, return ret; } -static const char *hmdfs_get_link_merge(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - const char *link = NULL; - struct dentry *lower_dentry = NULL; - struct inode *lower_inode = NULL; - - if (!dentry) { - hmdfs_err("dentry NULL"); - link = ERR_PTR(-ECHILD); - goto link_out; - } - - lower_dentry = hmdfs_get_fst_lo_d(dentry); - if (!lower_dentry) { - WARN_ON(1); - link = ERR_PTR(-EINVAL); - goto out; - } - lower_inode = d_inode(lower_dentry); - if (!lower_inode->i_op || !lower_inode->i_op->get_link) { - hmdfs_err("lower inode hold no operations"); - link = ERR_PTR(-EINVAL); - goto out; - } - - link = lower_inode->i_op->get_link(lower_dentry, lower_inode, done); - if (IS_ERR_OR_NULL(link)) - goto out; - fsstack_copy_attr_atime(inode, lower_inode); -out: - dput(lower_dentry); - trace_hmdfs_get_link_merge(inode, dentry, PTR_ERR_OR_ZERO(link)); -link_out: - return link; -} - -const struct inode_operations hmdfs_symlink_iops_merge = { - .get_link = hmdfs_get_link_merge, - .permission = hmdfs_permission, -}; - const struct inode_operations hmdfs_dir_iops_merge = { .lookup = hmdfs_lookup_merge, .mkdir = hmdfs_mkdir_merge, .create = hmdfs_create_merge, .rmdir = hmdfs_rmdir_merge, .unlink = hmdfs_unlink_merge, - .symlink = hmdfs_symlink_merge, .rename = hmdfs_rename_merge, .permission = hmdfs_permission, }; diff --git a/fs/hmdfs/inode_remote.c b/fs/hmdfs/inode_remote.c index 78f04bdc4813..32692b9ac67d 100644 --- a/fs/hmdfs/inode_remote.c +++ b/fs/hmdfs/inode_remote.c @@ -339,6 +339,7 @@ static void hmdfs_fill_inode_android(struct inode *inode, struct inode *dir, struct inode *fill_inode_remote(struct super_block *sb, struct hmdfs_peer *con, struct hmdfs_lookup_ret *res, struct inode *dir) { + int ret = 0; struct inode *inode = NULL; struct hmdfs_inode_info *info; umode_t mode = res->i_mode; @@ -372,24 +373,33 @@ struct inode *fill_inode_remote(struct super_block *sb, struct hmdfs_peer *con, inode->i_mode = S_IFDIR | S_IRWXU | S_IRWXG | S_IXOTH; else if (S_ISREG(mode)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - else if (S_ISLNK(mode)) - inode->i_mode = S_IFREG | S_IRWXU | S_IRWXG; + else { + ret = -EIO; + goto bad_inode; + } - if (S_ISREG(mode) || S_ISLNK(mode)) { // Reguler file + if (S_ISREG(mode)) { inode->i_op = con->conn_operations->remote_file_iops; inode->i_fop = con->conn_operations->remote_file_fops; inode->i_size = res->i_size; set_nlink(inode, 1); - } else if (S_ISDIR(mode)) { // Directory + } else if (S_ISDIR(mode)) { inode->i_op = &hmdfs_dev_dir_inode_ops_remote; inode->i_fop = &hmdfs_dev_dir_ops_remote; set_nlink(inode, 2); + } else { + ret = -EIO; + goto bad_inode; } + inode->i_mapping->a_ops = con->conn_operations->remote_file_aops; hmdfs_fill_inode_android(inode, dir, mode); unlock_new_inode(inode); return inode; +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); } static bool in_share_dir(struct dentry *child_dentry) @@ -447,8 +457,6 @@ static struct dentry *hmdfs_lookup_remote_dentry(struct inode *parent_inode, 
lookup_result = hmdfs_lookup_by_con(con, child_dentry, &qstr, flags, relative_path); if (lookup_result != NULL) { - if (S_ISLNK(lookup_result->i_mode)) - gdi->file_type = HM_SYMLINK; if (in_share_dir(child_dentry)) gdi->file_type = HM_SHARE; inode = fill_inode_remote(sb, con, lookup_result, parent_inode); diff --git a/fs/hmdfs/main.c b/fs/hmdfs/main.c index a490d069d239..f692cfa89747 100644 --- a/fs/hmdfs/main.c +++ b/fs/hmdfs/main.c @@ -238,7 +238,6 @@ void hmdfs_put_super(struct super_block *sb) hmdfs_info("local_dst is %s, local_src is %s", sbi->local_dst, sbi->local_src); - hmdfs_fault_inject_fini(&sbi->fault_inject); hmdfs_cfn_destroy(sbi); hmdfs_unregister_sysfs(sbi); hmdfs_connections_stop(sbi); @@ -923,7 +922,6 @@ static int hmdfs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->hsi.wait_list); INIT_LIST_HEAD(&sbi->hsi.pending_list); spin_lock_init(&sbi->hsi.list_lock); - hmdfs_fault_inject_init(&sbi->fault_inject, ctrl_path); return err; out_freeroot: @@ -1054,7 +1052,6 @@ static int __init hmdfs_init(void) goto out_err; hmdfs_message_verify_init(); - hmdfs_create_debugfs_root(); return 0; out_err: hmdfs_sysfs_exit(); @@ -1067,7 +1064,6 @@ static int __init hmdfs_init(void) static void __exit hmdfs_exit(void) { - hmdfs_destroy_debugfs_root(); hmdfs_sysfs_exit(); hmdfs_exit_configfs(); unregister_filesystem(&hmdfs_fs_type); diff --git a/fs/hmdfs/stash.c b/fs/hmdfs/stash.c index c320af7f60e0..413720404cc7 100644 --- a/fs/hmdfs/stash.c +++ b/fs/hmdfs/stash.c @@ -2179,7 +2179,7 @@ hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer *conn, umode_t mode) { return hmdfs_is_stash_enabled(conn->sbi) && READ_ONCE(conn->need_rebuild_stash_list) && - (S_ISREG(mode) || S_ISLNK(mode)); + S_ISREG(mode); } void hmdfs_remote_init_stash_status(struct hmdfs_peer *conn, -- Gitee From df5a01d0efed088001e403871e723e4dc78b3387 Mon Sep 17 00:00:00 2001 From: gaochao Date: Mon, 21 Feb 2022 19:59:01 +0800 Subject: [PATCH 072/113] cpuset: fix cpumask_subset when setting invalid cpus ohos inclusion category: bugfix issue: #I4UE7J CVE: NA ------------------------------------------- To show the problem: # cat /proc/cpuinfo | grep -i processor | wc -l 8 # cat cpus # echo 8 > cpus # cat cpus # echo 0-8 > cpus # cat cpus 0-7 # With patch: # cat /proc/cpuinfo | grep -i processor | wc -l 8 # cat cpus # echo 8 > cpus sh: write error: Invalid argument # cat cpus # # echo 0-8 > cpus sh: write error: Invalid argument # # cat cpus # fix cpumask_subset in update_cpumask when setting invalid cpus Signed-off-by: gaochao --- kernel/cgroup/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c33d70215079..3173fe47380a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1539,16 +1539,16 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return retval; } - if (!cpumask_subset(trialcs->cpus_requested, top_cpuset.cpus_requested)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, + cpu_active_mask); + /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; - cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, - cpu_active_mask); - retval = validate_change(cs, trialcs); if (retval < 0) return retval; -- Gitee From cf037325f085838da991559c2249683448a90334 Mon Sep 17 00:00:00 2001 From: waterwin Date: Tue, 15 Feb 
2022 17:21:21 +0800 Subject: [PATCH 073/113] hmdfs: modify read and write interface in hmdfs ohos inclusion category: bugfix issue: #I4T9TP CVE: NA ---------------------------------------------- vfs_read and vfs_write is not exported, change interface to vfs_read_iter and vfs_write_iter Signed-off-by: qianjiaxing --- fs/hmdfs/comm/connection.h | 2 +- fs/hmdfs/file_local.c | 109 +++++++++++++++++++++++------------ fs/hmdfs/file_merge.c | 14 ++++- fs/hmdfs/hmdfs_device_view.h | 10 +++- fs/hmdfs/inode_remote.c | 8 +-- 5 files changed, 96 insertions(+), 47 deletions(-) diff --git a/fs/hmdfs/comm/connection.h b/fs/hmdfs/comm/connection.h index 2d80491b9201..6137c549824c 100644 --- a/fs/hmdfs/comm/connection.h +++ b/fs/hmdfs/comm/connection.h @@ -144,7 +144,7 @@ struct hmdfs_peer { uint64_t device_id; unsigned long conn_time; uint8_t version; - u8 status; + int status; u64 features; long long old_sb_dirty_count; atomic64_t sb_dirty_count; diff --git a/fs/hmdfs/file_local.c b/fs/hmdfs/file_local.c index bef62b2c04f3..86d402515589 100644 --- a/fs/hmdfs/file_local.c +++ b/fs/hmdfs/file_local.c @@ -57,49 +57,84 @@ int hmdfs_file_release_local(struct inode *inode, struct file *file) return 0; } -ssize_t hmdfs_read_local(struct kiocb *iocb, struct iov_iter *iter) +static void hmdfs_file_accessed(struct file *file) { - struct file *lower_file = hmdfs_f(iocb->ki_filp)->lower_file; - int err; + struct file *lower_file = hmdfs_f(file)->lower_file; + struct inode *inode = file_inode(file); + struct inode *lower_inode = file_inode(lower_file); - if (iter->type & ITER_KVEC) - err = kernel_read(lower_file, iter->iov->iov_base, - iter->iov->iov_len, &(iocb->ki_pos)); - else - err = vfs_read(lower_file, iter->iov->iov_base, - iter->iov->iov_len, &(iocb->ki_pos)); + if (file->f_flags & O_NOATIME) + return; - if (err >= 0) - file_inode(iocb->ki_filp)->i_atime = file_inode(lower_file)->i_atime; - return err; + inode->i_atime = lower_inode->i_atime; +} + +ssize_t hmdfs_do_read_iter(struct file *file, struct iov_iter *iter, + loff_t *ppos) +{ + ssize_t ret; + struct file *lower_file = hmdfs_f(file)->lower_file; + + if (!iov_iter_count(iter)) + return 0; + + ret = vfs_iter_read(lower_file, iter, ppos, 0); + hmdfs_file_accessed(file); + + return ret; +} + +static ssize_t hmdfs_local_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + return hmdfs_do_read_iter(iocb->ki_filp, iter, &iocb->ki_pos); } -ssize_t hmdfs_write_local(struct kiocb *iocb, struct iov_iter *iter) +static void hmdfs_file_modified(struct file *file) { - struct file *lower_file = hmdfs_f(iocb->ki_filp)->lower_file; - struct inode *inode = file_inode(iocb->ki_filp); + struct inode *inode = file_inode(file); + struct dentry *dentry = file_dentry(file); + struct file *lower_file = hmdfs_f(file)->lower_file; struct inode *lower_inode = file_inode(lower_file); - struct dentry *dentry = file_dentry(iocb->ki_filp); - int err; - if (iter->type & ITER_KVEC) - err = kernel_write(lower_file, iter->iov->iov_base, - iter->iov->iov_len, &(iocb->ki_pos)); - else - err = vfs_write(lower_file, iter->iov->iov_base, - iter->iov->iov_len, &(iocb->ki_pos)); - - if (err >= 0) { - inode_lock(inode); - i_size_write(inode, i_size_read(lower_inode)); - inode->i_atime = lower_inode->i_atime; - inode->i_ctime = lower_inode->i_ctime; - inode->i_mtime = lower_inode->i_mtime; - if (!hmdfs_i_merge(hmdfs_i(inode))) - update_inode_to_dentry(dentry, inode); - inode_unlock(inode); - } - return err; + inode->i_atime = lower_inode->i_atime; + inode->i_ctime = lower_inode->i_ctime; + 
inode->i_mtime = lower_inode->i_mtime; + i_size_write(inode, i_size_read(lower_inode)); + + if (!hmdfs_i_merge(hmdfs_i(inode))) + update_inode_to_dentry(dentry, inode); +} + +ssize_t hmdfs_do_write_iter(struct file *file, struct iov_iter *iter, + loff_t *ppos) +{ + ssize_t ret; + struct file *lower_file = hmdfs_f(file)->lower_file; + struct inode *inode = file_inode(file); + + if (!iov_iter_count(iter)) + return 0; + + inode_lock(inode); + + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + file_start_write(lower_file); + ret = vfs_iter_write(lower_file, iter, ppos, 0); + file_end_write(lower_file); + + hmdfs_file_modified(file); + +out_unlock: + inode_unlock(inode); + return ret; +} + +ssize_t hmdfs_local_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + return hmdfs_do_write_iter(iocb->ki_filp, iter, &iocb->ki_pos); } int hmdfs_fsync_local(struct file *file, loff_t start, loff_t end, int datasync) @@ -164,8 +199,8 @@ int hmdfs_file_mmap_local(struct file *file, struct vm_area_struct *vma) const struct file_operations hmdfs_file_fops_local = { .owner = THIS_MODULE, .llseek = hmdfs_file_llseek_local, - .read_iter = hmdfs_read_local, - .write_iter = hmdfs_write_local, + .read_iter = hmdfs_local_read_iter, + .write_iter = hmdfs_local_write_iter, .mmap = hmdfs_file_mmap_local, .open = hmdfs_file_open_local, .release = hmdfs_file_release_local, diff --git a/fs/hmdfs/file_merge.c b/fs/hmdfs/file_merge.c index 237bb9e806d9..8c76b7e3098f 100644 --- a/fs/hmdfs/file_merge.c +++ b/fs/hmdfs/file_merge.c @@ -507,6 +507,16 @@ const struct file_operations hmdfs_dir_fops_merge = { .compat_ioctl = hmdfs_dir_compat_ioctl_merge, }; +static ssize_t hmdfs_merge_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + return hmdfs_do_read_iter(iocb->ki_filp, iter, &iocb->ki_pos); +} + +ssize_t hmdfs_merge_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + return hmdfs_do_write_iter(iocb->ki_filp, iter, &iocb->ki_pos); +} + int hmdfs_file_open_merge(struct inode *inode, struct file *file) { int err = 0; @@ -561,8 +571,8 @@ int hmdfs_file_flush_merge(struct file *file, fl_owner_t id) const struct file_operations hmdfs_file_fops_merge = { .owner = THIS_MODULE, .llseek = hmdfs_file_llseek_local, - .read_iter = hmdfs_read_local, - .write_iter = hmdfs_write_local, + .read_iter = hmdfs_merge_read_iter, + .write_iter = hmdfs_merge_write_iter, .mmap = hmdfs_file_mmap_local, .open = hmdfs_file_open_merge, .flush = hmdfs_file_flush_merge, diff --git a/fs/hmdfs/hmdfs_device_view.h b/fs/hmdfs/hmdfs_device_view.h index fc77ef9ebcbd..160391226caf 100644 --- a/fs/hmdfs/hmdfs_device_view.h +++ b/fs/hmdfs/hmdfs_device_view.h @@ -51,7 +51,7 @@ struct hmdfs_dentry_info { uint64_t device_id; spinlock_t lock; struct mutex cache_pull_lock; - bool async_readdir_in_progress; + int async_readdir_in_progress; }; struct hmdfs_lookup_ret { @@ -112,8 +112,12 @@ int hmdfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); loff_t hmdfs_file_llseek_local(struct file *file, loff_t offset, int whence); -ssize_t hmdfs_read_local(struct kiocb *iocb, struct iov_iter *iter); -ssize_t hmdfs_write_local(struct kiocb *iocb, struct iov_iter *iter); + +ssize_t hmdfs_do_read_iter(struct file *file, struct iov_iter *iter, + loff_t *ppos); +ssize_t hmdfs_do_write_iter(struct file *file, struct iov_iter *iter, + loff_t *ppos); + int hmdfs_file_release_local(struct inode *inode, struct file *file); int hmdfs_file_mmap_local(struct file *file, struct 
vm_area_struct *vma); struct dentry *hmdfs_lookup(struct inode *parent_inode, diff --git a/fs/hmdfs/inode_remote.c b/fs/hmdfs/inode_remote.c index 32692b9ac67d..0a4493455e0f 100644 --- a/fs/hmdfs/inode_remote.c +++ b/fs/hmdfs/inode_remote.c @@ -117,7 +117,7 @@ static void hmdfs_remote_readdir_work(struct work_struct *work) bool empty = false; get_remote_dentry_file(dentry, con); - hmdfs_d(dentry)->async_readdir_in_progress = false; + hmdfs_d(dentry)->async_readdir_in_progress = 0; hmdfs_revert_creds(old_cred); dput(dentry); @@ -138,13 +138,13 @@ static void get_remote_dentry_file_in_wq(struct dentry *dentry, struct hmdfs_readdir_work *rw = NULL; /* do nothing if async readdir is already in progress */ - if (cmpxchg_relaxed(&hmdfs_d(dentry)->async_readdir_in_progress, false, - true)) + if (cmpxchg_relaxed(&hmdfs_d(dentry)->async_readdir_in_progress, 0, + 1)) return; rw = kmalloc(sizeof(*rw), GFP_KERNEL); if (!rw) { - hmdfs_d(dentry)->async_readdir_in_progress = false; + hmdfs_d(dentry)->async_readdir_in_progress = 0; return; } -- Gitee From 0ac7b1bc7c7630d60f4e8d2fe55f24bef72618fc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 9 Feb 2022 16:37:53 +0100 Subject: [PATCH 074/113] fixed fd631da from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 usb: gadget: rndis: check size of RNDIS_MSG_SET command stable inclusion from stable-v5.10.101 commit fb4ff0f96de37c44236598e8b53fe43b1df36bf3 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-25375 Signed-off-by: Yu Changchun --------------------------------- commit 38ea1eac7d88072bbffb630e2b3db83ca649b826 upstream. Check the size of the RNDIS_MSG_SET command given to us before attempting to respond to an invalid message size. Reported-by: Szymon Heidrich Cc: stable@kernel.org Tested-by: Szymon Heidrich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/usb/gadget/function/rndis.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 9ea94215e113..60d89339a563 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -655,14 +655,17 @@ static int rndis_set_response(struct rndis_params *params, rndis_set_cmplt_type *resp; rndis_resp_t *r; + BufLength = le32_to_cpu(buf->InformationBufferLength); + BufOffset = le32_to_cpu(buf->InformationBufferOffset); + if ((BufLength > RNDIS_MAX_TOTAL_SIZE) || + (BufOffset + 8 >= RNDIS_MAX_TOTAL_SIZE)) + return -EINVAL; + r = rndis_add_response(params, sizeof(rndis_set_cmplt_type)); if (!r) return -ENOMEM; resp = (rndis_set_cmplt_type *)r->buf; - BufLength = le32_to_cpu(buf->InformationBufferLength); - BufOffset = le32_to_cpu(buf->InformationBufferOffset); - #ifdef VERBOSE_DEBUG pr_debug("%s: Length: %d\n", __func__, BufLength); pr_debug("%s: Offset: %d\n", __func__, BufOffset); -- Gitee From 7ea9b48f13e3be1dd8690f5e32b08306e3b63264 Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Mon, 24 Jan 2022 12:14:00 +0100 Subject: [PATCH 075/113] fixed fede5bc from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 USB: gadget: validate interface OS descriptor requests stable inclusion from stable-v5.10.101 commit 22ec1004728548598f4f5b4a079a7873409eacfd category: bugfix issue: #I4U9Y8 CVE: CVE-2022-25258 Signed-off-by: Yu Changchun --------------------------------- commit 75e5b4849b81e19e9efe1654b30d7f3151c33c2c upstream. 
Stall the control endpoint in case provided index exceeds array size of MAX_CONFIG_INTERFACES or when the retrieved function pointer is null. Signed-off-by: Szymon Heidrich Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/usb/gadget/composite.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 8bec0cbf844e..a980799900e7 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1944,6 +1944,9 @@ composite_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) if (w_index != 0x5 || (w_value >> 8)) break; interface = w_value & 0xFF; + if (interface >= MAX_CONFIG_INTERFACES || + !os_desc_cfg->interface[interface]) + break; buf[6] = w_index; count = count_ext_prop(os_desc_cfg, interface); -- Gitee From 6fa6d53688900fd51273bcd63170e83bca2b8915 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Sat, 1 Jan 2022 01:21:38 +0800 Subject: [PATCH 076/113] fixed 4bd60af from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 usb: gadget: clear related members when goto fail mainline inclusion from mainline-v5.17-rc1 commit 501e38a5531efbd77d5c73c0ba838a889bfc1d74 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-24958 Signed-off-by: Yu Changchun ------------------------------------------------- dev->config and dev->hs_config and dev->dev need to be cleaned if dev_config fails to avoid UAF. Acked-by: Alan Stern Signed-off-by: Hangyu Hua Link: https://lore.kernel.org/r/20211231172138.7993-3-hbh25y@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/usb/gadget/legacy/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 217d2b66fa51..523d05e5dca5 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1876,8 +1876,8 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) value = usb_gadget_probe_driver(&gadgetfs_driver); if (value != 0) { - kfree (dev->buf); - dev->buf = NULL; + spin_lock_irq(&dev->lock); + goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. alternatively, if users @@ -1894,6 +1894,9 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return value; fail: + dev->config = NULL; + dev->hs_config = NULL; + dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); -- Gitee From 5f89b15c3af88fae50fba32a1fe5bedc3fd8f288 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Sat, 1 Jan 2022 01:21:37 +0800 Subject: [PATCH 077/113] fixed 5478ca1 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 usb: gadget: don't release an existing dev->buf mainline inclusion from mainline-v5.17-rc1 commit 89f3594d0de58e8a57d92d497dea9fee3d4b9cda category: bugfix issue: #I4U9Y8 CVE: CVE-2022-24958 Signed-off-by: Yu Changchun ------------------------------------------------- dev->buf does not need to be released if it already exists before executing dev_config. 
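The rule this fix enforces can be shown in a few lines of ordinary C: when a buffer is already installed, only the newly allocated one may be freed, and the existing state must be left alone. A minimal userspace-style sketch, with invented names rather than the gadgetfs code:

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    struct cfg_dev {
        pthread_mutex_t lock;
        char *buf;                      /* owned by the device once installed */
    };

    /* Install a freshly allocated configuration buffer exactly once. */
    static int install_config(struct cfg_dev *dev, char *kbuf)
    {
        pthread_mutex_lock(&dev->lock);
        if (dev->buf) {
            /* Already configured: drop only the new buffer; the existing
             * dev->buf and the state built on top of it stay untouched. */
            pthread_mutex_unlock(&dev->lock);
            free(kbuf);
            return -EINVAL;
        }
        dev->buf = kbuf;                /* ownership transferred to the device */
        pthread_mutex_unlock(&dev->lock);
        return 0;
    }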
Acked-by: Alan Stern Signed-off-by: Hangyu Hua Link: https://lore.kernel.org/r/20211231172138.7993-2-hbh25y@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- drivers/usb/gadget/legacy/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 523d05e5dca5..454860d52ce7 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1828,8 +1828,9 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { + spin_unlock_irq(&dev->lock); kfree(kbuf); - goto fail; + return value; } dev->buf = kbuf; -- Gitee From 0db98aa3d4522962166d395ae4d7db978832bf71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 19:33:44 +0800 Subject: [PATCH 078/113] fixed e95ba23 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 bpf, selftests: Add test case trying to taint map value pointer stable inclusion from stable-v5.10.88 commit 0612679e48d0f9c8723c94feae3309550dcf2edf category: bugfix issue: #I4U9Y8 CVE: CVE-2021-45402 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0612679e48d0f9c8723c94feae3309550dcf2edf Signed-off-by: Yu Changchun -------------------------------- commit b1a7288dedc6caf9023f2676b4f5ed34cf0d4029 upstream. Add a test case which tries to taint map value pointer arithmetic into a unknown scalar with subsequent export through the map. Before fix: # ./test_verifier 1186 #1186/u map access: trying to leak tained dst reg FAIL Unexpected success to load! verification time 24 usec stack depth 8 processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 #1186/p map access: trying to leak tained dst reg FAIL Unexpected success to load! 
verification time 8 usec stack depth 8 processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 Summary: 0 PASSED, 0 SKIPPED, 2 FAILED After fix: # ./test_verifier 1186 #1186/u map access: trying to leak tained dst reg OK #1186/p map access: trying to leak tained dst reg OK Summary: 2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- .../selftests/bpf/verifier/value_ptr_arith.c | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a3e593ddfafc..d8765a4d5bc6 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -848,6 +848,29 @@ .errstr = "R0 invalid mem access 'inv'", .errstr_unpriv = "R0 pointer -= pointer prohibited", }, +{ + "map access: trying to leak tained dst reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF), + BPF_MOV32_REG(BPF_REG_1, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 4 }, + .result = REJECT, + .errstr = "math between map_value pointer and 4294967295 is not allowed", +}, { "32bit pkt_ptr -= scalar", .insns = { -- Gitee From c93d71c64220f70c5ef3afbe2b22457a6adfbfcc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 19:33:43 +0800 Subject: [PATCH 079/113] fixed 36dd088 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 bpf: Make 32->64 bounds propagation slightly more robust stable inclusion from stable-v5.10.88 commit 279e0bf80d95184666c9d41361b1625c045d1dcb category: bugfix issue: #I4U9Y8 CVE: CVE-2021-45402 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=279e0bf80d95184666c9d41361b1625c045d1dcb Signed-off-by: Yu Changchun -------------------------------- commit e572ff80f05c33cd0cb4860f864f5c9c044280b6 upstream. Make the bounds propagation in __reg_assign_32_into_64() slightly more robust and readable by aligning it similarly as we did back in the __reg_combine_64_into_32() counterpart. Meaning, only propagate or pessimize them as a smin/smax pair. 
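A standalone C sketch of the pairing rule described above, with invented variable names standing in for the bpf_reg_state fields: the 32-bit signed bounds are carried into the 64-bit signed bounds only together, and only when both are non-negative; otherwise the bounds fall back to the zero-extended range [0, U32_MAX].

    #include <stdint.h>
    #include <stdio.h>

    static int fits_nonneg_s64(int32_t a)
    {
        return a >= 0;  /* an s32 can never exceed S32_MAX, so the upper bound holds implicitly */
    }

    int main(void)
    {
        int32_t s32_min = -1, s32_max = -1;     /* e.g. a constant 0xffffffff subregister */
        int64_t smin, smax;

        if (fits_nonneg_s64(s32_min) && fits_nonneg_s64(s32_max)) {
            smin = s32_min;                     /* propagate the pair unchanged */
            smax = s32_max;
        } else {
            smin = 0;                           /* pessimize: after zero extension the   */
            smax = UINT32_MAX;                  /* 64-bit value lies somewhere in [0, U32_MAX] */
        }
        printf("smin=%lld smax=%lld\n", (long long)smin, (long long)smax);
        return 0;
    }

Run with the example bounds it prints smin=0 smax=4294967295, roughly the pessimized pair the verifier starts from before any tnum-based refinement.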
Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ced1f02c43f9..cc75dbc243ff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1249,22 +1249,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool __reg32_bound_s64(s32 a) +{ + return a >= 0 && a <= S32_MAX; +} + static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { reg->umin_value = reg->u32_min_value; reg->umax_value = reg->u32_max_value; - /* Attempt to pull 32-bit signed bounds into 64-bit bounds - * but must be positive otherwise set to worse case bounds - * and refine later from tnum. + + /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must + * be positive otherwise set to worse case bounds and refine later + * from tnum. */ - if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) - reg->smax_value = reg->s32_max_value; - else - reg->smax_value = U32_MAX; - if (reg->s32_min_value >= 0) + if (__reg32_bound_s64(reg->s32_min_value) && + __reg32_bound_s64(reg->s32_max_value)) { reg->smin_value = reg->s32_min_value; - else + reg->smax_value = reg->s32_max_value; + } else { reg->smin_value = 0; + reg->smax_value = U32_MAX; + } } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -- Gitee From d2bf4ff552fe1f74c7eef0e5d1517f1b65df2f90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 19:33:42 +0800 Subject: [PATCH 080/113] fixed 25f69b7 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 bpf: Fix signed bounds propagation after mov32 stable inclusion from stable-v5.10.88 commit e2aad0b5f2cbf71a31d00ce7bb4dee948adff5a9 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-45402 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e2aad0b5f2cbf71a31d00ce7bb4dee948adff5a9 Signed-off-by: Yu Changchun -------------------------------- commit 3cf2b61eb06765e27fec6799292d9fb46d0b7e60 upstream. For the case where both s32_{min,max}_value bounds are positive, the __reg_assign_32_into_64() directly propagates them to their 64 bit counterparts, otherwise it pessimises them into [0,u32_max] universe and tries to refine them later on by learning through the tnum as per comment in mentioned function. However, that does not always happen, for example, in mov32 operation we call zext_32_to_64(dst_reg) which invokes the __reg_assign_32_into_64() as is without subsequent bounds update as elsewhere thus no refinement based on tnum takes place. 
Thus, not calling into the __update_reg_bounds() / __reg_deduce_bounds() / __reg_bound_offset() triplet as we do, for example, in case of ALU ops via adjust_scalar_min_max_vals(), will lead to more pessimistic bounds when dumping the full register state: Before fix: 0: (b4) w0 = -1 1: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) 1: (bc) w0 = w0 2: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=0,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) Technically, the smin_value=0 and smax_value=4294967295 bounds are not incorrect, but given the register is still a constant, they break assumptions about const scalars that smin_value == smax_value and umin_value == umax_value. After fix: 0: (b4) w0 = -1 1: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) 1: (bc) w0 = w0 2: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) Without the smin_value == smax_value and umin_value == umax_value invariant being intact for const scalars, it is possible to leak out kernel pointers from unprivileged user space if the latter is enabled. For example, when such registers are involved in pointer arithmtics, then adjust_ptr_min_max_vals() will taint the destination register into an unknown scalar, and the latter can be exported and stored e.g. into a BPF map value. 
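The constant-scalar invariant referred to above can be written down in a couple of lines; the struct here is only a stand-in for the relevant bpf_reg_state fields, not the kernel definition:

    #include <stdbool.h>
    #include <stdint.h>

    /* Stand-in for the signed/unsigned bounds tracked per register. */
    struct reg_bounds {
        int64_t  smin_value, smax_value;
        uint64_t umin_value, umax_value;
    };

    /* A register holding a known constant must have collapsed bounds in both
     * domains; the mov32 path described above produced smin=0, smax=U32_MAX
     * for a constant and so violated this. */
    static bool is_const_scalar(const struct reg_bounds *r)
    {
        return r->smin_value == r->smax_value &&
               r->umin_value == r->umax_value;
    }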
Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Kuee K1r0a Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc75dbc243ff..48ebed09d466 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7131,6 +7131,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->dst_reg); } zext_32_to_64(dst_reg); + + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); } } else { /* case: R = imm -- Gitee From 65886e60f428c71dbbad17ef4b358366b23ae1c1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Mar 2022 12:19:55 +0800 Subject: [PATCH 081/113] fixed e0d63c7 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 f2fs: fix to do sanity check in is_alive() mainline inclusion from mainline-v5.16-rc1 commit 77900c45ee5cd5da63bd4d818a41dbdf367e81cd category: bugfix issue: #I4U9Y8 CVE: CVE-2021-44879 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=77900c45ee5cd5da63bd4d818a41dbdf367e81cd Signed-off-by: Yu Changchun -------------------------------- In fuzzed image, SSA table may indicate that a data block belongs to invalid node, which node ID is out-of-range (0, 1, 2 or max_nid), in order to avoid migrating inconsistent data in such corrupted image, let's do sanity check anyway before data block migration. Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Guo Xuenan Reviewed-by: Xiu Jianfeng Reviewed-by: fang wei Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/f2fs/gc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a981e466cc7d..9c621a9007da 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1016,6 +1016,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) + return false; + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); -- Gitee From 21cf796d628d4a9eeba88c09a8e464954af2dffd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Mar 2022 12:19:54 +0800 Subject: [PATCH 082/113] fixed 656494a from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 f2fs: fix to avoid panic in is_alive() if metadata is inconsistent mainline inclusion from mainline-v5.16-rc1 commit f6db43076d190d9bf75559dec28e18b9d12e4ce5 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-44879 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f6db43076d190d9bf75559dec28e18b9d12e4ce5 Signed-off-by: Yu Changchun -------------------------------- As report by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 If we enable CONFIG_F2FS_CHECK_FS config, and with fuzzed image attached in above link, we will encounter panic when executing below script: 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp F2FS-fs (loop11): mismatched blkaddr 5765 (source_blkaddr 1) in seg 3 kernel BUG at fs/f2fs/gc.c:1042! 
do_garbage_collect+0x90f/0xa80 [f2fs] f2fs_gc+0x294/0x12a0 [f2fs] f2fs_balance_fs+0x2c5/0x7d0 [f2fs] f2fs_create+0x239/0xd90 [f2fs] lookup_open+0x45e/0xa90 open_last_lookups+0x203/0x670 path_openat+0xae/0x490 do_filp_open+0xbc/0x160 do_sys_openat2+0x2f1/0x500 do_sys_open+0x5e/0xa0 __x64_sys_openat+0x28/0x40 Previously, f2fs tries to catch data inconcistency exception in between SSA and SIT table during GC, however once the exception is caught, it will call f2fs_bug_on to hang kernel, it's not needed, instead, let's set SBI_NEED_FSCK flag and skip migrating current block. Fixes: bbf9f7d90f21 ("f2fs: Fix indefinite loop in f2fs_gc()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Guo Xuenan Reviewed-by: Xiu Jianfeng Reviewed-by: fang wei Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9c621a9007da..a5ee52415c5b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1032,7 +1032,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif -- Gitee From 1a593fbdaa4ba6142b9da3bd40c5fe86ad716352 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Mar 2022 12:19:53 +0800 Subject: [PATCH 083/113] fixed 9410ad3 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 f2fs: fix to do sanity check on inode type during garbage collection mainline inclusion from mainline-v5.16-rc1 commit 9056d6489f5a41cfbb67f719d2c0ce61ead72d9f category: bugfix issue: #I4U9Y8 CVE: CVE-2021-44879 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9056d6489f5a41cfbb67f719d2c0ce61ead72d9f Signed-off-by: Yu Changchun -------------------------------- As report by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 - Overview kernel NULL pointer dereference triggered in folio_mark_dirty() when mount and operate on a crafted f2fs image - Reproduce tested on kernel 5.16-rc3, 5.15.X under root 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp 4. cp tmp mnt F2FS-fs (loop0): sanity_check_inode: inode (ino=49) extent info [5942, 4294180864, 4] is incorrect, run fsck to fix F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=31340049, run fsck to fix. BUG: kernel NULL pointer dereference, address: 0000000000000000 folio_mark_dirty+0x33/0x50 move_data_page+0x2dd/0x460 [f2fs] do_garbage_collect+0xc18/0x16a0 [f2fs] f2fs_gc+0x1d3/0xd90 [f2fs] f2fs_balance_fs+0x13a/0x570 [f2fs] f2fs_create+0x285/0x840 [f2fs] path_openat+0xe6d/0x1040 do_filp_open+0xc5/0x140 do_sys_openat2+0x23a/0x310 do_sys_open+0x57/0x80 The root cause is for special file: e.g. character, block, fifo or socket file, f2fs doesn't assign address space operations pointer array for mapping->a_ops field, so, in a fuzzed image, SSA table indicates a data block belong to special file, when f2fs tries to migrate that block, it causes NULL pointer access once move_data_page() calls a_ops->set_dirty_page(). 
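For reference, the special_file() test used by the fix is essentially a plain mode check; a userspace-style equivalent using the standard S_IS* macros would be:

    #include <sys/stat.h>

    /* Character, block, FIFO and socket inodes never get regular
     * address_space operations, so the GC path must not treat their
     * blocks as migratable data. */
    static int is_special_mode(mode_t mode)
    {
        return S_ISCHR(mode) || S_ISBLK(mode) ||
               S_ISFIFO(mode) || S_ISSOCK(mode);
    }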
Cc: stable@vger.kernel.org Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Guo Xuenan Reviewed-by: Xiu Jianfeng Reviewed-by: fang wei Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a5ee52415c5b..d87861e8a064 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1450,7 +1450,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) { + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } -- Gitee From a92940110895ae682a150a2e14bda9c0196baed3 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Tue, 22 Feb 2022 22:13:10 +0800 Subject: [PATCH 084/113] fixed 56d5b06 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 yam: fix a memory leak in yam_siocdevprivate() mainline inclusion from mainline-v5.17-rc2 commit 29eb31542787e1019208a2e1047bb7c76c069536 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-24959 Signed-off-by: Yu Changchun ------------------------------------------------- ym needs to be free when ym->cmd != SIOCYAMSMCS. Fixes: 0781168e23a2 ("yam: fix a missing-check bug") Signed-off-by: Hangyu Hua Signed-off-by: David S. Miller conflict: The bug is in function yam_siocdevprivate() in mainline, but it is in function yam_ioctl() because the function name is changed in 25ec92fbdd("hamradio: use ndo_siocdevprivate") in mainline. Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/hamradio/yam.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 5ab53e9942f3..5d30b3e1806a 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -951,9 +951,7 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(struct yamdrv_ioctl_mcs)); if (IS_ERR(ym)) return PTR_ERR(ym); - if (ym->cmd != SIOCYAMSMCS) - return -EINVAL; - if (ym->bitrate > YAM_MAXBITRATE) { + if (ym->cmd != SIOCYAMSMCS || ym->bitrate > YAM_MAXBITRATE) { kfree(ym); return -EINVAL; } -- Gitee From 662d231a952ab803ab55deece5778ca697b2260c Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 17 Feb 2022 14:59:04 +0800 Subject: [PATCH 085/113] fixed e252d54 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/93 tipc: improve size validations for received domain records mainline inclusion from mainline-v5.17-rc4 commit 9aa422ad326634b76309e8ff342c246800621216 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0435 Reference: https://github.com/torvalds/linux/commit/9aa422ad326634b76309e8ff342c246800621216 Signed-off-by: Yu Changchun -------------------------------- The function tipc_mon_rcv() allows a node to receive and process domain_record structs from peer nodes to track their views of the network topology. This patch verifies that the number of members in a received domain record does not exceed the limit defined by MAX_MON_DOMAIN, something that may otherwise lead to a stack overflow. tipc_mon_rcv() is called from the function tipc_link_proto_rcv(), where we are reading a 32 bit message data length field into a uint16. To avert any risk of bit overflow, we add an extra sanity check for this in that function. 
We cannot see that happen with the current code, but future designers being unaware of this risk, may introduce it by allowing delivery of very large (> 64k) sk buffers from the bearer layer. This potential problem was identified by Eric Dumazet. This fixes CVE-2022-0435 Reported-by: Samuel Page Reported-by: Eric Dumazet Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Signed-off-by: Jon Maloy Reviewed-by: Xin Long Reviewed-by: Samuel Page Reviewed-by: Eric Dumazet Signed-off-by: Linus Torvalds Signed-off-by: Zhengchao Shao Reviewed-by: Yongjun Wei Reviewed-by: Yue Haibing Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- net/tipc/link.c | 9 +++++++-- net/tipc/monitor.c | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index c92e6984933c..ca73873ff4a5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2156,7 +2156,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; bool reply = msg_probe(hdr), retransmitted = false; - u16 dlen = msg_data_sz(hdr), glen = 0; + u32 dlen = msg_data_sz(hdr), glen = 0; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); @@ -2170,6 +2170,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, void *data; trace_tipc_proto_rcv(skb, false, l->name); + + if (dlen > U16_MAX) + goto exit; + if (tipc_link_is_blocked(l) || !xmitq) goto exit; @@ -2265,7 +2269,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Receive Gap ACK blocks from peer if any */ glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); - + if(glen > dlen) + break; tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 6dce2abf436e..a37190da5a50 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -465,6 +465,8 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, state->probing = false; /* Sanity check received domain record */ + if (new_member_cnt > MAX_MON_DOMAIN) + return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) -- Gitee From d62ac3c375dab44e19c77646fa56e370269345e7 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 21 Feb 2022 11:03:13 +0100 Subject: [PATCH 086/113] fixed 5a30307 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/96 lib/iov_iter: initialize "flags" in new pipe_buffer stable inclusion from stable-5.10.102 commit b19ec7afa9297d862ed86443e0164643b97250ab category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0847 Signed-off-by: Yu Changchun --------------------------------------- commit 9d2231c5d74e13b2a0546fee6737ee4446017903 upstream. The functions copy_page_to_iter_pipe() and push_pipe() can both allocate a new pipe_buffer, but the "flags" member initializer is missing. 
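The defect class is easy to reproduce outside the kernel: an object recycled from a pool silently inherits whatever a previous user left in any field that is not re-initialized. A short sketch with invented names, plus the designated-initializer idiom that rules the problem out:

    struct recycled_buf {
        void *page;
        unsigned int offset;
        unsigned int len;
        unsigned int flags;     /* stale unless explicitly reset */
    };

    static void reuse_slot(struct recycled_buf *buf, void *page, unsigned int len)
    {
        /* A compound literal zeroes every member not named, including
         * flags, so no state can leak from the slot's previous user. */
        *buf = (struct recycled_buf){ .page = page, .len = len };
    }

The hunks below take the narrower route of assigning buf->flags = 0 explicitly, which keeps the stable backport minimal.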
Fixes: 241699cd72a8 ("new iov_iter flavour: pipe-backed") To: Alexander Viro To: linux-fsdevel@vger.kernel.org To: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yu Changchun --- lib/iov_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 537bfdc8cd09..1b05d2896ceb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -407,6 +407,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by return 0; buf->ops = &page_cache_pipe_buf_ops; + buf->flags = 0; get_page(page); buf->page = page; buf->offset = offset; @@ -543,6 +544,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size, break; buf->ops = &default_pipe_buf_ops; + buf->flags = 0; buf->page = page; buf->offset = 0; buf->len = min_t(ssize_t, left, PAGE_SIZE); -- Gitee From f48b051c89ebbf86cee9ca4db885f851ad434012 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 2 Mar 2022 18:25:34 +0800 Subject: [PATCH 087/113] fixed 0d4583e from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/96 netfilter: nf_tables_offload: incorrect flow offload action array size mainline inclusion from mainline-v5.17-rc6 commit b1a5983f56e371046dcf164f90bfaf704d2b89f6 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-25636 Signed-off-by: Yu Changchun -------------------------------- immediate verdict expression needs to allocate one slot in the flow offload action array, however, immediate data expression does not need to do so. fwd and dup expression need to allocate one slot, this is missing. Add a new offload_action interface to report if this expression needs to allocate one slot in the flow offload action array. Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso conficts: net/netfilter/nft_fwd_netdev.c include/net/netfilter/nf_tables.h Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- net/netfilter/nf_tables_offload.c | 3 ++- net/netfilter/nft_dup_netdev.c | 6 ++++++ net/netfilter/nft_fwd_netdev.c | 6 ++++++ net/netfilter/nft_immediate.c | 12 +++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ed4a9d098164..76bfb6cd5815 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -825,7 +825,7 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); - u32 offload_flags; + bool (*offload_action)(const struct nft_expr *expr); const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 434a6158852f..7a453a35a41d 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index e5fcbb0e4b8e..839fd09f1bb4 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c 
@@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 40788b3f1071..70c457476b87 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -75,6 +80,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index b77985986b24..3b0dcd170551 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -77,6 +77,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { enum nft_registers sreg_dev:8; enum nft_registers sreg_addr:8; @@ -219,6 +224,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c63eb3b17178..5c9d88560a47 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -213,6 +213,16 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -224,7 +234,7 @@ static const struct nft_expr_ops nft_imm_ops = { .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, - .offload_flags = NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { -- Gitee From 7a2a3f9be3f1896cf7986f4b74e8156ab78045ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Feb 2022 16:53:38 +0800 Subject: [PATCH 088/113] fixed 4e12265 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/96 udf: Restore i_lenAlloc when inode expansion fails mainline inclusion from mainline-v5.17-rc2 commit ea8569194b43f0f01f0a84c689388542c7254a1f category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0617 Signed-off-by: Yu Changchun -------------------------------- When we fail to expand inode from inline format to a normal format, we restore inode to contain the original inline formatting but we forgot to set i_lenAlloc back. 
The mismatch between i_lenAlloc and i_size was then causing further problems such as warnings and lost data down the line. Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 7e49b6f2480c ("udf: Convert UDF to new truncate calling sequence") Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Zhang Wensheng Reviewed-by: Zhang Yi Reviewed-by: Xiu Jianfeng Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/udf/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0dd2f93ac048..b09a433de493 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -316,6 +316,7 @@ int udf_expand_file_adinicb(struct inode *inode) unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } put_page(page); -- Gitee From 283c70336e769fb03c5577e89389957d525ccfed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Feb 2022 16:53:37 +0800 Subject: [PATCH 089/113] fixed ee2b628 from https://gitee.com/linux_anio/kernel_linux_5.10/pulls/96 udf: Fix NULL ptr deref when converting from inline format mainline inclusion from mainline-v5.17-rc2 commit 7fc3b7c2981bbd1047916ade327beccb90994eee category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0617 Signed-off-by: Yu Changchun ----------------------------------------------- udf_expand_file_adinicb() calls directly ->writepage to write data expanded into a page. This however misses to setup inode for writeback properly and so we can crash on inode->i_wb dereference when submitting page for IO like: BUG: kernel NULL pointer dereference, address: 0000000000000158 #PF: supervisor read access in kernel mode ... __folio_start_writeback+0x2ac/0x350 __block_write_full_page+0x37d/0x490 udf_expand_file_adinicb+0x255/0x400 [udf] udf_file_write_iter+0xbe/0x1b0 [udf] new_sync_write+0x125/0x1c0 vfs_write+0x28e/0x400 Fix the problem by marking the page dirty and going through the standard writeback path to write the page. Strictly speaking we would not even have to write the page but we want to catch e.g. ENOSPC errors early. Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Zhang Wensheng Reviewed-by: Zhang Yi Reviewed-by: Xiu Jianfeng Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/udf/inode.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b09a433de493..d32b836f6ca7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -257,10 +257,6 @@ int udf_expand_file_adinicb(struct inode *inode) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; - struct writeback_control udf_wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - }; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { @@ -304,8 +300,10 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + set_page_dirty(page); + unlock_page(page); up_write(&iinfo->i_data_sem); - err = inode->i_data.a_ops->writepage(page, &udf_wbc); + err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... 
*/ lock_page(page); -- Gitee From 7957d2df9a579c0217fc47a5fe359bdab882483b Mon Sep 17 00:00:00 2001 From: Hu Zhaodong Date: Tue, 8 Mar 2022 19:34:37 +0800 Subject: [PATCH 090/113] sched: fair: fix WALT load statistic for cfs_bandwidth enable ohos inclusion category: bugfix issue: #I4WP5B CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- While enqueueing a sched_entity to a cfs_rq, walt_inc_cfs_rq_stats should be called alongside cfs_rq->h_nr_running++ regardless of whether the cfs_rq was throttled, so that the cfs_rq perceives the attached task. The same applies to the dequeue routine. Signed-off-by: Li Ming --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c0781ce163f..dafc7d8d9c8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4984,7 +4984,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); - walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); + walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq); unthrottle_throttle: /* @@ -5616,13 +5616,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; - walt_inc_cfs_rq_stats(cfs_rq, p); - flags = ENQUEUE_WAKEUP; } @@ -5713,13 +5712,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; - walt_dec_cfs_rq_stats(cfs_rq, p); - /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ -- Gitee From 796aa0fafdc172fa1d9291802888410b8deff91e Mon Sep 17 00:00:00 2001 From: Lingutla Chandrasekhar Date: Fri, 11 Jan 2019 12:32:05 +0530 Subject: [PATCH 091/113] sched: core: Fix stale rq clock usage in migration path codeaurora inclusion category: bugfix issue: #I4WP5B CVE: NA Signed-off-by: Hu Zhaodong ------------------------------------------- While migrating a task, move_queued_task() updates the current CPU's rq clock (which sets RQCF_UPDATED) with rq->lock held, then momentarily releases the rq->lock and reacquires it along with the new CPU's rq->lock. In between, if any other CPU takes the current rq->lock, which might have called rq_pin_lock() (which clears RQCF_UPDATED) and released the lock without updating the rq clock, then the rq's clock_update_flags becomes stale until rq_pin_lock() is called again. If the migration path then tries to report load to the cpufreq governor, it accesses the stale rq_clock, and assert_clock_updated reports a warning with the call stack below: detach_entity_cfs_rq+0x71c/0x780 migrate_task_rq_fair+0x50/0xd0 set_task_cpu+0x150/0x238 move_queued_task+0x1b4/0x3e8 migration_cpu_stop+0x188/0x1f0 cpu_stopper_thread+0xac/0x150 smpboot_thread_fn+0x1c4/0x2e8 Also, as commit '2463f46361a02d ("sched: Fix assert_clock_updated warning emitted during CPU isolation")' mentioned, this warning could lead to a deadlock when the console is enabled. To fix this, force an rq clock update while reacquiring the CPU's rq->lock if RQCF_UPDATED is not set.
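The underlying rule is generic: state cached under a lock must be revalidated once the lock has been dropped and retaken. A compact illustration in plain C, with pthread mutexes and made-up field names standing in for the scheduler internals:

    #include <pthread.h>

    #define CLOCK_UPDATED   0x1u

    struct rq_like {
        pthread_mutex_t lock;
        unsigned int clock_update_flags;
        unsigned long long clock;
    };

    static void refresh_clock(struct rq_like *rq)
    {
        rq->clock++;                            /* placeholder for reading the real clock */
        rq->clock_update_flags |= CLOCK_UPDATED;
    }

    /* Drop rq->lock, take both locks again, then repair the clock state if
     * another thread pinned/unpinned the lock in the window and cleared the flag. */
    static void relock_both(struct rq_like *rq, struct rq_like *new_rq)
    {
        pthread_mutex_unlock(&rq->lock);
        pthread_mutex_lock(&new_rq->lock);
        pthread_mutex_lock(&rq->lock);
        if (!(rq->clock_update_flags & CLOCK_UPDATED))
            refresh_clock(rq);
    }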
Signed-off-by: Lingutla Chandrasekhar Signed-off-by: Li Ming --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 471b2129ea84..46a0df7d1047 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1751,6 +1751,8 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, deactivate_task(rq, p, DEQUEUE_NOCLOCK); #ifdef CONFIG_SCHED_WALT double_lock_balance(rq, cpu_rq(new_cpu)); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); #endif set_task_cpu(p, new_cpu); #ifdef CONFIG_SCHED_WALT -- Gitee From 600373a6e03012d674d34a35af7dc28a7c4c281a Mon Sep 17 00:00:00 2001 From: Dai Li Date: Wed, 9 Mar 2022 19:45:52 +0800 Subject: [PATCH 092/113] sched: Fix some bugs for frame RTG ohos inclusion category: bugfix issue: #I4VYOH CVE: NA -------------------------------- Fix following bugs: 1. frame_time re-equaled 2. get_enable function can't reach its' code in disable scene 3. should allow add normal task for Frame rtgs Signed-off-by: Dai Li --- kernel/sched/rtg/frame_rtg.c | 4 +--- kernel/sched/rtg/rtg.c | 2 -- kernel/sched/rtg/rtg_ctrl.c | 15 +++------------ kernel/sched/rtg/rtg_ctrl.h | 4 ---- 4 files changed, 4 insertions(+), 21 deletions(-) diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c index 89561c84774e..686705e91cff 100644 --- a/kernel/sched/rtg/frame_rtg.c +++ b/kernel/sched/rtg/frame_rtg.c @@ -182,7 +182,7 @@ int set_frame_rate(struct frame_info *frame_info, int rate) return -EINVAL; frame_info->frame_rate = (unsigned int)rate; - frame_info->frame_time = frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); + frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); frame_info->max_vload_time = div_u64(frame_info->frame_time, NSEC_PER_MSEC) + frame_info->vload_margin; @@ -419,8 +419,6 @@ static int do_set_rtg_sched(struct task_struct *task, bool is_rtg, } err = sched_set_group_id(task, grpid); if (err < 0) { - pr_err("[FRAME_RTG]: %s task:%d set_group_id err:%d\n", - __func__, task->pid, err); if (is_rtg) { policy = SCHED_NORMAL; sp.sched_priority = 0; diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c index 91e2c6abfa4e..8db694d22f2c 100644 --- a/kernel/sched/rtg/rtg.c +++ b/kernel/sched/rtg/rtg.c @@ -313,8 +313,6 @@ static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) remove_task_from_group(p); } else { #endif - pr_err("%s[%d] switching group from %d to %d failed.\n", - p->comm, p->pid, old_grp->id, group_id); rc = -EINVAL; goto done; #ifdef CONFIG_SCHED_RTG_CGROUP diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c index 0e87dc7162c4..923ed654de6c 100644 --- a/kernel/sched/rtg/rtg_ctrl.c +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -15,7 +15,6 @@ #include atomic_t g_rtg_enable = ATOMIC_INIT(0); -atomic_t g_enable_type = ATOMIC_INIT(ALL_ENABLE); // default: all enable static atomic_t g_rt_frame_num = ATOMIC_INIT(0); static int g_frame_max_util = DEFAULT_MAX_UTIL; static int g_max_rt_frames = DEFAULT_MAX_RT_FRAME; @@ -55,11 +54,6 @@ static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { static int init_proc_state(const int *config, int len); static void deinit_proc_state(void); -int get_enable_type(void) -{ - return atomic_read(&g_enable_type); -} - static int set_enable_config(char *config_str) { char *p = NULL; @@ -86,8 +80,6 @@ static int set_enable_config(char *config_str) config[RTG_FRAME_MAX_UTIL] = value; else if (!strcmp(tmp, "invalid_interval")) config[RTG_INVALID_INTERVAL] = value; - else if (!strcmp(tmp, 
"enable_type")) - atomic_set(&g_enable_type, value); else continue; } @@ -242,7 +234,7 @@ static long ctrl_set_enable(int abi, void __user *uarg) static long ctrl_get_enable(int abi, void __user *uarg) { - return get_enable_type(); + return atomic_read(&g_rtg_enable); } static int parse_config(const struct rtg_str_data *rs_data) @@ -713,12 +705,11 @@ static int parse_add_rtg_thread(const struct rtg_grp_data *rs_data) return -INVALID_RTG_ID; } if (frame_info->thread_num + add_num > MAX_TID_NUM) { - pr_err("[SCHED_RTG] frame info thread up to max already.\n"); write_unlock(&frame_info->lock); return -INVALID_RTG_ID; } add_index = frame_info->thread_num; - prio = frame_info->prio; + prio = (proc_info.type == NORMAL_TASK) ? NOT_RT_PRIO : frame_info->prio; for (i = 0; i < add_num; i++) { frame_info->thread[add_index] = update_frame_thread(frame_info, prio, prio, rs_data->tids[i], @@ -902,7 +893,7 @@ static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsi return -INVALID_MAGIC; } - if ((func_id != SET_ENABLE) && !atomic_read(&g_rtg_enable)) { + if (!atomic_read(&g_rtg_enable) && (func_id != SET_ENABLE) && (func_id != GET_ENABLE)) { pr_err("[SCHED_RTG] CMD_ID %x error: Rtg not enabled yet.\n", cmd); return -RTG_DISABLED; } diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h index df8ac420d156..8993700048be 100644 --- a/kernel/sched/rtg/rtg_ctrl.h +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -67,10 +67,6 @@ enum rtg_err_no { INVALID_RTG_ID, NO_RT_FRAME, }; -enum enable_type { - ALL_ENABLE = 1, - ENABLE_MAX -}; struct rtg_grp_data { int rtg_cmd; -- Gitee From 019d19478459e86581b020617c90e219b55cf741 Mon Sep 17 00:00:00 2001 From: CY Fan Date: Fri, 4 Mar 2022 12:54:14 +0800 Subject: [PATCH 093/113] zswapd: fix some size limitations ohos inclusion category: bugfix issue: #I4W8T3 CVE: NA ------------------------------------------- Signed-off-by: CY Fan --- mm/zswapd_control.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c index d91a08b5ae2f..ab5ca24b3006 100644 --- a/mm/zswapd_control.c +++ b/mm/zswapd_control.c @@ -28,6 +28,8 @@ #define ZRAM_WM_RATIO 0 #define MAX_RATIO 100 +#define CHECK_BUFFER_VALID(var1, var2) (((var2) != 0) && ((var1) > (var2))) + struct zswapd_param { unsigned int min_score; unsigned int max_score; @@ -148,6 +150,11 @@ static ssize_t avail_buffers_params_write(struct kernfs_open_file *of, if (sscanf(buf, "%u %u %u %llu", &buffers, &min_buffers, &high_buffers, &threshold) != 4) return -EINVAL; + if (CHECK_BUFFER_VALID(min_buffers, buffers) || + CHECK_BUFFER_VALID(min_buffers, high_buffers) || + CHECK_BUFFER_VALID(buffers, high_buffers)) + return -EINVAL; + atomic_set(&avail_buffers, buffers); atomic_set(&min_avail_buffers, min_buffers); atomic_set(&high_avail_buffers, high_buffers); @@ -410,7 +417,8 @@ static ssize_t zswapd_single_memcg_param_write(struct kernfs_open_file *of, &refault_threshold) != 3) return -EINVAL; - if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO) + if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO || + refault_threshold > MAX_RATIO) return -EINVAL; atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, -- Gitee From b94174593cdeed4628d5012f0f8b609f4efeb2bb Mon Sep 17 00:00:00 2001 From: CY Fan Date: Tue, 8 Mar 2022 20:40:07 +0800 Subject: [PATCH 094/113] memcg: fix memcg use case timeout issue ohos inclusion category: bugfix issue: #I4UC37 CVE: NA ----------------- This patch replaces the score_list_lock with the 
rwlock and skips the non-target memcg Signed-off-by: CY Fan --- include/linux/memcg_policy.h | 2 +- mm/memcg_control.c | 14 +++++++------- mm/memcg_reclaim.c | 25 ++++++++++++++++++++++--- mm/memcontrol.c | 4 ++-- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/include/linux/memcg_policy.h b/include/linux/memcg_policy.h index 201b0e973e3c..4aec2a1bb3ec 100644 --- a/include/linux/memcg_policy.h +++ b/include/linux/memcg_policy.h @@ -15,7 +15,7 @@ struct scan_control; extern struct list_head score_head; extern bool score_head_inited; -extern spinlock_t score_list_lock; +extern rwlock_t score_list_lock; extern struct cgroup_subsys memory_cgrp_subsys; #ifdef CONFIG_HYPERHOLD_FILE_LRU void shrink_anon_memcg(struct pglist_data *pgdat, diff --git a/mm/memcg_control.c b/mm/memcg_control.c index 985fcaa66943..dd62304a9c68 100644 --- a/mm/memcg_control.c +++ b/mm/memcg_control.c @@ -17,7 +17,7 @@ struct list_head score_head; bool score_head_inited; -DEFINE_SPINLOCK(score_list_lock); +DEFINE_RWLOCK(score_list_lock); DEFINE_MUTEX(reclaim_para_lock); /** @@ -40,7 +40,7 @@ struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) if (unlikely(!score_head_inited)) return NULL; - spin_lock_irqsave(&score_list_lock, flags); + read_lock_irqsave(&score_list_lock, flags); if (unlikely(!prev)) pos = &score_head; @@ -60,7 +60,7 @@ struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) memcg = NULL; unlock: - spin_unlock_irqrestore(&score_list_lock, flags); + read_unlock_irqrestore(&score_list_lock, flags); if (prev) css_put(&prev->css); @@ -83,7 +83,7 @@ struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) if (unlikely(!score_head_inited)) return NULL; - spin_lock_irqsave(&score_list_lock, flags); + read_lock_irqsave(&score_list_lock, flags); if (unlikely(!next)) pos = &score_head; @@ -106,7 +106,7 @@ struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) memcg = NULL; unlock: - spin_unlock_irqrestore(&score_list_lock, flags); + read_unlock_irqrestore(&score_list_lock, flags); if (next) css_put(&next->css); @@ -125,7 +125,7 @@ void memcg_app_score_update(struct mem_cgroup *target) struct list_head *tmp; unsigned long flags; - spin_lock_irqsave(&score_list_lock, flags); + write_lock_irqsave(&score_list_lock, flags); list_for_each_prev_safe(pos, tmp, &score_head) { struct mem_cgroup *memcg = list_entry(pos, struct mem_cgroup, score_node); @@ -134,7 +134,7 @@ void memcg_app_score_update(struct mem_cgroup *target) break; } list_move_tail(&target->score_node, pos); - spin_unlock_irqrestore(&score_list_lock, flags); + write_unlock_irqrestore(&score_list_lock, flags); } static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, diff --git a/mm/memcg_reclaim.c b/mm/memcg_reclaim.c index f88826c13ae2..74c3d4dfa08b 100644 --- a/mm/memcg_reclaim.c +++ b/mm/memcg_reclaim.c @@ -215,6 +215,18 @@ void shrink_anon_memcg(struct pglist_data *pgdat, sc->nr_reclaimed_anon += nr_reclaimed; } +static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg) +{ + while (!mem_cgroup_is_root(mcg)) { + if (mcg == tmcg) + break; + + mcg = parent_mem_cgroup(mcg); + } + + return (mcg == tmcg); +} + static void shrink_anon(struct pglist_data *pgdat, struct scan_control *sc, unsigned long *nr) { @@ -229,7 +241,12 @@ static void shrink_anon(struct pglist_data *pgdat, node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES); while ((memcg = get_next_memcg(memcg))) { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lruvec *lruvec = NULL; + + if 
(!memcg_is_child_of(memcg, target_memcg)) + continue; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; @@ -438,8 +455,10 @@ bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc) get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages); - /* Shrink the Total-File-LRU */ - shrink_file(pgdat, sc, nr); + if (!cgroup_reclaim(sc)) { + /* Shrink the Total-File-LRU */ + shrink_file(pgdat, sc, nr); + } /* Shrink Anon by iterating score_list */ shrink_anon(pgdat, sc, nr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30e068e95e21..b1d67bcfb617 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5496,9 +5496,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) #ifdef CONFIG_HYPERHOLD_MEMCG unsigned long flags; - spin_lock_irqsave(&score_list_lock, flags); + write_lock_irqsave(&score_list_lock, flags); list_del_init(&memcg->score_node); - spin_unlock_irqrestore(&score_list_lock, flags); + write_unlock_irqrestore(&score_list_lock, flags); css_put(css); #endif -- Gitee From eaca764503fae9a2a7e31971811a8ed15be1e4aa Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:16 +0800 Subject: [PATCH 095/113] bpf: Introduce composable reg, ret and arg types. mainline inclusion from mainline-v5.17-rc1 commit d639b9d13a39cf15639cbe6e8b2c43eb60148a73 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d639b9d13a39cf15639cbe6e8b2c43eb60148a73 Signed-off-by: Yu Changchun -------------------------------- There are some common properties shared between bpf reg, ret and arg values. For instance, a value may be a NULL pointer, or a pointer to a read-only memory. Previously, to express these properties, enumeration was used. For example, in order to test whether a reg value can be NULL, reg_type_may_be_null() simply enumerates all types that are possibly NULL. The problem of this approach is that it's not scalable and causes a lot of duplication. These properties can be combined, for example, a type could be either MAYBE_NULL or RDONLY, or both. This patch series rewrites the layout of reg_type, arg_type and ret_type, so that common properties can be extracted and represented as composable flag. For example, one can write ARG_PTR_TO_MEM | PTR_MAYBE_NULL which is equivalent to the previous ARG_PTR_TO_MEM_OR_NULL The type ARG_PTR_TO_MEM are called "base type" in this patch. Base types can be extended with flags. A flag occupies the higher bits while base types sits in the lower bits. This patch in particular sets up a set of macro for this purpose. The following patches will rewrite arg_types, ret_types and reg_types respectively. 
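The layout is easy to demonstrate outside the verifier: the base type lives in the low BPF_BASE_TYPE_BITS bits, flags live above them, and a composed value splits back apart with two masks. The constants and enum values below are invented for the demonstration; the real definitions (BPF_BASE_TYPE_BITS, base_type(), type_flag()) appear in the diffs that follow.

    #include <stdio.h>

    #define BASE_TYPE_BITS  8
    #define BASE_TYPE_MASK  ((1u << BASE_TYPE_BITS) - 1)

    enum demo_base_type { DEMO_PTR_TO_MEM = 7 };                        /* low bits  */
    enum demo_type_flag { DEMO_PTR_MAYBE_NULL = 1u << BASE_TYPE_BITS }; /* high bits */

    static unsigned int demo_base(unsigned int t)  { return t & BASE_TYPE_MASK; }
    static unsigned int demo_flags(unsigned int t) { return t & ~BASE_TYPE_MASK; }

    int main(void)
    {
        unsigned int arg = DEMO_PTR_TO_MEM | DEMO_PTR_MAYBE_NULL;

        printf("base=%u maybe_null=%d\n", demo_base(arg),
               (demo_flags(arg) & DEMO_PTR_MAYBE_NULL) != 0);
        return 0;
    }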
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com Conflicts: include/linux/bpf.h include/linux/bpf_verifier.h Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 43 ++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 13 +++++++++++ 2 files changed, 56 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88245386a4b6..3a5d1a88195c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, extern const struct bpf_map_ops bpf_map_offload_ops; +/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -296,7 +319,13 @@ enum bpf_arg_type { ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { @@ -311,7 +340,14 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + __BPF_RET_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -410,7 +446,14 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + __BPF_REG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. 
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6e330ff2f28d..b04dba78100a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -480,5 +480,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. */ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +} #endif /* _LINUX_BPF_VERIFIER_H */ -- Gitee From 8e552525db8666a1393e6f72d1b20e5d6aeb465d Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:17 +0800 Subject: [PATCH 096/113] bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL mainline inclusion from mainline-v5.17-rc1 commit 48946bd6a5d695c50b34546864b79c1f910a33c1 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=48946bd6a5d695c50b34546864b79c1f910a33c1 Signed-off-by: Yu Changchun -------------------------------- We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. When applying this flag to an arg_type, it means the arg can take NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include: 1. ARG_PTR_TO_MAP_VALUE_OR_NULL 2. ARG_PTR_TO_MEM_OR_NULL 3. ARG_PTR_TO_CTX_OR_NULL 4. ARG_PTR_TO_SOCKET_OR_NULL 5. ARG_PTR_TO_ALLOC_MEM_OR_NULL 6. ARG_PTR_TO_STACK_OR_NULL This patch does not eliminate the use of these arg_types, instead it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com Conflicts: include/linux/bpf.h kernel/bpf/verifier.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 12 +++++++----- kernel/bpf/verifier.c | 36 +++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3a5d1a88195c..35b6653df672 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,13 +288,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. 
@@ -304,22 +302,26 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48ebed09d466..587ad538f5f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -437,13 +437,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -4250,9 +4246,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -4379,26 +4374,21 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, 
[ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -4413,7 +4403,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; @@ -4494,15 +4484,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -4549,10 +4538,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ -- Gitee From f936969b02ef4df0d8a0fad6460f5d46180f2160 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:18 +0800 Subject: [PATCH 097/113] bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL mainline inclusion from mainline-v5.17-rc1 commit 3c4807322660d4290ac9062c034aed6b87243861 issue: #I4U9Y8 bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c4807322660d4290ac9062c034aed6b87243861 Signed-off-by: Yu Changchun -------------------------------- We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags. One of the flags is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified ret_types to use this flag. The ret_types changed in this patch include: 1. RET_PTR_TO_MAP_VALUE_OR_NULL 2. RET_PTR_TO_SOCKET_OR_NULL 3. RET_PTR_TO_TCP_SOCK_OR_NULL 4. RET_PTR_TO_SOCK_COMMON_OR_NULL 5. RET_PTR_TO_ALLOC_MEM_OR_NULL 6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL 7. RET_PTR_TO_BTF_ID_OR_NULL This patch doesn't eliminate the use of these names; instead, it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'.
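For reference, a minimal, illustrative-only userspace sketch (placeholder enum values, not kernel code) of the dispatch pattern the verifier hunks below move to: branch on base_type() and derive nullability from PTR_MAYBE_NULL instead of enumerating every *_OR_NULL name:

#include <stdbool.h>
#include <stdio.h>

#define BPF_BASE_TYPE_BITS	8
#define BPF_BASE_TYPE_MASK	((1U << BPF_BASE_TYPE_BITS) - 1)
#define PTR_MAYBE_NULL		(1U << BPF_BASE_TYPE_BITS)

enum { RET_PTR_TO_SOCKET = 4, RET_PTR_TO_TCP_SOCK = 5 };	/* placeholders */

static unsigned int base_type(unsigned int type)
{
	return type & BPF_BASE_TYPE_MASK;
}

static bool type_may_be_null(unsigned int type)
{
	return type & PTR_MAYBE_NULL;
}

static void classify(unsigned int ret_type)
{
	/* One case per base type; the flag carries the "or null" part. */
	switch (base_type(ret_type)) {
	case RET_PTR_TO_SOCKET:
		printf("socket, may_be_null=%d\n", type_may_be_null(ret_type));
		break;
	case RET_PTR_TO_TCP_SOCK:
		printf("tcp_sock, may_be_null=%d\n", type_may_be_null(ret_type));
		break;
	default:
		printf("other\n");
	}
}

int main(void)
{
	classify(RET_PTR_TO_SOCKET | PTR_MAYBE_NULL);	/* socket, may_be_null=1 */
	classify(RET_PTR_TO_TCP_SOCK);			/* tcp_sock, may_be_null=0 */
	return 0;
}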
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com Conflicts: kernel/bpf/verifier.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 20 ++++++++++++------- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 45 ++++++++++++++++++++++--------------------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35b6653df672..ee48292b3428 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,16 +334,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. 
*/ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4350de9f11..2a04e6eafca5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -636,7 +636,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587ad538f5f1..1279455d29c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5322,6 +5322,7 @@ static int check_reference_leak(struct bpf_verifier_env *env) static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5433,13 +5434,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -5452,28 +5453,27 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (type_may_be_null(ret_type)) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5492,30 +5492,31 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].type = - 
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_MEM_OR_NULL : PTR_TO_MEM; regs[BPF_REG_0].mem_size = tsize; } else { regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } -- Gitee From 78df1b492c84aeaa77477eacfce39f3e0d31a811 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:19 +0800 Subject: [PATCH 098/113] bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL mainline inclusion from mainline-v5.17-rc1 commit c25b2ae136039ffa820c26138ed4a5e5f3ab3841 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c25b2ae136039ffa820c26138ed4a5e5f3ab3841 Signed-off-by: Yu Changchun -------------------------------- We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include: 1. PTR_TO_MAP_VALUE_OR_NULL 2. PTR_TO_SOCKET_OR_NULL 3. PTR_TO_SOCK_COMMON_OR_NULL 4. PTR_TO_TCP_SOCK_OR_NULL 5. PTR_TO_BTF_ID_OR_NULL 6. PTR_TO_MEM_OR_NULL 7. PTR_TO_RDONLY_BUF_OR_NULL 8. 
PTR_TO_RDWR_BUF_OR_NULL Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com Conflicts: include/linux/bpf.h include/linux/bpf_verifier.h kernel/bpf/verifier.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 4 +- include/linux/bpf.h | 16 +- include/linux/bpf_verifier.h | 4 + kernel/bpf/btf.c | 7 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 273 +++++++++----------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 8 files changed, 144 insertions(+), 168 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 4268a7e0f344..33f9058ed32e 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -13,8 +13,8 @@ */ #define NFP_BPF_SCALAR_VALUE 1 #define NFP_BPF_MAP_VALUE 4 -#define NFP_BPF_STACK 6 -#define NFP_BPF_PACKET_DATA 8 +#define NFP_BPF_STACK 5 +#define NFP_BPF_PACKET_DATA 7 enum bpf_cap_tlv_type { NFP_BPF_CAP_TYPE_FUNC = 1, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ee48292b3428..90e827902540 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -417,18 +417,14 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -446,16 +442,20 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX, + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. 
*/ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b04dba78100a..efc26c4b5f7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -17,6 +17,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -434,6 +436,8 @@ struct bpf_verifier_env { u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index aaf2fbaa0cc7..538f7adc61c4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4525,10 +4525,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212..631f0e44b7a9 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1279455d29c3..8767b2bd8545 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,18 +404,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -424,12 +412,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -492,37 +477,50 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) 
func_id == BPF_FUNC_skc_to_tcp_request_sock; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", + str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -574,7 +572,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -582,9 +580,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -593,9 +590,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - 
t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -665,7 +661,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->stack[i].slot_type[0] == STACK_SPILL) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1564,7 +1560,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2190,9 +2186,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2201,21 +2196,14 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: return true; default: return false; @@ -3016,7 +3004,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(*reg_type) == PTR_TO_BTF_ID) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3082,7 +3070,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -3822,7 +3810,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -3830,8 +3818,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. 
*/ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(reg_type) == PTR_TO_BTF_ID) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -3882,7 +3869,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -3901,7 +3888,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_RDONLY_BUF) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_buffer_access(env, reg, regno, off, size, false, @@ -3917,7 +3904,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; } @@ -3960,7 +3947,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4156,9 +4143,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4418,10 +4405,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: @@ -5323,6 +5310,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn { const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5435,6 +5423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -5453,25 +5442,23 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (type_may_be_null(ret_type)) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; + 
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; @@ -5491,21 +5478,17 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_MEM_OR_NULL : PTR_TO_MEM; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", @@ -5520,7 +5503,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -5621,25 +5604,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -6016,11 +5999,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch 
(base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -6033,10 +6018,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: - if (reg_type_may_be_null(ptr_reg->type)) + if (type_may_be_null(ptr_reg->type)) goto reject; break; } @@ -7714,7 +7699,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer @@ -7728,7 +7713,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + } else if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr; if (map->inner_map_meta) { @@ -7742,21 +7727,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } else { reg->type = PTR_TO_MAP_VALUE; } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; + } else { + reg->type &= ~PTR_MAYBE_NULL; } + if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, @@ -8103,7 +8077,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. 
*/ @@ -8332,7 +8306,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -8393,7 +8367,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9141,7 +9115,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -9162,6 +9136,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; } case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -9173,20 +9163,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -9215,11 +9191,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -9729,17 +9702,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* Return true if it's OK to have the same insn return a different type. 
*/ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -9957,7 +9926,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..f01693d8d73e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -858,7 +858,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ddc899e83313..10a6409c4731 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1608,7 +1608,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, -- Gitee From 40c93d9a812805cf6ea9823147d48e84f90b15f5 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:20 +0800 Subject: [PATCH 099/113] bpf: Introduce MEM_RDONLY flag mainline inclusion from mainline-v5.17-rc1 commit 20b2aff4bc15bda809f994761d5719827d66c0b4 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=20b2aff4bc15bda809f994761d5719827d66c0b4 Signed-off-by: Yu Changchun -------------------------------- This patch introduce a flag MEM_RDONLY to tag a reg value pointing to read-only memory. It makes the following changes: 1. PTR_TO_RDWR_BUF -> PTR_TO_BUF 2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com Conflicts: kernel/bpf/verifier.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 8 ++-- kernel/bpf/btf.c | 3 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++---------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 6 files changed, 59 insertions(+), 43 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 90e827902540..a6b8de9c61e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, }; /* Max number of base types. */ @@ -443,8 +446,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. 
*/ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 538f7adc61c4..9d6c55138571 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4529,8 +4529,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); - if (ctx_arg_info->offset == off && - (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 631f0e44b7a9..b0fa190b0979 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8767b2bd8545..23fec54c5f23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -417,6 +417,11 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) base_type(type) == PTR_TO_MEM; } +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -485,7 +490,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}; + char postfix[16] = {0}, prefix[16] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -505,8 +510,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_BUF] = "buf", }; if (type & PTR_MAYBE_NULL) { @@ -517,8 +521,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", - str[base_type(type)], postfix); + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); return env->type_str_buf; } @@ -2200,8 +2207,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: return true; @@ -3885,22 +3891,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); - return -EACCES; + } else if 
(base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -4103,8 +4114,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4120,18 +4133,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4334,8 +4349,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_BUF, + PTR_TO_BUF | MEM_RDONLY, }, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f01693d8d73e..5b61e99b8d63 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -858,7 +858,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 10a6409c4731..d8e64cdfcb60 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1608,7 +1608,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, -- Gitee From 
1a8e939156318b8fdcce5be97f21f9ca0bbf2a78 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:21 +0800 Subject: [PATCH 100/113] bpf: Convert PTR_TO_MEM_OR_NULL to composable types. mainline inclusion from mainline-v5.17-rc1 commit cf9f2f8d62eca810afbd1ee6cc0800202b000e57 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cf9f2f8d62eca810afbd1ee6cc0800202b000e57 Signed-off-by: Yu Changchun -------------------------------- Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with flag PTR_MAYBE_NULL. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com Conflicts: kernel/bpf/btf.c kernel/bpf/verifier.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6b8de9c61e3..677421015a38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -456,7 +456,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- Gitee From d530e6d7b76bc8fba38014c9a6654d3938ed48aa Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:22 +0800 Subject: [PATCH 101/113] bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM. mainline inclusion from mainline-v5.17-rc1 commit 34d3a78c681e8e7844b43d1a2f4671a04249c821 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34d3a78c681e8e7844b43d1a2f4671a04249c821 Signed-off-by: Yu Changchun -------------------------------- Tag the return type of {per, this}_cpu_ptr with RDONLY_MEM. The returned value of this pair of helpers is a kernel object, which cannot be updated by bpf programs. Previously these two helpers returned PTR_TO_MEM for kernel objects of scalar type, which allowed one to directly modify the memory. Now with RDONLY_MEM tagging, the verifier will reject programs that write into RDONLY_MEM.
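As a usage illustration, here is a hypothetical, minimal BPF program sketch (the tracepoint section, the bpf_prog_active ksym declaration and the libbpf vmlinux.h/bpf_helpers.h setup are assumptions for the example, not part of this patch) showing what the read-only tagging means for program authors: the load through the returned pointer is still accepted, while the commented-out store would now be rejected by the verifier:

/* Hypothetical sketch only: shows the effect of the rdonly tagging. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const int bpf_prog_active __ksym;	/* per-CPU kernel variable (example) */

SEC("raw_tp/sys_enter")
int rdonly_demo(const void *ctx)
{
	int *p;

	/* Returned pointer is now typed as read-only memory (may be NULL). */
	p = bpf_per_cpu_ptr(&bpf_prog_active, 0);
	if (!p)
		return 0;
	bpf_printk("cpu0 prog_active=%d", *p);	/* read: still permitted */
	/* *p = 0; */				/* write: now rejected by the verifier */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";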
Fixes: 63d9b80dcf2c ("bpf: Introducte bpf_this_cpu_ptr()") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-8-haoluo@google.com Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a04e6eafca5..45e72c262caf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -636,7 +636,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -649,7 +649,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 23fec54c5f23..3409a4496323 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3779,15 +3779,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -5496,6 +5511,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). 
+ */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -8147,7 +8169,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -10149,7 +10171,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, tname, PTR_ERR(ret)); return -EINVAL; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; -- Gitee From 54f2072cd075a0716a5f4bf431229abac0fe8d81 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:23 +0800 Subject: [PATCH 102/113] bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem. mainline inclusion from mainline-v5.17-rc1 commit 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 Signed-off-by: Yu Changchun -------------------------------- Some helper functions may modify its arguments, for example, bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. Therefore it's legitimate, but technically incorrect, to modify a read-only memory by passing it into one of such helper functions. This patch tags the bpf_args compatible with immutable memory with MEM_RDONLY flag. The arguments that don't have this flag will be only compatible with mutable memory types, preventing the helper from modifying a read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable memory and immutable memory. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com Conflicts: kernel/bpf/btf.c kernel/bpf/helpers.c kernel/bpf/syscall.c kernel/trace/bpf_trace.c net/core/filter.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 4 ++- kernel/bpf/cgroup.c | 2 +- kernel/bpf/helpers.c | 6 ++-- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/verifier.c | 20 +++++++++++-- kernel/trace/bpf_trace.c | 22 +++++++------- net/core/filter.c | 62 ++++++++++++++++++++-------------------- 7 files changed, 67 insertions(+), 51 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 677421015a38..f5dbec18b465 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. 
+ */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), __BPF_TYPE_LAST_FLAG = MEM_RDONLY, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6aa9e10c6335..3cde9fa56172 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1637,7 +1637,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 45e72c262caf..4bb5921a7d21 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -499,7 +499,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -527,7 +527,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -599,7 +599,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f9913bc65ef8..e69d067f7e7f 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -463,7 +463,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3409a4496323..a6caeb767369 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4365,7 +4365,6 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_BUF, - PTR_TO_BUF | MEM_RDONLY, }, }; @@ -4426,6 +4425,21 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EFAULT; } + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. 
+ */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -4435,14 +4449,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ba644760f507..2399850a5d2e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -342,7 +342,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -545,7 +545,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; @@ -754,9 +754,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -771,7 +771,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -795,7 +795,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -956,7 +956,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1229,7 +1229,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1626,7 +1626,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1680,7 +1680,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/net/core/filter.c b/net/core/filter.c index 7ea752af7894..99d46bb7f419 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1712,7 +1712,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2020,9 +2020,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2560,7 +2560,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -5000,7 +5000,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5034,7 +5034,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5209,7 +5209,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5671,7 +5671,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5681,7 +5681,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5724,7 +5724,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5812,7 +5812,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6037,7 +6037,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6056,7 +6056,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6075,7 +6075,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6112,7 +6112,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6135,7 +6135,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6158,7 +6158,7 @@ static const struct bpf_func_proto 
bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6177,7 +6177,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6196,7 +6196,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6215,7 +6215,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6528,9 +6528,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6597,9 +6597,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6828,7 +6828,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; -- Gitee From bb8a006342a8de23044febc631ea03bcdc0e93c9 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:24 +0800 Subject: [PATCH 103/113] bpf/selftests: Test PTR_TO_RDONLY_MEM mainline inclusion from mainline-v5.17-rc1 commit 9497c458c10b049438ef6e6ddda898edbc3ec6a8 category: bugfix issue: #I4U9Y8 CVE: CVE-2022-0500 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9497c458c10b049438ef6e6ddda898edbc3ec6a8 Signed-off-by: Yu Changchun -------------------------------- This test verifies that a ksym of non-struct can not be directly updated. 
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211217003152.48334-10-haoluo@google.com Conflicts: tools/testing/selftests/bpf/prog_tests/ksyms_btf.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- .../selftests/bpf/prog_tests/ksyms_btf.c | 14 +++++++++ .../bpf/progs/test_ksyms_btf_write_check.c | 29 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index b58b775d19f3..97f38d4f6a26 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_btf_write_check.skel.h" static int duration; @@ -81,6 +82,16 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); } +static void test_write_check(void) +{ + struct test_ksyms_btf_write_check *skel; + + skel = test_ksyms_btf_write_check__open_and_load(); + CHECK(skel, "skel_open", "unexpected load of a prog writing to ksym memory\n"); + + test_ksyms_btf_write_check__destroy(skel); +} + void test_ksyms_btf(void) { int percpu_datasec; @@ -106,4 +117,7 @@ void test_ksyms_btf(void) if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("write_check")) + test_write_check(); } diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c new file mode 100644 index 000000000000..2180c41cd890 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ + +#include "vmlinux.h" + +#include + +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + int *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* Kernel memory obtained from bpf_{per,this}_cpu_ptr + * is read-only, should _not_ pass verification. + */ + /* WRITE_ONCE */ + *(volatile int *)active = -1; + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- Gitee From 740633bd40e7e308d4b5da1f38628e4129b305af Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:25 +0800 Subject: [PATCH 104/113] bpf: Generalize check_ctx_reg for reuse with other types mainline inclusion from mainline-v5.17-rc1 commit be80a1d3f9dbe5aee79a325964f7037fe2d92f30 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be80a1d3f9dbe5aee79a325964f7037fe2d92f30 Signed-off-by: Yu Changchun -------------------------------- Generalize the check_ctx_reg() helper function into a more generic named one so that it can be reused for other register types as well to check whether their offset is non-zero. No functional change. 
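For context, the class of program this check catches looks roughly like the following test_verifier-style entry (a sketch modeled on the existing "pass modified ctx pointer to helper" tests, not part of this patch; the helper and prog_type are merely convenient choices):

  {
      "sketch: modified ctx pointer passed to helper",
      .insns = {
      /* R1 holds the ctx pointer on entry; give it a non-zero offset */
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
      /* arg1 is ARG_PTR_TO_CTX, so the offset check rejects R1 here */
      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_vlan_push),
      BPF_MOV64_IMM(BPF_REG_0, 0),
      BPF_EXIT_INSN(),
      },
      .prog_type = BPF_PROG_TYPE_SCHED_CLS,
      .result = REJECT,
      .errstr = "dereference of modified ctx ptr",
  },

With the rename below, the same offset check (and the same style of message) becomes reusable for other fixed-offset pointer types instead of being hard-wired to ctx.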
Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Conflicts: include/linux/bpf_verifier.h kernel/bpf/btf.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 21 +++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index efc26c4b5f7b..2f6a60c76fbd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -469,8 +469,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9d6c55138571..ce69bb25ede8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5207,7 +5207,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ptr_off_reg(env, ®[i + 1], i + 1)) goto out; continue; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a6caeb767369..4f0732db88b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3359,16 +3359,16 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. 
*/ if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3376,7 +3376,8 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } @@ -3814,7 +3815,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -4533,7 +4534,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; } @@ -8301,7 +8302,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; -- Gitee From e4fec0bb015d1a8270e968c4c445ab4756922b18 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:26 +0800 Subject: [PATCH 105/113] bpf: Mark PTR_TO_FUNC register initially with zero offset mainline inclusion from mainline-v5.17-rc1 commit d400a6cf1c8a57cdf10f35220ead3284320d85ff category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d400a6cf1c8a57cdf10f35220ead3284320d85ff Signed-off-by: Yu Changchun -------------------------------- Similar as with other pointer types where we use ldimm64, clear the register content to zero first, and then populate the PTR_TO_FUNC type and subprogno number. Currently this is not done, and leads to reuse of stale register tracking data. Given for special ldimm64 cases we always clear the register offset, make it common for all cases, so it won't be forgotten in future. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f0732db88b5..86294e455851 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8180,9 +8180,13 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. 
+ */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: @@ -8200,7 +8204,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { -- Gitee From f5d8aee3c6b6ff99a13bc49ac7846382857c6796 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:27 +0800 Subject: [PATCH 106/113] bpf: Generally fix helper register offset check mainline inclusion from mainline-v5.17-rc1 commit 6788ab23508bddb0a9d88e104284922cb2c22b77 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6788ab23508bddb0a9d88e104284922cb2c22b77 Signed-off-by: Yu Changchun -------------------------------- Right now the assertion on check_ptr_off_reg() is only enforced for register types PTR_TO_CTX (and open coded also for PTR_TO_BTF_ID), however, this is insufficient since many other PTR_TO_* register types such as PTR_TO_FUNC do not handle/expect register offsets when passed to helper functions. Given this can slip-through easily when adding new types, make this an explicit allow-list and reject all other current and future types by default if this is encountered. Also, extend check_ptr_off_reg() to handle PTR_TO_BTF_ID as well instead of duplicating it. For PTR_TO_BTF_ID, reg->off is used for BTF to match expected BTF ids if struct offset is used. This part still needs to be allowed, but the dynamic off from the tnum must be rejected. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86294e455851..ac17e1a654af 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3359,14 +3359,15 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. 
*/ - if (reg->off) { + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); return -EACCES; @@ -3384,6 +3385,12 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4473,12 +4480,6 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, kernel_type_name(*arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -4533,10 +4534,25 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ptr_off_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + break; + /* All the rest must be rejected: */ + default: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: -- Gitee From a503c21b338af06509934b738688a422ddd2f04f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:28 +0800 Subject: [PATCH 107/113] bpf: Fix out of bounds access for ringbuf helpers mainline inclusion from mainline-v5.17-rc1 commit 64620e0a1e712a778095bd35cbb277dc2259281f category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64620e0a1e712a778095bd35cbb277dc2259281f Signed-off-by: Yu Changchun -------------------------------- Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument. They both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. Meaning, after a NULL check in the code, the verifier will promote the register type in the non-NULL branch to a PTR_TO_MEM and in the NULL branch to a known zero scalar. Generally, pointer arithmetic on PTR_TO_MEM is allowed, so the latter could have an offset. The ARG_PTR_TO_ALLOC_MEM expects a PTR_TO_MEM register type. However, the non- zero result from bpf_ringbuf_reserve() must be fed into either bpf_ringbuf_submit() or bpf_ringbuf_discard() but with the original offset given it will then read out the struct bpf_ringbuf_hdr mapping. The verifier missed to enforce a zero offset, so that out of bounds access can be triggered which could be used to escalate privileges if unprivileged BPF was enabled (disabled by default in kernel). 
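In restricted C the problem reads roughly as follows (a sketch of the pattern, not part of the fix; the map name, section and sizes are placeholders). The adjusted pointer keeps base type PTR_TO_MEM, so before this change the ARG_PTR_TO_ALLOC_MEM check on bpf_ringbuf_submit() was still satisfied even though the ring buffer header is then looked up relative to the bogus address:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  struct {
      __uint(type, BPF_MAP_TYPE_RINGBUF);
      __uint(max_entries, 4096);
  } rb SEC(".maps");

  SEC("xdp")
  int oob_submit(struct xdp_md *ctx)
  {
      char *p = bpf_ringbuf_reserve(&rb, 8, 0);

      if (!p)
          return XDP_PASS;
      p += 0xcafe;               /* non-zero offset survives the NULL check */
      bpf_ringbuf_submit(p, 0);  /* header lookup is now out of bounds      */
      return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";

The test_verifier cases added later in this series encode the same sequence in raw instructions and expect it to be rejected.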
Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: (SecCoder Security Lab) Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ac17e1a654af..e8319c3adc49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4545,9 +4545,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; break; /* All the rest must be rejected: */ default: +force_off_check: err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID); if (err < 0) -- Gitee From 3fa292088ae414492c6afb4cc4a42b6fb561a9a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:29 +0800 Subject: [PATCH 108/113] bpf: Fix ringbuf memory type confusion when passing to helpers mainline inclusion from mainline-v5.17-rc1 commit a672b2e36a648afb04ad3bda93b6bda947a479a5 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a672b2e36a648afb04ad3bda93b6bda947a479a5 Signed-off-by: Yu Changchun -------------------------------- The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM. This can lead to potential type confusion since it would allow other PTR_TO_MEM memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve(). Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that: - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their func proto description Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/verifier.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5dbec18b465..17502f263035 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,7 +273,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. 
+ */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, }; /* Max number of base types. */ @@ -352,7 +357,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8319c3adc49..a06645c2fcf2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -523,6 +523,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -4372,6 +4374,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -4388,7 +4391,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -4542,6 +4545,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: -- Gitee From 8cee16c085719195499accd94d744731c6c59db8 Mon Sep 17 00:00:00 2001 From: Gilad Reti Date: Tue, 8 Mar 2022 22:06:30 +0800 Subject: [PATCH 109/113] selftests/bpf: Add verifier test for PTR_TO_MEM spill mainline inclusion from mainline-v5.11-rc5 commit 4237e9f4a96228ccc8a7abe5e4b30834323cd353 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4237e9f4a96228ccc8a7abe5e4b30834323cd353 Signed-off-by: Yu Changchun -------------------------------- Add a test to check that the verifier is able to recognize spilling of PTR_TO_MEM registers, by reserving a ringbuf buffer, forcing the spill of a pointer holding the buffer address to the stack, filling it back in from the stack and writing to the memory area pointed by it. The patch was partially contributed by CyberArk Software, Inc. 
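Rendered as restricted C, the new test corresponds roughly to the sketch below (map name and section are placeholders; the raw-instruction form in the diff spills the pointer to the stack explicitly, which a compiler may or may not do on its own):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  struct {
      __uint(type, BPF_MAP_TYPE_RINGBUF);
      __uint(max_entries, 4096);
  } rb SEC(".maps");

  SEC("xdp")
  int spill_fill(struct xdp_md *ctx)
  {
      __u64 *p = bpf_ringbuf_reserve(&rb, 8, 0);

      if (!p)
          return XDP_PASS;
      /* In the raw test the pointer is stored to the stack (spill) and
       * loaded back into another register (fill) before this point; the
       * verifier must keep tracking it as the reserved PTR_TO_MEM so the
       * write and the submit below remain legal.
       */
      *p = 0;
      bpf_ringbuf_submit(p, 0);
      return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";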
Signed-off-by: Gilad Reti Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210113053810.13518-2-gilad.reti@gmail.com Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- tools/testing/selftests/bpf/test_verifier.c | 12 +++++++- .../selftests/bpf/verifier/spill_fill.c | 30 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index a4c55fcb0e7b..0fc813235575 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 20 +#define MAX_NR_MAPS 21 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -87,6 +87,7 @@ struct bpf_test { int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; int fixup_map_reuseport_array[MAX_FIXUPS]; + int fixup_map_ringbuf[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -640,6 +641,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; + int *fixup_map_ringbuf = test->fixup_map_ringbuf; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -817,6 +819,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_reuseport_array++; } while (*fixup_map_reuseport_array); } + if (*fixup_map_ringbuf) { + map_fds[20] = create_map(BPF_MAP_TYPE_RINGBUF, 0, + 0, 4096); + do { + prog[*fixup_map_ringbuf].imm = map_fds[20]; + fixup_map_ringbuf++; + } while (*fixup_map_ringbuf); + } } struct libcap { diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 45d43bf82f26..0b943897aaf6 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -28,6 +28,36 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check valid spill/fill, ptr to mem", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = ACCEPT, + .result_unpriv = ACCEPT, +}, { "check corrupted spill/fill", .insns = { -- Gitee From b6997cbce252571460339d6e61b09ab25f73f62e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: 
Tue, 8 Mar 2022 22:06:31 +0800 Subject: [PATCH 110/113] bpf, selftests: Add various ringbuf tests with invalid offset mainline inclusion from mainline-v5.17-rc1 commit 722e4db3ae0d52b2e3801280afbe19cf2d188e91 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=722e4db3ae0d52b2e3801280afbe19cf2d188e91 Signed-off-by: Yu Changchun -------------------------------- Assert that the verifier is rejecting invalid offsets on the ringbuf entries: # ./test_verifier | grep ring #947/u ringbuf: invalid reservation offset 1 OK #947/p ringbuf: invalid reservation offset 1 OK #948/u ringbuf: invalid reservation offset 2 OK #948/p ringbuf: invalid reservation offset 2 OK Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- .../testing/selftests/bpf/verifier/ringbuf.c | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/ringbuf.c diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c new file mode 100644 index 000000000000..68cae6947cc4 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -0,0 +1,64 @@ +{ + "ringbuf: invalid reservation offset 1", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "dereference of modified mem ptr R1", +}, +{ + "ringbuf: invalid reservation offset 2", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + 
BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R7 min value is outside of the allowed memory range", +}, -- Gitee From 7e43379d1e4e60d12cb754d8daad5b19b19b597e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 8 Mar 2022 22:06:32 +0800 Subject: [PATCH 111/113] bpf/selftests: Test bpf_d_path on rdonly_mem. mainline inclusion from mainline-v5.17-rc1 commit 44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e Signed-off-by: Yu Changchun -------------------------------- The second parameter of bpf_d_path() can only accept writable memories. Rdonly_mem obtained from bpf_per_cpu_ptr() can not be passed into bpf_d_path for modification. This patch adds a selftest to verify this behavior. Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220106205525.2116218-1-haoluo@google.com Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- .../testing/selftests/bpf/prog_tests/d_path.c | 22 ++++++++++++++- .../bpf/progs/test_d_path_check_rdonly_mem.c | 28 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index 0a577a248d34..8c8bc495e7f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -9,6 +9,7 @@ #define MAX_FILES 7 #include "test_d_path.skel.h" +#include "test_d_path_check_rdonly_mem.skel.h" static int duration; @@ -99,7 +100,7 @@ static int trigger_fstat_events(pid_t pid) return ret; } -void test_d_path(void) +static void test_d_path_basic(void) { struct test_d_path__bss *bss; struct test_d_path *skel; @@ -155,3 +156,22 @@ void test_d_path(void) cleanup: test_d_path__destroy(skel); } + +static void test_d_path_check_rdonly_mem(void) +{ + struct test_d_path_check_rdonly_mem *skel; + + skel = test_d_path_check_rdonly_mem__open_and_load(); + CHECK(skel, "skel_open", "unexpected_load_overwriting_rdonly_mem"); + + test_d_path_check_rdonly_mem__destroy(skel); +} + +void test_d_path(void) +{ + if (test__start_subtest("basic")) + test_d_path_basic(); + + if (test__start_subtest("check_rdonly_mem")) + test_d_path_check_rdonly_mem(); +} diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c new file mode 100644 index 000000000000..27c27cff6a3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "vmlinux.h" +#include +#include + +extern const int bpf_prog_active __ksym; + +SEC("fentry/security_inode_getattr") +int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat, + __u32 request_mask, unsigned int query_flags) +{ + void *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* FAIL here! 'active' points to readonly memory. bpf helpers + * that update its arguments can not write into it. 
+ */ + bpf_d_path(path, active, sizeof(int)); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; -- Gitee From 6eb30de345996af21ca075631992d4886c1f3dfc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2022 22:06:33 +0800 Subject: [PATCH 112/113] bpf, selftests: Add ringbuf memory type confusion test mainline inclusion from mainline-v5.17-rc1 commit 37c8d4807d1b8b521b30310dce97f6695dc2c2c6 category: bugfix issue: #I4U9Y8 CVE: CVE-2021-4204 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37c8d4807d1b8b521b30310dce97f6695dc2c2c6 Signed-off-by: Yu Changchun -------------------------------- Add two tests, one which asserts that ring buffer memory can be passed to other helpers for populating its entry area, and another one where verifier rejects different type of memory passed to bpf_ringbuf_submit(). Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- .../testing/selftests/bpf/prog_tests/d_path.c | 14 ++++++++ .../bpf/progs/test_d_path_check_types.c | 32 ++++++++++++++++++ .../testing/selftests/bpf/verifier/ringbuf.c | 33 ++++++++++++++++++- 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_types.c diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index 8c8bc495e7f4..85f1386a97e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -10,6 +10,7 @@ #include "test_d_path.skel.h" #include "test_d_path_check_rdonly_mem.skel.h" +#include "test_d_path_check_types.skel.h" static int duration; @@ -167,6 +168,16 @@ static void test_d_path_check_rdonly_mem(void) test_d_path_check_rdonly_mem__destroy(skel); } +static void test_d_path_check_types(void) +{ + struct test_d_path_check_types *skel; + + skel = test_d_path_check_types__open_and_load(); + CHECK(skel, "skel_open", "unexpected_load_passing_wrong_type"); + + test_d_path_check_types__destroy(skel); +} + void test_d_path(void) { if (test__start_subtest("basic")) @@ -174,4 +185,7 @@ void test_d_path(void) if (test__start_subtest("check_rdonly_mem")) test_d_path_check_rdonly_mem(); + + if (test__start_subtest("check_alloc_mem")) + test_d_path_check_types(); } diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_types.c b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c new file mode 100644 index 000000000000..7e02b7361307 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +extern const int bpf_prog_active __ksym; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +SEC("fentry/security_inode_getattr") +int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat, + __u32 request_mask, unsigned int query_flags) +{ + void *active; + u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* FAIL here! 'active' points to 'regular' memory. It + * cannot be submitted to ring buffer. 
From 6eb30de345996af21ca075631992d4886c1f3dfc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 8 Mar 2022 22:06:33 +0800
Subject: [PATCH 112/113] bpf, selftests: Add ringbuf memory type confusion
 test

mainline inclusion
from mainline-v5.17-rc1
commit 37c8d4807d1b8b521b30310dce97f6695dc2c2c6
category: bugfix
issue: #I4U9Y8
CVE: CVE-2021-4204

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37c8d4807d1b8b521b30310dce97f6695dc2c2c6

Signed-off-by: Yu Changchun

--------------------------------

Add two tests, one which asserts that ring buffer memory can be passed
to other helpers for populating its entry area, and another one where
verifier rejects different type of memory passed to
bpf_ringbuf_submit().

Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Alexei Starovoitov
Signed-off-by: Pu Lehui
Reviewed-by: Kuohai Xu
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 .../testing/selftests/bpf/prog_tests/d_path.c | 14 ++++++++
 .../bpf/progs/test_d_path_check_types.c       | 32 ++++++++++++++++++
 .../testing/selftests/bpf/verifier/ringbuf.c  | 33 ++++++++++++++++++-
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_types.c

diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
index 8c8bc495e7f4..85f1386a97e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/d_path.c
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -10,6 +10,7 @@
 
 #include "test_d_path.skel.h"
 #include "test_d_path_check_rdonly_mem.skel.h"
+#include "test_d_path_check_types.skel.h"
 
 static int duration;
 
@@ -167,6 +168,16 @@ static void test_d_path_check_rdonly_mem(void)
 	test_d_path_check_rdonly_mem__destroy(skel);
 }
 
+static void test_d_path_check_types(void)
+{
+	struct test_d_path_check_types *skel;
+
+	skel = test_d_path_check_types__open_and_load();
+	CHECK(skel, "skel_open", "unexpected_load_passing_wrong_type");
+
+	test_d_path_check_types__destroy(skel);
+}
+
 void test_d_path(void)
 {
 	if (test__start_subtest("basic"))
@@ -174,4 +185,7 @@ void test_d_path(void)
 
 	if (test__start_subtest("check_rdonly_mem"))
 		test_d_path_check_rdonly_mem();
+
+	if (test__start_subtest("check_alloc_mem"))
+		test_d_path_check_types();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_types.c b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
new file mode 100644
index 000000000000..7e02b7361307
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern const int bpf_prog_active __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 1 << 12);
+} ringbuf SEC(".maps");
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat,
+	     __u32 request_mask, unsigned int query_flags)
+{
+	void *active;
+	u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* FAIL here! 'active' points to 'regular' memory. It
+		 * cannot be submitted to ring buffer.
+		 */
+		bpf_ringbuf_submit(active, 0);
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c
index 68cae6947cc4..b64d33e4833c 100644
--- a/tools/testing/selftests/bpf/verifier/ringbuf.c
+++ b/tools/testing/selftests/bpf/verifier/ringbuf.c
@@ -28,7 +28,7 @@
 	},
 	.fixup_map_ringbuf = { 1 },
 	.result = REJECT,
-	.errstr = "dereference of modified mem ptr R1",
+	.errstr = "dereference of modified alloc_mem ptr R1",
 },
 {
 	"ringbuf: invalid reservation offset 2",
@@ -62,3 +62,34 @@
 	.result = REJECT,
 	.errstr = "R7 min value is outside of the allowed memory range",
 },
+{
+	"ringbuf: check passing rb mem to helpers",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	/* pass allocated ring buffer memory to fib lookup */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_3, 8),
+	BPF_MOV64_IMM(BPF_REG_4, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_fib_lookup),
+	/* submit the ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 2 },
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.result = ACCEPT,
+},
--
Gitee
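
The accept/reject pair above hinges on pointer provenance: only memory returned by bpf_ringbuf_reserve() may be handed to bpf_ringbuf_submit(). A minimal C sketch of the legal pattern follows; it is not part of the patch, and the map name, entry size, and tracepoint attach point are illustrative assumptions.

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: reserve ring buffer memory, populate it, then submit the
 * same pointer; submitting any other memory type is what the new
 * verifier test rejects.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 1 << 12);
} rb SEC(".maps");

SEC("tp/syscalls/sys_enter_getpid")
int ringbuf_reserve_submit_sketch(void *ctx)
{
	__u64 *slot;

	slot = bpf_ringbuf_reserve(&rb, sizeof(*slot), 0);
	if (!slot)
		return 0;
	*slot = bpf_get_current_pid_tgid();	/* fill the entry area */
	bpf_ringbuf_submit(slot, 0);		/* alloc_mem pointer: accepted */
	return 0;
}

char _license[] SEC("license") = "GPL";
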
From 1bd29b683bcc4ff5076e76d1748ee34537ce6759 Mon Sep 17 00:00:00 2001
From: Jordy Zomer
Date: Tue, 11 Jan 2022 17:44:51 +0100
Subject: [PATCH 113/113] nfc: st21nfca: Fix potential buffer overflows in
 EVT_TRANSACTION

mainline inclusion
from mainline-v5.17-rc1
commit 4fbcc1a4cb20fe26ad0225679c536c80f1648221
category: bugfix
issue: #I4U9Y8
CVE: CVE-2022-26490

Signed-off-by: Yu Changchun

---------------------------------------

It appears that there are some buffer overflows in EVT_TRANSACTION.
This happens because the length parameters that are passed to memcpy
come directly from skb->data and are not guarded in any way.

Signed-off-by: Jordy Zomer
Reviewed-by: Krzysztof Kozlowski
Signed-off-by: David S. Miller
Signed-off-by: Yu Changchun
---
 drivers/nfc/st21nfca/se.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index c8bdf078d111..0841e0e370a0 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -320,6 +320,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host,
 			return -ENOMEM;
 
 		transaction->aid_len = skb->data[1];
+
+		/* Checking if the length of the AID is valid */
+		if (transaction->aid_len > sizeof(transaction->aid))
+			return -EINVAL;
+
 		memcpy(transaction->aid, &skb->data[2],
 		       transaction->aid_len);
 
@@ -329,6 +334,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host,
 			return -EPROTO;
 
 		transaction->params_len = skb->data[transaction->aid_len + 3];
+
+		/* Total size is allocated (skb->len - 2) minus fixed array members */
+		if (transaction->params_len > ((skb->len - 2) - sizeof(struct nfc_evt_transaction)))
+			return -EINVAL;
+
 		memcpy(transaction->params, skb->data +
 		       transaction->aid_len + 4, transaction->params_len);
 
--
Gitee
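
The fix pattern above, validating an attacker-controlled length against both the destination capacity and the received payload before memcpy, can be exercised in isolation. The sketch below is illustrative only: the struct layout, field sizes, and function name are assumptions and do not mirror the exact kernel nfc_evt_transaction definition.

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for the transaction object; sizes are assumed. */
struct evt_transaction_demo {
	uint8_t aid_len;
	uint8_t aid[16];
};

/*
 * Copy the AID out of an untrusted event payload.
 * Returns 0 on success, -1 if the advertised length does not fit either
 * the destination array or the received buffer.
 */
static int copy_aid_checked(struct evt_transaction_demo *t,
			    const uint8_t *data, size_t data_len)
{
	if (data_len < 2)
		return -1;
	t->aid_len = data[1];
	if (t->aid_len > sizeof(t->aid) ||
	    (size_t)t->aid_len + 2 > data_len)
		return -1;
	memcpy(t->aid, &data[2], t->aid_len);
	return 0;
}

In the driver itself the same idea is expressed against sizeof(transaction->aid) and the skb length, as shown in the hunks above.
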