ANDROID: Incremental fs: make remount log buffer change atomic
The read log buffer can have multiple threads doing any of these
operations simultaneously:
- Polling for changes
- Reading log records
- Adding new log records
- Updating log buffer size, or enabling/disabling it completely
As we don't control userspace, and it turns out that these operations
all currently originate from different processes, the code needs to be
safe against parallel access to the read buffer and a concurrent
request to reallocate it.
This CL adds an r/w spinlock to protect the buffer and its size.
Each remount takes the write lock, while everything else takes
a read lock. Remount keeps its critical section short by
preallocating and precalculating all updates, while the other
operations don't care much about their critical section size -
as readers, they can all still run together.
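For illustration only, here is a minimal sketch of the locking pattern
described above, using hypothetical names (struct ring, ring_resize,
ring_append) rather than the actual incfs code: the resize path
allocates the new buffer before taking the write lock so the swap
itself stays short, and the append path nests a plain spinlock inside
the read lock so concurrent appenders serialize against each other
without ever blocking on an allocation.

/* Illustrative sketch only - hypothetical names, not the incfs code. */
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct rec {
	u64 timestamp_us;
};

struct ring {
	rwlock_t access_lock;	/* write-locked only while swapping buf/size */
	spinlock_t append_lock;	/* serializes appends; nests inside access_lock */
	struct rec *buf;
	int size;
	u32 next;
};

static void ring_init(struct ring *r)
{
	rwlock_init(&r->access_lock);
	spin_lock_init(&r->append_lock);
	r->buf = NULL;
	r->size = 0;
	r->next = 0;
}

/* Remount path: preallocate outside the lock, swap under write_lock. */
static int ring_resize(struct ring *r, int new_size)
{
	struct rec *new_buf = NULL;

	if (new_size > 0) {
		new_buf = kcalloc(new_size, sizeof(*new_buf), GFP_NOFS);
		if (!new_buf)
			return -ENOMEM;
	}

	write_lock(&r->access_lock);
	kfree(r->buf);
	r->buf = new_buf;
	r->size = new_size;
	r->next = 0;
	write_unlock(&r->access_lock);
	return 0;
}

/* Logging path: read_lock keeps buf/size stable, append_lock orders writers. */
static void ring_append(struct ring *r, const struct rec *rec)
{
	read_lock(&r->access_lock);
	if (r->size != 0) {
		spin_lock(&r->append_lock);
		r->buf[r->next] = *rec;
		if (++r->next == r->size)
			r->next = 0;
		spin_unlock(&r->append_lock);
	}
	read_unlock(&r->access_lock);
}

The same ordering rule as in the patch applies: the inner spinlock is
only ever taken while the read side of the rwlock is held, so a resize
waits for in-flight appends to finish before freeing the old buffer.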
Bug: 152633648
Test: manual remount + reading
Signed-off-by: Yurii Zubrytskyi <zyy@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
Change-Id: I7271b4cb89f1ae2cbee6e5b073758f344c4ba66a
Git-commit: 5e6feacb2a
Git-repo: https://android.googlesource.com/kernel/common/
Signed-off-by: Sayali Lokhande <sayalil@codeaurora.org>
committed by: Blagovest Kolenichev
parent: aae486b462
commit: 46c1f3f598
@@ -34,7 +34,8 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 	mutex_init(&mi->mi_pending_reads_mutex);
 	init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
 	init_waitqueue_head(&mi->mi_log.ml_notif_wq);
-	spin_lock_init(&mi->mi_log.rl_writer_lock);
+	rwlock_init(&mi->mi_log.rl_access_lock);
+	spin_lock_init(&mi->mi_log.rl_logging_lock);
 	INIT_LIST_HEAD(&mi->mi_reads_list_head);
 
 	error = incfs_realloc_mount_info(mi, options);
@@ -51,20 +52,38 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 int incfs_realloc_mount_info(struct mount_info *mi,
 			     struct mount_options *options)
 {
-	kfree(mi->mi_log.rl_ring_buf);
-	mi->mi_log.rl_ring_buf = NULL;
-	mi->mi_log.rl_size = 0;
+	void *new_buffer = NULL;
+	size_t new_buffer_size = 0;
 
-	mi->mi_options = *options;
-	if (options->read_log_pages != 0) {
-		size_t buf_size = PAGE_SIZE * options->read_log_pages;
+	if (options->read_log_pages != mi->mi_options.read_log_pages) {
+		struct read_log_state log_state;
+		/*
+		 * Even though having two buffers allocated at once isn't
+		 * usually good, allocating a multipage buffer under a spinlock
+		 * is even worse, so let's optimize for the shorter lock
+		 * duration. It's not end of the world if we fail to increase
+		 * the buffer size anyway.
+		 */
+		if (options->read_log_pages > 0) {
+			new_buffer_size = PAGE_SIZE * options->read_log_pages;
+			new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+			if (!new_buffer)
+				return -ENOMEM;
+		}
 
-		mi->mi_log.rl_size = buf_size / sizeof(*mi->mi_log.rl_ring_buf);
-		mi->mi_log.rl_ring_buf = kzalloc(buf_size, GFP_NOFS);
-		if (!mi->mi_log.rl_ring_buf)
-			return -ENOMEM;
+		write_lock(&mi->mi_log.rl_access_lock);
+		kfree(mi->mi_log.rl_ring_buf);
+		WRITE_ONCE(mi->mi_log.rl_ring_buf, new_buffer);
+		WRITE_ONCE(mi->mi_log.rl_size,
+			   new_buffer_size / sizeof(*mi->mi_log.rl_ring_buf));
+		log_state = READ_ONCE(mi->mi_log.rl_state);
+		log_state.generation_id++;
+		log_state.next_index = log_state.current_pass_no = 0;
+		WRITE_ONCE(mi->mi_log.rl_state, log_state);
+		write_unlock(&mi->mi_log.rl_access_lock);
 	}
 
+	mi->mi_options = *options;
 	return 0;
 }
 
@@ -233,6 +252,7 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state state;
 	s64 now_us = ktime_to_us(ktime_get());
+	int rl_size;
 	struct read_log_record record = {
 		.file_id = *id,
 		.block_index = block_index,
@@ -240,20 +260,23 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 		.timestamp_us = now_us
 	};
 
-	if (log->rl_size == 0)
-		return;
-
-	spin_lock(&log->rl_writer_lock);
-	state = READ_ONCE(log->rl_state);
-	log->rl_ring_buf[state.next_index] = record;
-	if (++state.next_index == log->rl_size) {
-		state.next_index = 0;
-		++state.current_pass_no;
+	read_lock(&log->rl_access_lock);
+	rl_size = READ_ONCE(log->rl_size);
+	if (rl_size != 0) {
+		spin_lock(&log->rl_logging_lock);
+		state = READ_ONCE(log->rl_state);
+		log->rl_ring_buf[state.next_index] = record;
+		if (++state.next_index == rl_size) {
+			state.next_index = 0;
+			++state.current_pass_no;
+		}
+		WRITE_ONCE(log->rl_state, state);
+		spin_unlock(&log->rl_logging_lock);
 	}
-	WRITE_ONCE(log->rl_state, state);
-	spin_unlock(&log->rl_writer_lock);
+	read_unlock(&log->rl_access_lock);
 
-	wake_up_all(&log->ml_notif_wq);
+	if (rl_size != 0)
+		wake_up_all(&log->ml_notif_wq);
 }
 
 static int validate_hash_tree(struct file *bf, struct data_file *df,
@@ -1171,9 +1194,11 @@ struct read_log_state incfs_get_log_state(struct mount_info *mi)
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state result;
 
-	spin_lock(&log->rl_writer_lock);
+	read_lock(&log->rl_access_lock);
+	spin_lock(&log->rl_logging_lock);
 	result = READ_ONCE(log->rl_state);
-	spin_unlock(&log->rl_writer_lock);
+	spin_unlock(&log->rl_logging_lock);
+	read_unlock(&log->rl_access_lock);
 	return result;
 }
 
@@ -1186,10 +1211,21 @@ int incfs_get_uncollected_logs_count(struct mount_info *mi,
 				     struct read_log_state state)
 {
 	struct read_log *log = &mi->mi_log;
+	struct read_log_state rl_state;
+	int rl_size;
+	u64 count;
 
-	u64 count = calc_record_count(&log->rl_state, log->rl_size) -
-		    calc_record_count(&state, log->rl_size);
-	return min_t(int, count, log->rl_size);
+	read_lock(&log->rl_access_lock);
+	rl_size = READ_ONCE(log->rl_size);
+	spin_lock(&log->rl_logging_lock);
+	rl_state = READ_ONCE(log->rl_state);
+	spin_unlock(&log->rl_logging_lock);
+	read_unlock(&log->rl_access_lock);
+
+	count = calc_record_count(&rl_state, rl_size);
+	if (rl_state.generation_id == state.generation_id)
+		count -= calc_record_count(&state, rl_size);
+	return min_t(int, count, rl_size);
 }
 
 static void fill_pending_read_from_log_record(
@@ -1209,17 +1245,35 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 			       int reads_size)
 {
 	struct read_log *log = &mi->mi_log;
-	struct read_log_state live_state = incfs_get_log_state(mi);
-	u64 read_count = calc_record_count(reader_state, log->rl_size);
-	u64 written_count = calc_record_count(&live_state, log->rl_size);
+	struct read_log_state live_state;
 	int dst_idx;
+	int rl_size;
+	int result = 0;
+	u64 read_count;
+	u64 written_count;
 
-	if (reader_state->next_index >= log->rl_size ||
-	    read_count > written_count)
-		return -ERANGE;
+	read_lock(&log->rl_access_lock);
 
-	if (read_count == written_count)
-		return 0;
+	rl_size = READ_ONCE(log->rl_size);
+	spin_lock(&log->rl_logging_lock);
+	live_state = READ_ONCE(log->rl_state);
+	spin_unlock(&log->rl_logging_lock);
+
+	if (reader_state->generation_id != live_state.generation_id) {
+		reader_state->generation_id = live_state.generation_id;
+		reader_state->current_pass_no = reader_state->next_index = 0;
+	}
+
+	read_count = calc_record_count(reader_state, rl_size);
+	written_count = calc_record_count(&live_state, rl_size);
+	if (read_count == written_count) {
+		result = 0;
+		goto out;
+	}
+	if (reader_state->next_index >= rl_size) {
+		result = -ERANGE;
+		goto out;
+	}
 
 	if (read_count > written_count) {
 		/* This reader is somehow ahead of the writer. */
@@ -1227,16 +1281,17 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		*reader_state = live_state;
 	}
 
-	if (written_count - read_count > log->rl_size) {
+	if (written_count - read_count > rl_size) {
 		/*
 		 * Reading pointer is too far behind,
 		 * start from the record following the write pointer.
 		 */
-		pr_debug("incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
+		pr_debug(
+			"incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
 			(u32)reader_state->next_index,
 			(u32)reader_state->current_pass_no,
 			(u32)live_state.next_index,
-			(u32)live_state.current_pass_no - 1, (u32)log->rl_size);
+			(u32)live_state.current_pass_no - 1, (u32)rl_size);
 
 		*reader_state = (struct read_log_state){
 			.next_index = live_state.next_index,
@@ -1252,15 +1307,19 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		fill_pending_read_from_log_record(
 			&reads[dst_idx],
 			&log->rl_ring_buf[reader_state->next_index],
-			reader_state, log->rl_size);
+			reader_state, rl_size);
 
 		reader_state->next_index++;
-		if (reader_state->next_index == log->rl_size) {
+		if (reader_state->next_index == rl_size) {
 			reader_state->next_index = 0;
 			reader_state->current_pass_no++;
 		}
 	}
-	return dst_idx;
+	result = dst_idx;
+
+out:
+	read_unlock(&log->rl_access_lock);
+	return result;
 }
 
 bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)
@@ -31,10 +31,13 @@ struct read_log_record {
 } __packed;
 
 struct read_log_state {
-	/* Next slot in rl_ring_buf to write to. */
-	u32 next_index;
+	/* Log buffer generation id, incremented on configuration changes */
+	u32 generation_id : 8;
 
-	/* Current number of writer pass over rl_ring_buf */
+	/* Next slot in rl_ring_buf to write into. */
+	u32 next_index : 24;
+
+	/* Current number of writer passes over rl_ring_buf */
 	u32 current_pass_no;
 };
 
@@ -42,11 +45,21 @@ struct read_log_state {
 struct read_log {
 	struct read_log_record *rl_ring_buf;
 
-	int rl_size;
-
 	struct read_log_state rl_state;
 
-	spinlock_t rl_writer_lock;
+	/*
+	 * A lock for _all_ accesses to the struct, to protect against remounts.
+	 * Taken for writing when resizing the buffer.
+	 */
+	rwlock_t rl_access_lock;
+
+	int rl_size;
+
+	/*
+	 * A lock to protect the actual logging - adding a new record.
+	 * Note: ALWAYS taken after and under the |rl_access_lock|.
+	 */
+	spinlock_t rl_logging_lock;
 
 	/*
 	 * A queue of waiters who want to be notified about reads.
@@ -581,22 +581,27 @@ static ssize_t log_read(struct file *f, char __user *buf, size_t len,
 {
 	struct log_file_state *log_state = f->private_data;
 	struct mount_info *mi = get_mount_info(file_superblock(f));
-	struct incfs_pending_read_info *reads_buf =
-		(struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
-	size_t reads_to_collect = len / sizeof(*reads_buf);
-	size_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
 	int total_reads_collected = 0;
+	int rl_size;
 	ssize_t result = 0;
+	struct incfs_pending_read_info *reads_buf;
+	ssize_t reads_to_collect = len / sizeof(*reads_buf);
+	ssize_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
 
+	rl_size = READ_ONCE(mi->mi_log.rl_size);
+	if (rl_size == 0)
+		return 0;
+
+	reads_buf = (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
 	if (!reads_buf)
 		return -ENOMEM;
 
-	reads_to_collect = min_t(size_t, mi->mi_log.rl_size, reads_to_collect);
+	reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
 	while (reads_to_collect > 0) {
 		struct read_log_state next_state = READ_ONCE(log_state->state);
 		int reads_collected = incfs_collect_logged_reads(
 			mi, &next_state, reads_buf,
-			min_t(size_t, reads_to_collect, reads_per_page));
+			min_t(ssize_t, reads_to_collect, reads_per_page));
 		if (reads_collected <= 0) {
 			result = total_reads_collected ?
 				 total_reads_collected *