ANDROID: Incremental fs: make remount log buffer change atomic

The read log buffer can have multiple threads performing any of these
operations simultaneously:
- Polling for changes
- Reading log records
- Adding new log records
- Updating log buffer size, or enabling/disabling it completely

As we don't control userspace, and it turns out that these operations
all currently originate from different processes, the code needs to be
safe against reading the log buffer in parallel with a request to
reallocate it.

This CL adds an r/w spinlock to protect the buffer and its size.
Each remount takes the write lock, while everything else takes a
read lock. Remount keeps its write-side critical section short by
preallocating and precalculating all updates, while the other
operations don't care much about their critical section size -
they can all still run together under the read lock.
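
For illustration only, a minimal standalone userspace sketch of that
locking pattern follows, using pthreads instead of the kernel's
rwlock_t/spinlock_t. The ring_log type and helper names below are
hypothetical and are not part of incfs.

/*
 * Sketch: resize swaps a preallocated buffer under the write lock;
 * loggers take the read lock so they can run concurrently with each
 * other, but never with a resize.
 */
#include <pthread.h>
#include <stdlib.h>

struct ring_log {
        pthread_rwlock_t access_lock;   /* write-locked only when resizing */
        pthread_mutex_t logging_lock;   /* serializes appends, taken under the read lock */
        long *buf;
        size_t size;
        size_t next_index;
};

static void ring_log_init(struct ring_log *log)
{
        pthread_rwlock_init(&log->access_lock, NULL);
        pthread_mutex_init(&log->logging_lock, NULL);
        log->buf = NULL;
        log->size = 0;
        log->next_index = 0;
}

/* "Remount" path: allocate outside the lock, swap quickly under the write lock. */
static int ring_log_resize(struct ring_log *log, size_t new_size)
{
        long *new_buf = NULL;

        if (new_size > 0) {
                new_buf = calloc(new_size, sizeof(*new_buf));
                if (!new_buf)
                        return -1;
        }

        pthread_rwlock_wrlock(&log->access_lock);
        free(log->buf);
        log->buf = new_buf;
        log->size = new_size;
        log->next_index = 0;
        pthread_rwlock_unlock(&log->access_lock);
        return 0;
}

/* Logging path: the read lock keeps the buffer stable while records are added. */
static void ring_log_append(struct ring_log *log, long record)
{
        pthread_rwlock_rdlock(&log->access_lock);
        if (log->size != 0) {
                pthread_mutex_lock(&log->logging_lock);
                log->buf[log->next_index] = record;
                log->next_index = (log->next_index + 1) % log->size;
                pthread_mutex_unlock(&log->logging_lock);
        }
        pthread_rwlock_unlock(&log->access_lock);
}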

Bug: 152633648
Test: manual remount + reading
Signed-off-by: Yurii Zubrytskyi <zyy@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
Change-Id: I7271b4cb89f1ae2cbee6e5b073758f344c4ba66a
Git-commit: 5e6feacb2a
Git-repo: https://android.googlesource.com/kernel/common/
Signed-off-by: Sayali Lokhande <sayalil@codeaurora.org>
Author: Yurii Zubrytskyi
Date: 2020-04-06 12:49:41 -07:00
Committer: Blagovest Kolenichev
Parent: aae486b462
Commit: 46c1f3f598
3 changed files with 130 additions and 53 deletions


@@ -34,7 +34,8 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
         mutex_init(&mi->mi_pending_reads_mutex);
         init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
         init_waitqueue_head(&mi->mi_log.ml_notif_wq);
-        spin_lock_init(&mi->mi_log.rl_writer_lock);
+        rwlock_init(&mi->mi_log.rl_access_lock);
+        spin_lock_init(&mi->mi_log.rl_logging_lock);
         INIT_LIST_HEAD(&mi->mi_reads_list_head);
 
         error = incfs_realloc_mount_info(mi, options);
@@ -51,20 +52,38 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 int incfs_realloc_mount_info(struct mount_info *mi,
                              struct mount_options *options)
 {
-        kfree(mi->mi_log.rl_ring_buf);
-        mi->mi_log.rl_ring_buf = NULL;
-        mi->mi_log.rl_size = 0;
-        mi->mi_options = *options;
-        if (options->read_log_pages != 0) {
-                size_t buf_size = PAGE_SIZE * options->read_log_pages;
-
-                mi->mi_log.rl_size = buf_size / sizeof(*mi->mi_log.rl_ring_buf);
-                mi->mi_log.rl_ring_buf = kzalloc(buf_size, GFP_NOFS);
-                if (!mi->mi_log.rl_ring_buf)
-                        return -ENOMEM;
+        void *new_buffer = NULL;
+        size_t new_buffer_size = 0;
+
+        if (options->read_log_pages != mi->mi_options.read_log_pages) {
+                struct read_log_state log_state;
+                /*
+                 * Even though having two buffers allocated at once isn't
+                 * usually good, allocating a multipage buffer under a spinlock
+                 * is even worse, so let's optimize for the shorter lock
+                 * duration. It's not end of the world if we fail to increase
+                 * the buffer size anyway.
+                 */
+                if (options->read_log_pages > 0) {
+                        new_buffer_size = PAGE_SIZE * options->read_log_pages;
+                        new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+                        if (!new_buffer)
+                                return -ENOMEM;
+                }
+
+                write_lock(&mi->mi_log.rl_access_lock);
+                kfree(mi->mi_log.rl_ring_buf);
+                WRITE_ONCE(mi->mi_log.rl_ring_buf, new_buffer);
+                WRITE_ONCE(mi->mi_log.rl_size,
+                           new_buffer_size / sizeof(*mi->mi_log.rl_ring_buf));
+                log_state = READ_ONCE(mi->mi_log.rl_state);
+                log_state.generation_id++;
+                log_state.next_index = log_state.current_pass_no = 0;
+                WRITE_ONCE(mi->mi_log.rl_state, log_state);
+                write_unlock(&mi->mi_log.rl_access_lock);
         }
 
+        mi->mi_options = *options;
         return 0;
 }
@@ -233,6 +252,7 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
         struct read_log *log = &mi->mi_log;
         struct read_log_state state;
         s64 now_us = ktime_to_us(ktime_get());
+        int rl_size;
         struct read_log_record record = {
                 .file_id = *id,
                 .block_index = block_index,
@@ -240,20 +260,23 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
                 .timestamp_us = now_us
         };
 
-        if (log->rl_size == 0)
-                return;
-
-        spin_lock(&log->rl_writer_lock);
-        state = READ_ONCE(log->rl_state);
-        log->rl_ring_buf[state.next_index] = record;
-        if (++state.next_index == log->rl_size) {
-                state.next_index = 0;
-                ++state.current_pass_no;
+        read_lock(&log->rl_access_lock);
+        rl_size = READ_ONCE(log->rl_size);
+        if (rl_size != 0) {
+                spin_lock(&log->rl_logging_lock);
+                state = READ_ONCE(log->rl_state);
+                log->rl_ring_buf[state.next_index] = record;
+                if (++state.next_index == rl_size) {
+                        state.next_index = 0;
+                        ++state.current_pass_no;
+                }
+                WRITE_ONCE(log->rl_state, state);
+                spin_unlock(&log->rl_logging_lock);
         }
-        WRITE_ONCE(log->rl_state, state);
-        spin_unlock(&log->rl_writer_lock);
-        wake_up_all(&log->ml_notif_wq);
+        read_unlock(&log->rl_access_lock);
+
+        if (rl_size != 0)
+                wake_up_all(&log->ml_notif_wq);
 }
 
 static int validate_hash_tree(struct file *bf, struct data_file *df,
@@ -1171,9 +1194,11 @@ struct read_log_state incfs_get_log_state(struct mount_info *mi)
         struct read_log *log = &mi->mi_log;
         struct read_log_state result;
 
-        spin_lock(&log->rl_writer_lock);
+        read_lock(&log->rl_access_lock);
+        spin_lock(&log->rl_logging_lock);
         result = READ_ONCE(log->rl_state);
-        spin_unlock(&log->rl_writer_lock);
+        spin_unlock(&log->rl_logging_lock);
+        read_unlock(&log->rl_access_lock);
         return result;
 }
@@ -1186,10 +1211,21 @@ int incfs_get_uncollected_logs_count(struct mount_info *mi,
                                      struct read_log_state state)
 {
         struct read_log *log = &mi->mi_log;
-        u64 count = calc_record_count(&log->rl_state, log->rl_size) -
-                calc_record_count(&state, log->rl_size);
-        return min_t(int, count, log->rl_size);
+        struct read_log_state rl_state;
+        int rl_size;
+        u64 count;
+
+        read_lock(&log->rl_access_lock);
+        rl_size = READ_ONCE(log->rl_size);
+        spin_lock(&log->rl_logging_lock);
+        rl_state = READ_ONCE(log->rl_state);
+        spin_unlock(&log->rl_logging_lock);
+        read_unlock(&log->rl_access_lock);
+
+        count = calc_record_count(&rl_state, rl_size);
+        if (rl_state.generation_id == state.generation_id)
+                count -= calc_record_count(&state, rl_size);
+        return min_t(int, count, rl_size);
 }
 
 static void fill_pending_read_from_log_record(
@@ -1209,17 +1245,35 @@ int incfs_collect_logged_reads(struct mount_info *mi,
                                int reads_size)
 {
         struct read_log *log = &mi->mi_log;
-        struct read_log_state live_state = incfs_get_log_state(mi);
-        u64 read_count = calc_record_count(reader_state, log->rl_size);
-        u64 written_count = calc_record_count(&live_state, log->rl_size);
+        struct read_log_state live_state;
         int dst_idx;
+        int rl_size;
+        int result = 0;
+        u64 read_count;
+        u64 written_count;
 
-        if (reader_state->next_index >= log->rl_size ||
-            read_count > written_count)
-                return -ERANGE;
+        read_lock(&log->rl_access_lock);
 
-        if (read_count == written_count)
-                return 0;
+        rl_size = READ_ONCE(log->rl_size);
+        spin_lock(&log->rl_logging_lock);
+        live_state = READ_ONCE(log->rl_state);
+        spin_unlock(&log->rl_logging_lock);
+
+        if (reader_state->generation_id != live_state.generation_id) {
+                reader_state->generation_id = live_state.generation_id;
+                reader_state->current_pass_no = reader_state->next_index = 0;
+        }
+
+        read_count = calc_record_count(reader_state, rl_size);
+        written_count = calc_record_count(&live_state, rl_size);
+
+        if (read_count == written_count) {
+                result = 0;
+                goto out;
+        }
+
+        if (reader_state->next_index >= rl_size) {
+                result = -ERANGE;
+                goto out;
+        }
 
         if (read_count > written_count) {
                 /* This reader is somehow ahead of the writer. */
@@ -1227,16 +1281,17 @@ int incfs_collect_logged_reads(struct mount_info *mi,
                 *reader_state = live_state;
         }
 
-        if (written_count - read_count > log->rl_size) {
+        if (written_count - read_count > rl_size) {
                 /*
                  * Reading pointer is too far behind,
                  * start from the record following the write pointer.
                  */
-                pr_debug("incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
+                pr_debug(
+                        "incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
                         (u32)reader_state->next_index,
                         (u32)reader_state->current_pass_no,
                         (u32)live_state.next_index,
-                        (u32)live_state.current_pass_no - 1, (u32)log->rl_size);
+                        (u32)live_state.current_pass_no - 1, (u32)rl_size);
 
                 *reader_state = (struct read_log_state){
                         .next_index = live_state.next_index,
@@ -1252,15 +1307,19 @@ int incfs_collect_logged_reads(struct mount_info *mi,
                 fill_pending_read_from_log_record(
                         &reads[dst_idx],
                         &log->rl_ring_buf[reader_state->next_index],
-                        reader_state, log->rl_size);
+                        reader_state, rl_size);
 
                 reader_state->next_index++;
-                if (reader_state->next_index == log->rl_size) {
+                if (reader_state->next_index == rl_size) {
                         reader_state->next_index = 0;
                         reader_state->current_pass_no++;
                 }
         }
-        return dst_idx;
+
+        result = dst_idx;
+out:
+        read_unlock(&log->rl_access_lock);
+        return result;
 }
 
 bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)


@@ -31,10 +31,13 @@ struct read_log_record {
 } __packed;
 
 struct read_log_state {
-        /* Next slot in rl_ring_buf to write to. */
-        u32 next_index;
+        /* Log buffer generation id, incremented on configuration changes */
+        u32 generation_id : 8;
 
-        /* Current number of writer pass over rl_ring_buf */
+        /* Next slot in rl_ring_buf to write into. */
+        u32 next_index : 24;
+
+        /* Current number of writer passes over rl_ring_buf */
         u32 current_pass_no;
 };
@@ -42,11 +45,21 @@ struct read_log_state {
 
 struct read_log {
         struct read_log_record *rl_ring_buf;
-        int rl_size;
         struct read_log_state rl_state;
-        spinlock_t rl_writer_lock;
+        /*
+         * A lock for _all_ accesses to the struct, to protect against remounts.
+         * Taken for writing when resizing the buffer.
+         */
+        rwlock_t rl_access_lock;
+        int rl_size;
+        /*
+         * A lock to protect the actual logging - adding a new record.
+         * Note: ALWAYS taken after and under the |rl_access_lock|.
+         */
+        spinlock_t rl_logging_lock;
         /*
          * A queue of waiters who want to be notified about reads.


@@ -581,22 +581,27 @@ static ssize_t log_read(struct file *f, char __user *buf, size_t len,
 {
         struct log_file_state *log_state = f->private_data;
         struct mount_info *mi = get_mount_info(file_superblock(f));
-        struct incfs_pending_read_info *reads_buf =
-                (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
-        size_t reads_to_collect = len / sizeof(*reads_buf);
-        size_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
         int total_reads_collected = 0;
+        int rl_size;
         ssize_t result = 0;
+        struct incfs_pending_read_info *reads_buf;
+        ssize_t reads_to_collect = len / sizeof(*reads_buf);
+        ssize_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
+
+        rl_size = READ_ONCE(mi->mi_log.rl_size);
+        if (rl_size == 0)
+                return 0;
 
+        reads_buf = (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
         if (!reads_buf)
                 return -ENOMEM;
 
-        reads_to_collect = min_t(size_t, mi->mi_log.rl_size, reads_to_collect);
+        reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
 
         while (reads_to_collect > 0) {
                 struct read_log_state next_state = READ_ONCE(log_state->state);
                 int reads_collected = incfs_collect_logged_reads(
                         mi, &next_state, reads_buf,
-                        min_t(size_t, reads_to_collect, reads_per_page));
+                        min_t(ssize_t, reads_to_collect, reads_per_page));
                 if (reads_collected <= 0) {
                         result = total_reads_collected ?
                                 total_reads_collected *