ANDROID: Incremental fs: make remount log buffer change atomic
The read log buffer can have multiple threads performing any of
these operations simultaneously:
- Polling for changes
- Reading log records
- Adding new log records
- Updating log buffer size, or enabling/disabling it completely
As we don't control userspace, and it turns out that these
operations all currently originate from different processes, the
code needs to be safe against parallel access to the read buffer
and a concurrent request to reallocate it.
This CL adds a reader/writer spinlock to protect the buffer and
its size. Each remount takes the write lock, while everything
else takes a read lock. Remount keeps its critical section short
by preallocating the new buffer and precalculating all updates,
while the other operations don't care much about their critical
section size - they can all still run together.
Bug: 152633648
Test: manual remount + reading
Signed-off-by: Yurii Zubrytskyi <zyy@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
Change-Id: I7271b4cb89f1ae2cbee6e5b073758f344c4ba66a
Git-commit: 5e6feacb2a
Git-repo: https://android.googlesource.com/kernel/common/
Signed-off-by: Sayali Lokhande <sayalil@codeaurora.org>
committed by Blagovest Kolenichev
parent aae486b462
commit 46c1f3f598
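For orientation, below is a minimal userspace sketch of the lock nesting
this patch introduces. It is an illustration only, not kernel code: the
field and function names mirror the patch, but the pthread-based harness
around them is an assumption made for the example.

/*
 * Userspace analogue of the locking scheme: rl_access_lock is a
 * reader/writer lock taken for writing only when the buffer is
 * reallocated on remount; every other path takes it for reading, and
 * the short rl_logging_lock serializes the actual record insertion.
 */
#include <pthread.h>
#include <stdlib.h>

struct read_log {
        int *rl_ring_buf;                   /* stands in for the record array */
        int rl_size;
        int rl_next_index;
        pthread_rwlock_t rl_access_lock;    /* resize vs. everything else */
        pthread_spinlock_t rl_logging_lock; /* record insertion only */
};

/* Remount path: allocate outside the lock, swap under the write lock. */
static int log_resize(struct read_log *log, int new_size)
{
        int *new_buf = NULL;

        if (new_size > 0) {
                new_buf = calloc(new_size, sizeof(*new_buf));
                if (!new_buf)
                        return -1;
        }

        pthread_rwlock_wrlock(&log->rl_access_lock);
        free(log->rl_ring_buf);
        log->rl_ring_buf = new_buf;
        log->rl_size = new_size;
        log->rl_next_index = 0;
        pthread_rwlock_unlock(&log->rl_access_lock);
        return 0;
}

/* Logging path: read lock keeps the buffer alive, spinlock orders writers. */
static void log_record(struct read_log *log, int value)
{
        pthread_rwlock_rdlock(&log->rl_access_lock);
        if (log->rl_size != 0) {
                pthread_spin_lock(&log->rl_logging_lock);
                log->rl_ring_buf[log->rl_next_index] = value;
                log->rl_next_index = (log->rl_next_index + 1) % log->rl_size;
                pthread_spin_unlock(&log->rl_logging_lock);
        }
        pthread_rwlock_unlock(&log->rl_access_lock);
}
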
fs/incfs/data_mgmt.c
@@ -34,7 +34,8 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 	mutex_init(&mi->mi_pending_reads_mutex);
 	init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
 	init_waitqueue_head(&mi->mi_log.ml_notif_wq);
-	spin_lock_init(&mi->mi_log.rl_writer_lock);
+	rwlock_init(&mi->mi_log.rl_access_lock);
+	spin_lock_init(&mi->mi_log.rl_logging_lock);
 	INIT_LIST_HEAD(&mi->mi_reads_list_head);
 
 	error = incfs_realloc_mount_info(mi, options);
@@ -51,20 +52,38 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 int incfs_realloc_mount_info(struct mount_info *mi,
                              struct mount_options *options)
 {
-        kfree(mi->mi_log.rl_ring_buf);
-        mi->mi_log.rl_ring_buf = NULL;
-        mi->mi_log.rl_size = 0;
+        void *new_buffer = NULL;
+        size_t new_buffer_size = 0;
 
-        mi->mi_options = *options;
-        if (options->read_log_pages != 0) {
-                size_t buf_size = PAGE_SIZE * options->read_log_pages;
+        if (options->read_log_pages != mi->mi_options.read_log_pages) {
+                struct read_log_state log_state;
+                /*
+                 * Even though having two buffers allocated at once isn't
+                 * usually good, allocating a multipage buffer under a spinlock
+                 * is even worse, so let's optimize for the shorter lock
+                 * duration. It's not end of the world if we fail to increase
+                 * the buffer size anyway.
+                 */
+                if (options->read_log_pages > 0) {
+                        new_buffer_size = PAGE_SIZE * options->read_log_pages;
+                        new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+                        if (!new_buffer)
+                                return -ENOMEM;
+                }
 
-                mi->mi_log.rl_size = buf_size / sizeof(*mi->mi_log.rl_ring_buf);
-                mi->mi_log.rl_ring_buf = kzalloc(buf_size, GFP_NOFS);
-                if (!mi->mi_log.rl_ring_buf)
-                        return -ENOMEM;
+                write_lock(&mi->mi_log.rl_access_lock);
+                kfree(mi->mi_log.rl_ring_buf);
+                WRITE_ONCE(mi->mi_log.rl_ring_buf, new_buffer);
+                WRITE_ONCE(mi->mi_log.rl_size,
+                           new_buffer_size / sizeof(*mi->mi_log.rl_ring_buf));
+                log_state = READ_ONCE(mi->mi_log.rl_state);
+                log_state.generation_id++;
+                log_state.next_index = log_state.current_pass_no = 0;
+                WRITE_ONCE(mi->mi_log.rl_state, log_state);
+                write_unlock(&mi->mi_log.rl_access_lock);
         }
+
+        mi->mi_options = *options;
         return 0;
 }
 
@@ -233,6 +252,7 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state state;
 	s64 now_us = ktime_to_us(ktime_get());
+	int rl_size;
 	struct read_log_record record = {
 		.file_id = *id,
 		.block_index = block_index,
@@ -240,20 +260,23 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 		.timestamp_us = now_us
 	};
 
-        if (log->rl_size == 0)
-                return;
-
-        spin_lock(&log->rl_writer_lock);
-        state = READ_ONCE(log->rl_state);
-        log->rl_ring_buf[state.next_index] = record;
-        if (++state.next_index == log->rl_size) {
-                state.next_index = 0;
-                ++state.current_pass_no;
+        read_lock(&log->rl_access_lock);
+        rl_size = READ_ONCE(log->rl_size);
+        if (rl_size != 0) {
+                spin_lock(&log->rl_logging_lock);
+                state = READ_ONCE(log->rl_state);
+                log->rl_ring_buf[state.next_index] = record;
+                if (++state.next_index == rl_size) {
+                        state.next_index = 0;
+                        ++state.current_pass_no;
+                }
+                WRITE_ONCE(log->rl_state, state);
+                spin_unlock(&log->rl_logging_lock);
         }
-        WRITE_ONCE(log->rl_state, state);
-        spin_unlock(&log->rl_writer_lock);
+        read_unlock(&log->rl_access_lock);
 
-        wake_up_all(&log->ml_notif_wq);
+        if (rl_size != 0)
+                wake_up_all(&log->ml_notif_wq);
 }
 
 static int validate_hash_tree(struct file *bf, struct data_file *df,
@@ -1171,9 +1194,11 @@ struct read_log_state incfs_get_log_state(struct mount_info *mi)
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state result;
 
-        spin_lock(&log->rl_writer_lock);
+        read_lock(&log->rl_access_lock);
+        spin_lock(&log->rl_logging_lock);
         result = READ_ONCE(log->rl_state);
-        spin_unlock(&log->rl_writer_lock);
+        spin_unlock(&log->rl_logging_lock);
+        read_unlock(&log->rl_access_lock);
         return result;
 }
 
@@ -1186,10 +1211,21 @@ int incfs_get_uncollected_logs_count(struct mount_info *mi,
                                      struct read_log_state state)
 {
 	struct read_log *log = &mi->mi_log;
+        struct read_log_state rl_state;
+        int rl_size;
+        u64 count;
 
-        u64 count = calc_record_count(&log->rl_state, log->rl_size) -
-                calc_record_count(&state, log->rl_size);
-        return min_t(int, count, log->rl_size);
+        read_lock(&log->rl_access_lock);
+        rl_size = READ_ONCE(log->rl_size);
+        spin_lock(&log->rl_logging_lock);
+        rl_state = READ_ONCE(log->rl_state);
+        spin_unlock(&log->rl_logging_lock);
+        read_unlock(&log->rl_access_lock);
+
+        count = calc_record_count(&rl_state, rl_size);
+        if (rl_state.generation_id == state.generation_id)
+                count -= calc_record_count(&state, rl_size);
+        return min_t(int, count, rl_size);
 }
 
 static void fill_pending_read_from_log_record(
@@ -1209,17 +1245,35 @@ int incfs_collect_logged_reads(struct mount_info *mi,
                                int reads_size)
 {
 	struct read_log *log = &mi->mi_log;
-        struct read_log_state live_state = incfs_get_log_state(mi);
-        u64 read_count = calc_record_count(reader_state, log->rl_size);
-        u64 written_count = calc_record_count(&live_state, log->rl_size);
+        struct read_log_state live_state;
         int dst_idx;
+        int rl_size;
+        int result = 0;
+        u64 read_count;
+        u64 written_count;
 
-        if (reader_state->next_index >= log->rl_size ||
-            read_count > written_count)
-                return -ERANGE;
+        read_lock(&log->rl_access_lock);
 
-        if (read_count == written_count)
-                return 0;
+        rl_size = READ_ONCE(log->rl_size);
+        spin_lock(&log->rl_logging_lock);
+        live_state = READ_ONCE(log->rl_state);
+        spin_unlock(&log->rl_logging_lock);
+
+        if (reader_state->generation_id != live_state.generation_id) {
+                reader_state->generation_id = live_state.generation_id;
+                reader_state->current_pass_no = reader_state->next_index = 0;
+        }
+
+        read_count = calc_record_count(reader_state, rl_size);
+        written_count = calc_record_count(&live_state, rl_size);
+        if (read_count == written_count) {
+                result = 0;
+                goto out;
+        }
+        if (reader_state->next_index >= rl_size) {
+                result = -ERANGE;
+                goto out;
+        }
 
 	if (read_count > written_count) {
 		/* This reader is somehow ahead of the writer. */
@@ -1227,16 +1281,17 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		*reader_state = live_state;
 	}
 
-        if (written_count - read_count > log->rl_size) {
+        if (written_count - read_count > rl_size) {
                 /*
                  * Reading pointer is too far behind,
                  * start from the record following the write pointer.
                  */
-                pr_debug("incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
+                pr_debug(
+                        "incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
                         (u32)reader_state->next_index,
                         (u32)reader_state->current_pass_no,
                         (u32)live_state.next_index,
-                        (u32)live_state.current_pass_no - 1, (u32)log->rl_size);
+                        (u32)live_state.current_pass_no - 1, (u32)rl_size);
 
 		*reader_state = (struct read_log_state){
 			.next_index = live_state.next_index,
@@ -1252,15 +1307,19 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		fill_pending_read_from_log_record(
 			&reads[dst_idx],
 			&log->rl_ring_buf[reader_state->next_index],
-                        reader_state, log->rl_size);
+                        reader_state, rl_size);
 
 		reader_state->next_index++;
-                if (reader_state->next_index == log->rl_size) {
+                if (reader_state->next_index == rl_size) {
 			reader_state->next_index = 0;
 			reader_state->current_pass_no++;
 		}
 	}
-        return dst_idx;
+        result = dst_idx;
+
+out:
+        read_unlock(&log->rl_access_lock);
+        return result;
 }
 
 bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)
fs/incfs/data_mgmt.h
@@ -31,10 +31,13 @@ struct read_log_record {
 } __packed;
 
 struct read_log_state {
-        /* Next slot in rl_ring_buf to write to. */
-        u32 next_index;
+        /* Log buffer generation id, incremented on configuration changes */
+        u32 generation_id : 8;
 
-        /* Current number of writer pass over rl_ring_buf */
+        /* Next slot in rl_ring_buf to write into. */
+        u32 next_index : 24;
+
+        /* Current number of writer passes over rl_ring_buf */
         u32 current_pass_no;
 };
 
@@ -42,11 +45,21 @@ struct read_log_state {
 struct read_log {
 	struct read_log_record *rl_ring_buf;
 
+        int rl_size;
+
 	struct read_log_state rl_state;
 
-        spinlock_t rl_writer_lock;
-
-        int rl_size;
+        /*
+         * A lock for _all_ accesses to the struct, to protect against remounts.
+         * Taken for writing when resizing the buffer.
+         */
+        rwlock_t rl_access_lock;
+
+        /*
+         * A lock to protect the actual logging - adding a new record.
+         * Note: ALWAYS taken after and under the |rl_access_lock|.
+         */
+        spinlock_t rl_logging_lock;
 
 	/*
 	 * A queue of waiters who want to be notified about reads.
fs/incfs/vfs.c
@@ -581,22 +581,27 @@ static ssize_t log_read(struct file *f, char __user *buf, size_t len,
 {
 	struct log_file_state *log_state = f->private_data;
 	struct mount_info *mi = get_mount_info(file_superblock(f));
-        struct incfs_pending_read_info *reads_buf =
-                (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
-        size_t reads_to_collect = len / sizeof(*reads_buf);
-        size_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
 	int total_reads_collected = 0;
+        int rl_size;
 	ssize_t result = 0;
+        struct incfs_pending_read_info *reads_buf;
+        ssize_t reads_to_collect = len / sizeof(*reads_buf);
+        ssize_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
+
+        rl_size = READ_ONCE(mi->mi_log.rl_size);
+        if (rl_size == 0)
+                return 0;
 
+        reads_buf = (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
 	if (!reads_buf)
 		return -ENOMEM;
 
-        reads_to_collect = min_t(size_t, mi->mi_log.rl_size, reads_to_collect);
+        reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
 	while (reads_to_collect > 0) {
 		struct read_log_state next_state = READ_ONCE(log_state->state);
 		int reads_collected = incfs_collect_logged_reads(
 			mi, &next_state, reads_buf,
-                        min_t(size_t, reads_to_collect, reads_per_page));
+                        min_t(ssize_t, reads_to_collect, reads_per_page));
 		if (reads_collected <= 0) {
 			result = total_reads_collected ?
 				total_reads_collected *