Drivers: hv: vmbus: Offload the handling of channels to two workqueues
commit 37c2578c0c40e286bc0d30bdc05290b2058cf66e upstream. vmbus_process_offer() mustn't call channel->sc_creation_callback() directly for sub-channels, because sc_creation_callback() -> vmbus_open() may never get the host's response to the OPEN_CHANNEL message (the host may rescind a channel at any time, e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind() may not wake up the vmbus_open() as it's blocked due to a non-zero vmbus_connection.offer_in_progress, and finally we have a deadlock. The above is also true for primary channels, if the related device drivers use sync probing mode by default. And, usually the handling of primary channels and sub-channels can depend on each other, so we should offload them to different workqueues to avoid possible deadlock, e.g. in sync-probing mode, NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() -> rtnl_lock(), and causes deadlock: the former gets the rtnl_lock and waits for all the sub-channels to appear, but the latter can't get the rtnl_lock and this blocks the handling of sub-channels. The patch can fix the multiple-NIC deadlock described above for v3.x kernels (e.g. RHEL 7.x) which don't support async-probing of devices, and v4.4, v4.9, v4.14 and v4.18 which support async-probing but don't enable async-probing for Hyper-V drivers (yet). The patch can also fix the hang issue in sub-channel's handling described above for all versions of kernels, including v4.19 and v4.20-rc4. So actually the patch should be applied to all the existing kernels, not only the kernels that have8195b1396e. Fixes:8195b1396e("hv_netvsc: fix deadlock on hotplug") Cc: stable@vger.kernel.org Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: K. Y. Srinivasan <kys@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Signed-off-by: Dexuan Cui <decui@microsoft.com> Signed-off-by: K. Y. Srinivasan <kys@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
committed by
Greg Kroah-Hartman
parent
e88ebc06bd
commit
b02b86bc74
@@ -447,61 +447,16 @@ void vmbus_free_channels(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* Note: the function can run concurrently for primary/sub channels. */
|
||||||
* vmbus_process_offer - Process the offer by creating a channel/device
|
static void vmbus_add_channel_work(struct work_struct *work)
|
||||||
* associated with this offer
|
|
||||||
*/
|
|
||||||
static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
|
||||||
{
|
{
|
||||||
struct vmbus_channel *channel;
|
struct vmbus_channel *newchannel =
|
||||||
bool fnew = true;
|
container_of(work, struct vmbus_channel, add_channel_work);
|
||||||
|
struct vmbus_channel *primary_channel = newchannel->primary_channel;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
u16 dev_type;
|
u16 dev_type;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/* Make sure this is a new offer */
|
|
||||||
mutex_lock(&vmbus_connection.channel_mutex);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Now that we have acquired the channel_mutex,
|
|
||||||
* we can release the potentially racing rescind thread.
|
|
||||||
*/
|
|
||||||
atomic_dec(&vmbus_connection.offer_in_progress);
|
|
||||||
|
|
||||||
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
|
|
||||||
if (!uuid_le_cmp(channel->offermsg.offer.if_type,
|
|
||||||
newchannel->offermsg.offer.if_type) &&
|
|
||||||
!uuid_le_cmp(channel->offermsg.offer.if_instance,
|
|
||||||
newchannel->offermsg.offer.if_instance)) {
|
|
||||||
fnew = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fnew)
|
|
||||||
list_add_tail(&newchannel->listentry,
|
|
||||||
&vmbus_connection.chn_list);
|
|
||||||
|
|
||||||
mutex_unlock(&vmbus_connection.channel_mutex);
|
|
||||||
|
|
||||||
if (!fnew) {
|
|
||||||
/*
|
|
||||||
* Check to see if this is a sub-channel.
|
|
||||||
*/
|
|
||||||
if (newchannel->offermsg.offer.sub_channel_index != 0) {
|
|
||||||
/*
|
|
||||||
* Process the sub-channel.
|
|
||||||
*/
|
|
||||||
newchannel->primary_channel = channel;
|
|
||||||
spin_lock_irqsave(&channel->lock, flags);
|
|
||||||
list_add_tail(&newchannel->sc_list, &channel->sc_list);
|
|
||||||
channel->num_sc++;
|
|
||||||
spin_unlock_irqrestore(&channel->lock, flags);
|
|
||||||
} else {
|
|
||||||
goto err_free_chan;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
dev_type = hv_get_dev_type(newchannel);
|
dev_type = hv_get_dev_type(newchannel);
|
||||||
|
|
||||||
init_vp_index(newchannel, dev_type);
|
init_vp_index(newchannel, dev_type);
|
||||||
@@ -519,27 +474,26 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
|||||||
/*
|
/*
|
||||||
* This state is used to indicate a successful open
|
* This state is used to indicate a successful open
|
||||||
* so that when we do close the channel normally, we
|
* so that when we do close the channel normally, we
|
||||||
* can cleanup properly
|
* can cleanup properly.
|
||||||
*/
|
*/
|
||||||
newchannel->state = CHANNEL_OPEN_STATE;
|
newchannel->state = CHANNEL_OPEN_STATE;
|
||||||
|
|
||||||
if (!fnew) {
|
if (primary_channel != NULL) {
|
||||||
struct hv_device *dev
|
/* newchannel is a sub-channel. */
|
||||||
= newchannel->primary_channel->device_obj;
|
struct hv_device *dev = primary_channel->device_obj;
|
||||||
|
|
||||||
if (vmbus_add_channel_kobj(dev, newchannel))
|
if (vmbus_add_channel_kobj(dev, newchannel))
|
||||||
goto err_free_chan;
|
goto err_deq_chan;
|
||||||
|
|
||||||
|
if (primary_channel->sc_creation_callback != NULL)
|
||||||
|
primary_channel->sc_creation_callback(newchannel);
|
||||||
|
|
||||||
if (channel->sc_creation_callback != NULL)
|
|
||||||
channel->sc_creation_callback(newchannel);
|
|
||||||
newchannel->probe_done = true;
|
newchannel->probe_done = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Start the process of binding this offer to the driver
|
* Start the process of binding the primary channel to the driver
|
||||||
* We need to set the DeviceObject field before calling
|
|
||||||
* vmbus_child_dev_add()
|
|
||||||
*/
|
*/
|
||||||
newchannel->device_obj = vmbus_device_create(
|
newchannel->device_obj = vmbus_device_create(
|
||||||
&newchannel->offermsg.offer.if_type,
|
&newchannel->offermsg.offer.if_type,
|
||||||
@@ -568,13 +522,28 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
|||||||
|
|
||||||
err_deq_chan:
|
err_deq_chan:
|
||||||
mutex_lock(&vmbus_connection.channel_mutex);
|
mutex_lock(&vmbus_connection.channel_mutex);
|
||||||
list_del(&newchannel->listentry);
|
|
||||||
|
/*
|
||||||
|
* We need to set the flag, otherwise
|
||||||
|
* vmbus_onoffer_rescind() can be blocked.
|
||||||
|
*/
|
||||||
|
newchannel->probe_done = true;
|
||||||
|
|
||||||
|
if (primary_channel == NULL) {
|
||||||
|
list_del(&newchannel->listentry);
|
||||||
|
} else {
|
||||||
|
spin_lock_irqsave(&primary_channel->lock, flags);
|
||||||
|
list_del(&newchannel->sc_list);
|
||||||
|
spin_unlock_irqrestore(&primary_channel->lock, flags);
|
||||||
|
}
|
||||||
|
|
||||||
mutex_unlock(&vmbus_connection.channel_mutex);
|
mutex_unlock(&vmbus_connection.channel_mutex);
|
||||||
|
|
||||||
if (newchannel->target_cpu != get_cpu()) {
|
if (newchannel->target_cpu != get_cpu()) {
|
||||||
put_cpu();
|
put_cpu();
|
||||||
smp_call_function_single(newchannel->target_cpu,
|
smp_call_function_single(newchannel->target_cpu,
|
||||||
percpu_channel_deq, newchannel, true);
|
percpu_channel_deq,
|
||||||
|
newchannel, true);
|
||||||
} else {
|
} else {
|
||||||
percpu_channel_deq(newchannel);
|
percpu_channel_deq(newchannel);
|
||||||
put_cpu();
|
put_cpu();
|
||||||
@@ -582,14 +551,104 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
|||||||
|
|
||||||
vmbus_release_relid(newchannel->offermsg.child_relid);
|
vmbus_release_relid(newchannel->offermsg.child_relid);
|
||||||
|
|
||||||
err_free_chan:
|
|
||||||
free_channel(newchannel);
|
free_channel(newchannel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vmbus_process_offer - Process the offer by creating a channel/device
|
||||||
|
* associated with this offer
|
||||||
|
*/
|
||||||
|
static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
||||||
|
{
|
||||||
|
struct vmbus_channel *channel;
|
||||||
|
struct workqueue_struct *wq;
|
||||||
|
unsigned long flags;
|
||||||
|
bool fnew = true;
|
||||||
|
|
||||||
|
mutex_lock(&vmbus_connection.channel_mutex);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Now that we have acquired the channel_mutex,
|
||||||
|
* we can release the potentially racing rescind thread.
|
||||||
|
*/
|
||||||
|
atomic_dec(&vmbus_connection.offer_in_progress);
|
||||||
|
|
||||||
|
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
|
||||||
|
if (!uuid_le_cmp(channel->offermsg.offer.if_type,
|
||||||
|
newchannel->offermsg.offer.if_type) &&
|
||||||
|
!uuid_le_cmp(channel->offermsg.offer.if_instance,
|
||||||
|
newchannel->offermsg.offer.if_instance)) {
|
||||||
|
fnew = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fnew)
|
||||||
|
list_add_tail(&newchannel->listentry,
|
||||||
|
&vmbus_connection.chn_list);
|
||||||
|
else {
|
||||||
|
/*
|
||||||
|
* Check to see if this is a valid sub-channel.
|
||||||
|
*/
|
||||||
|
if (newchannel->offermsg.offer.sub_channel_index == 0) {
|
||||||
|
mutex_unlock(&vmbus_connection.channel_mutex);
|
||||||
|
/*
|
||||||
|
* Don't call free_channel(), because newchannel->kobj
|
||||||
|
* is not initialized yet.
|
||||||
|
*/
|
||||||
|
kfree(newchannel);
|
||||||
|
WARN_ON_ONCE(1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Process the sub-channel.
|
||||||
|
*/
|
||||||
|
newchannel->primary_channel = channel;
|
||||||
|
spin_lock_irqsave(&channel->lock, flags);
|
||||||
|
list_add_tail(&newchannel->sc_list, &channel->sc_list);
|
||||||
|
spin_unlock_irqrestore(&channel->lock, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
mutex_unlock(&vmbus_connection.channel_mutex);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vmbus_process_offer() mustn't call channel->sc_creation_callback()
|
||||||
|
* directly for sub-channels, because sc_creation_callback() ->
|
||||||
|
* vmbus_open() may never get the host's response to the
|
||||||
|
* OPEN_CHANNEL message (the host may rescind a channel at any time,
|
||||||
|
* e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind()
|
||||||
|
* may not wake up the vmbus_open() as it's blocked due to a non-zero
|
||||||
|
* vmbus_connection.offer_in_progress, and finally we have a deadlock.
|
||||||
|
*
|
||||||
|
* The above is also true for primary channels, if the related device
|
||||||
|
* drivers use sync probing mode by default.
|
||||||
|
*
|
||||||
|
* And, usually the handling of primary channels and sub-channels can
|
||||||
|
* depend on each other, so we should offload them to different
|
||||||
|
* workqueues to avoid possible deadlock, e.g. in sync-probing mode,
|
||||||
|
* NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
|
||||||
|
* rtnl_lock(), and causes deadlock: the former gets the rtnl_lock
|
||||||
|
* and waits for all the sub-channels to appear, but the latter
|
||||||
|
* can't get the rtnl_lock and this blocks the handling of
|
||||||
|
* sub-channels.
|
||||||
|
*/
|
||||||
|
INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work);
|
||||||
|
wq = fnew ? vmbus_connection.handle_primary_chan_wq :
|
||||||
|
vmbus_connection.handle_sub_chan_wq;
|
||||||
|
queue_work(wq, &newchannel->add_channel_work);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We use this state to statically distribute the channel interrupt load.
|
* We use this state to statically distribute the channel interrupt load.
|
||||||
*/
|
*/
|
||||||
static int next_numa_node_id;
|
static int next_numa_node_id;
|
||||||
|
/*
|
||||||
|
* init_vp_index() accesses global variables like next_numa_node_id, and
|
||||||
|
* it can run concurrently for primary channels and sub-channels: see
|
||||||
|
* vmbus_process_offer(), so we need the lock to protect the global
|
||||||
|
* variables.
|
||||||
|
*/
|
||||||
|
static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Starting with Win8, we can statically distribute the incoming
|
* Starting with Win8, we can statically distribute the incoming
|
||||||
@@ -625,6 +684,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
spin_lock(&bind_channel_to_cpu_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Based on the channel affinity policy, we will assign the NUMA
|
* Based on the channel affinity policy, we will assign the NUMA
|
||||||
* nodes.
|
* nodes.
|
||||||
@@ -707,6 +768,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
|
|||||||
channel->target_cpu = cur_cpu;
|
channel->target_cpu = cur_cpu;
|
||||||
channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
|
channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
|
||||||
|
|
||||||
|
spin_unlock(&bind_channel_to_cpu_lock);
|
||||||
|
|
||||||
free_cpumask_var(available_mask);
|
free_cpumask_var(available_mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -190,6 +190,20 @@ int vmbus_connect(void)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vmbus_connection.handle_primary_chan_wq =
|
||||||
|
create_workqueue("hv_pri_chan");
|
||||||
|
if (!vmbus_connection.handle_primary_chan_wq) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
vmbus_connection.handle_sub_chan_wq =
|
||||||
|
create_workqueue("hv_sub_chan");
|
||||||
|
if (!vmbus_connection.handle_sub_chan_wq) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
INIT_LIST_HEAD(&vmbus_connection.chn_msg_list);
|
INIT_LIST_HEAD(&vmbus_connection.chn_msg_list);
|
||||||
spin_lock_init(&vmbus_connection.channelmsg_lock);
|
spin_lock_init(&vmbus_connection.channelmsg_lock);
|
||||||
|
|
||||||
@@ -280,10 +294,14 @@ void vmbus_disconnect(void)
|
|||||||
*/
|
*/
|
||||||
vmbus_initiate_unload(false);
|
vmbus_initiate_unload(false);
|
||||||
|
|
||||||
if (vmbus_connection.work_queue) {
|
if (vmbus_connection.handle_sub_chan_wq)
|
||||||
drain_workqueue(vmbus_connection.work_queue);
|
destroy_workqueue(vmbus_connection.handle_sub_chan_wq);
|
||||||
|
|
||||||
|
if (vmbus_connection.handle_primary_chan_wq)
|
||||||
|
destroy_workqueue(vmbus_connection.handle_primary_chan_wq);
|
||||||
|
|
||||||
|
if (vmbus_connection.work_queue)
|
||||||
destroy_workqueue(vmbus_connection.work_queue);
|
destroy_workqueue(vmbus_connection.work_queue);
|
||||||
}
|
|
||||||
|
|
||||||
if (vmbus_connection.int_page) {
|
if (vmbus_connection.int_page) {
|
||||||
free_pages((unsigned long)vmbus_connection.int_page, 0);
|
free_pages((unsigned long)vmbus_connection.int_page, 0);
|
||||||
|
|||||||
@@ -335,7 +335,14 @@ struct vmbus_connection {
|
|||||||
struct list_head chn_list;
|
struct list_head chn_list;
|
||||||
struct mutex channel_mutex;
|
struct mutex channel_mutex;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* An offer message is handled first on the work_queue, and then
|
||||||
|
* is further handled on handle_primary_chan_wq or
|
||||||
|
* handle_sub_chan_wq.
|
||||||
|
*/
|
||||||
struct workqueue_struct *work_queue;
|
struct workqueue_struct *work_queue;
|
||||||
|
struct workqueue_struct *handle_primary_chan_wq;
|
||||||
|
struct workqueue_struct *handle_sub_chan_wq;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -904,6 +904,13 @@ struct vmbus_channel {
|
|||||||
|
|
||||||
bool probe_done;
|
bool probe_done;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We must offload the handling of the primary/sub channels
|
||||||
|
* from the single-threaded vmbus_connection.work_queue to
|
||||||
|
* two different workqueue, otherwise we can block
|
||||||
|
* vmbus_connection.work_queue and hang: see vmbus_process_offer().
|
||||||
|
*/
|
||||||
|
struct work_struct add_channel_work;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline bool is_hvsock_channel(const struct vmbus_channel *c)
|
static inline bool is_hvsock_channel(const struct vmbus_channel *c)
|
||||||
|
|||||||
Reference in New Issue
Block a user