Re: [PATCH v3 45/55] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
From: Xiubo Li
Date: Mon Apr 10 2023 - 08:21:49 EST
David,
With these patches I can reproduce the hang every time, and I haven't
gotten a chance to debug it yet, the logs:
<3>[ 727.042312] INFO: task kworker/u20:4:78 blocked for more than 245
seconds.
<3>[ 727.042381] Tainted: G W 6.3.0-rc3+ #9
<3>[ 727.042417] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
<6>[ 727.042449] task:kworker/u20:4 state:D stack:0 pid:78
ppid:2 flags:0x00004000
<6>[ 727.042496] Workqueue: writeback wb_workfn (flush-ceph-1)
<6>[ 727.042558] Call Trace:
<6>[ 727.042576] <TASK>
<6>[ 727.042630] __schedule+0x4a8/0xa80
<6>[ 727.042942] ? __pfx___schedule+0x10/0x10
<6>[ 727.042997] ? __pfx___lock_release+0x10/0x10
<6>[ 727.043118] ? check_chain_key+0x205/0x2b0
<6>[ 727.043299] schedule+0x8e/0x120
<6>[ 727.043372] schedule_preempt_disabled+0x11/0x20
<6>[ 727.043420] __mutex_lock+0x97b/0x1270
<6>[ 727.043532] ? ceph_con_send+0xa4/0x310 [libceph]
<6>[ 727.044348] ? __pfx___mutex_lock+0x10/0x10
<6>[ 727.044410] ? encode_oloc+0x16e/0x1b0 [libceph]
<6>[ 727.045473] ? ceph_con_send+0xa4/0x310 [libceph]
<6>[ 727.046298] ceph_con_send+0xa4/0x310 [libceph]
<6>[ 727.047200] send_request+0x2b1/0x760 [libceph]
<6>[ 727.048122] ? __pfx___lock_release+0x10/0x10
<6>[ 727.048186] ? __lock_acquired+0x1ef/0x3d0
<6>[ 727.048264] ? __pfx_send_request+0x10/0x10 [libceph]
<6>[ 727.049104] ? check_chain_key+0x205/0x2b0
<6>[ 727.049215] ? link_request+0xcd/0x1a0 [libceph]
<6>[ 727.050047] ? do_raw_spin_unlock+0x99/0x100
<6>[ 727.050159] ? _raw_spin_unlock+0x1f/0x40
<6>[ 727.050279] __submit_request+0x2b7/0x4e0 [libceph]
<6>[ 727.051259] ceph_osdc_start_request+0x31/0x40 [libceph]
<6>[ 727.052141] ceph_writepages_start+0x1d13/0x2490 [ceph]
<6>[ 727.053258] ? __pfx_ceph_writepages_start+0x10/0x10 [ceph]
<6>[ 727.053872] ? validate_chain+0xa1/0x760
<6>[ 727.054023] ? __pfx_validate_chain+0x10/0x10
<6>[ 727.054171] ? check_chain_key+0x205/0x2b0
<6>[ 727.054308] ? __lock_acquire+0x7b3/0xfc0
<6>[ 727.054726] ? reacquire_held_locks+0x18b/0x290
<6>[ 727.054790] ? writeback_sb_inodes+0x263/0x7c0
<6>[ 727.054990] do_writepages+0x106/0x320
<6>[ 727.055068] ? __pfx_do_writepages+0x10/0x10
<6>[ 727.055152] ? __pfx___lock_release+0x10/0x10
<6>[ 727.055205] ? __lock_acquired+0x1ef/0x3d0
<6>[ 727.055314] ? check_chain_key+0x205/0x2b0
<6>[ 727.055594] __writeback_single_inode+0x95/0x450
<6>[ 727.055725] writeback_sb_inodes+0x392/0x7c0
<6>[ 727.055912] ? __pfx_writeback_sb_inodes+0x10/0x10
<6>[ 727.055994] ? __pfx_lock_acquire+0x10/0x10
<6>[ 727.056045] ? do_raw_spin_unlock+0x99/0x100
<6>[ 727.056221] ? __pfx_move_expired_inodes+0x10/0x10
<6>[ 727.056541] __writeback_inodes_wb+0x6a/0x130
<6>[ 727.056674] wb_writeback+0x45b/0x530
<6>[ 727.056780] ? __pfx_wb_writeback+0x10/0x10
<6>[ 727.056910] ? _find_next_bit+0x37/0xc0
<6>[ 727.057102] wb_do_writeback+0x434/0x4f0
<6>[ 727.057216] ? __pfx_wb_do_writeback+0x10/0x10
<6>[ 727.057445] ? __lock_acquire+0x7b3/0xfc0
<6>[ 727.057614] wb_workfn+0xe0/0x400
<6>[ 727.057692] ? __pfx_wb_workfn+0x10/0x10
<6>[ 727.057752] ? lock_acquire+0x15c/0x3e0
<6>[ 727.057802] ? process_one_work+0x436/0x990
<6>[ 727.057903] ? __pfx_lock_acquire+0x10/0x10
<6>[ 727.058039] ? check_chain_key+0x205/0x2b0
<6>[ 727.058086] ? __pfx_try_to_wake_up+0x10/0x10
<6>[ 727.058138] ? mark_held_locks+0x23/0x90
<6>[ 727.058306] process_one_work+0x505/0x990
<6>[ 727.058610] ? __pfx_process_one_work+0x10/0x10
<6>[ 727.058731] ? mark_held_locks+0x23/0x90
<6>[ 727.058827] ? worker_thread+0xce/0x670
<6>[ 727.058958] worker_thread+0x2dd/0x670
<6>[ 727.059145] ? __pfx_worker_thread+0x10/0x10
<6>[ 727.059201] kthread+0x16f/0x1a0
<6>[ 727.059252] ? __pfx_kthread+0x10/0x10
<6>[ 727.059341] ret_from_fork+0x2c/0x50
<6>[ 727.059683] </TASK>
<3>[ 727.059964] INFO: task kworker/9:6:8586 blocked for more than 245
seconds.
<3>[ 727.060031] Tainted: G W 6.3.0-rc3+ #9
<3>[ 727.060086] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
<6>[ 727.060140] task:kworker/9:6 state:D stack:0 pid:8586
ppid:2 flags:0x00004000
<6>[ 727.060211] Workqueue: events handle_osds_timeout [libceph]
<6>[ 727.060836] Call Trace:
<6>[ 727.060892] <TASK>
<6>[ 727.061089] __schedule+0x4a8/0xa80
<6>[ 727.061243] ? __pfx___schedule+0x10/0x10
<6>[ 727.061488] ? mark_held_locks+0x6b/0x90
<6>[ 727.061581] ? lockdep_hardirqs_on_prepare.part.0+0xea/0x1b0
<6>[ 727.061696] schedule+0x8e/0x120
<6>[ 727.061767] schedule_preempt_disabled+0x11/0x20
<6>[ 727.061812] rwsem_down_write_slowpath+0x2d6/0x840
<6>[ 727.061944] ? __pfx_rwsem_down_write_slowpath+0x10/0x10
<6>[ 727.062240] down_write+0x1bf/0x1d0
<6>[ 727.062307] ? __pfx_down_write+0x10/0x10
<6>[ 727.062517] ? check_chain_key+0x205/0x2b0
<6>[ 727.062635] handle_osds_timeout+0x6f/0x1b0 [libceph]
<6>[ 727.063487] process_one_work+0x505/0x990
<6>[ 727.063657] ? __pfx_process_one_work+0x10/0x10
<6>[ 727.063789] ? mark_held_locks+0x23/0x90
<6>[ 727.063913] ? worker_thread+0xce/0x670
<6>[ 727.064024] worker_thread+0x2dd/0x670
<6>[ 727.064161] ? __kthread_parkme+0xc9/0xe0
<6>[ 727.064353] ? __pfx_worker_thread+0x10/0x10
<6>[ 727.064411] kthread+0x16f/0x1a0
<6>[ 727.064462] ? __pfx_kthread+0x10/0x10
<6>[ 727.064556] ret_from_fork+0x2c/0x50
<6>[ 727.064816] </TASK>
<3>[ 727.064853] INFO: task kworker/6:6:8587 blocked for more than 245
seconds.
<3>[ 727.064917] Tainted: G W 6.3.0-rc3+ #9
<3>[ 727.064970] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
<6>[ 727.065022] task:kworker/6:6 state:D stack:0 pid:8587
ppid:2 flags:0x00004000
<6>[ 727.065081] Workqueue: events delayed_work [ceph]
<6>[ 727.065693] Call Trace:
<6>[ 727.065724] <TASK>
<6>[ 727.065833] __schedule+0x4a8/0xa80
<6>[ 727.065932] ? __pfx___schedule+0x10/0x10
<6>[ 727.065983] ? rwsem_down_read_slowpath+0x294/0x940
<6>[ 727.066112] ? mark_lock.part.0+0xd7/0x6c0
<6>[ 727.066292] ? _raw_spin_unlock_irq+0x24/0x40
<6>[ 727.066404] ? rwsem_down_read_slowpath+0x294/0x940
<6>[ 727.066460] schedule+0x8e/0x120
<6>[ 727.066529] schedule_preempt_disabled+0x11/0x20
<6>[ 727.066574] rwsem_down_read_slowpath+0x486/0x940
<6>[ 727.066686] ? __pfx_rwsem_down_read_slowpath+0x10/0x10
<6>[ 727.066885] ? lock_acquire+0x15c/0x3e0
<6>[ 727.066949] ? find_held_lock+0x8c/0xa0
<6>[ 727.066994] ? kvm_clock_read+0x14/0x30
<6>[ 727.067036] ? kvm_sched_clock_read+0x5/0x20
<6>[ 727.067269] __down_read_common+0xad/0x310
<6>[ 727.067344] ? __pfx___down_read_common+0x10/0x10
<6>[ 727.067450] ? ceph_send_cap_releases+0xbe/0x6a0 [ceph]
<6>[ 727.068071] down_read+0x7a/0x90
<6>[ 727.068140] ceph_send_cap_releases+0xbe/0x6a0 [ceph]
<6>[ 727.068869] ? mark_held_locks+0x6b/0x90
<6>[ 727.068990] ? __pfx_ceph_send_cap_releases+0x10/0x10 [ceph]
<6>[ 727.069733] delayed_work+0x30b/0x310 [ceph]
<6>[ 727.070445] process_one_work+0x505/0x990
<6>[ 727.070625] ? __pfx_process_one_work+0x10/0x10
<6>[ 727.070769] ? mark_held_locks+0x23/0x90
<6>[ 727.070838] ? worker_thread+0xce/0x670
<6>[ 727.070916] worker_thread+0x2dd/0x670
<6>[ 727.071016] ? __kthread_parkme+0xc9/0xe0
<6>[ 727.071076] ? __pfx_worker_thread+0x10/0x10
<6>[ 727.071115] kthread+0x16f/0x1a0
<6>[ 727.071150] ? __pfx_kthread+0x10/0x10
<6>[ 727.071368] ret_from_fork+0x2c/0x50
<6>[ 727.071532] </TASK>
<3>[ 727.071593] INFO: task kworker/9:18:9987 blocked for more than 245
seconds.
<3>[ 727.071642] Tainted: G W 6.3.0-rc3+ #9
<3>[ 727.071711] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
<6>[ 727.071750] task:kworker/9:18 state:D stack:0 pid:9987
ppid:2 flags:0x00004000
<6>[ 727.071797] Workqueue: events handle_timeout [libceph]
<6>[ 727.072375] Call Trace:
<6>[ 727.072397] <TASK>
<6>[ 727.072457] __schedule+0x4a8/0xa80
<6>[ 727.072536] ? __pfx___schedule+0x10/0x10
<6>[ 727.072659] ? mark_lock.part.0+0xd7/0x6c0
<6>[ 727.072711] ? _raw_spin_unlock_irq+0x24/0x40
<6>[ 727.072790] schedule+0x8e/0x120
<6>[ 727.072839] schedule_preempt_disabled+0x11/0x20
<6>[ 727.072871] rwsem_down_write_slowpath+0x2d6/0x840
<6>[ 727.072945] ? __pfx_rwsem_down_write_slowpath+0x10/0x10
<6>[ 727.073161] down_write+0x1bf/0x1d0
<6>[ 727.073424] ? __pfx_down_write+0x10/0x10
<6>[ 727.073551] handle_timeout+0x12a/0x6f0 [libceph]
<6>[ 727.074072] ? __pfx_lock_acquire+0x10/0x10
<6>[ 727.074188] ? __pfx_handle_timeout+0x10/0x10 [libceph]
<6>[ 727.075342] process_one_work+0x505/0x990
<6>[ 727.075473] ? __pfx_process_one_work+0x10/0x10
<6>[ 727.075558] ? mark_held_locks+0x23/0x90
<6>[ 727.075655] ? worker_thread+0xce/0x670
<6>[ 727.075716] worker_thread+0x2dd/0x670
<6>[ 727.075820] ? __kthread_parkme+0xc9/0xe0
<6>[ 727.075865] ? __pfx_worker_thread+0x10/0x10
<6>[ 727.075895] kthread+0x16f/0x1a0
<6>[ 727.075923] ? __pfx_kthread+0x10/0x10
<6>[ 727.076036] ret_from_fork+0x2c/0x50
<6>[ 727.076169] </TASK>
<3>[ 727.076201] INFO: task ffsb:13945 blocked for more than 245 seconds.
<3>[ 727.076239] Tainted: G W 6.3.0-rc3+ #9
<3>[ 727.076272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
<6>[ 727.076302] task:ffsb state:D stack:0 pid:13945
ppid:10657 flags:0x00000002
<6>[ 727.076340] Call Trace:
<6>[ 727.076357] <TASK>
<6>[ 727.076400] __schedule+0x4a8/0xa80
<6>[ 727.076459] ? __pfx___schedule+0x10/0x10
<6>[ 727.076489] ? asm_sysvec_apic_timer_interrupt+0x16/0x20
<6>[ 727.076629] ? rwsem_down_read_slowpath+0x294/0x940
<6>[ 727.076663] schedule+0x8e/0x120
<6>[ 727.076701] schedule_preempt_disabled+0x11/0x20
<6>[ 727.076726] rwsem_down_read_slowpath+0x486/0x940
<6>[ 727.076789] ? __pfx_rwsem_down_read_slowpath+0x10/0x10
<6>[ 727.076887] ? lock_acquire+0x15c/0x3e0
<6>[ 727.076921] ? find_held_lock+0x8c/0xa0
<6>[ 727.077014] ? kvm_clock_read+0x14/0x30
<6>[ 727.077047] ? kvm_sched_clock_read+0x5/0x20
<6>[ 727.077110] __down_read_common+0xad/0x310
<6>[ 727.077147] ? __pfx___down_read_common+0x10/0x10
<6>[ 727.077182] ? __pfx_generic_write_checks+0x10/0x10
<6>[ 727.077231] ? ceph_write_iter+0x33c/0xad0 [ceph]
<6>[ 727.077557] down_read+0x7a/0x90
<6>[ 727.077596] ceph_write_iter+0x33c/0xad0 [ceph]
<6>[ 727.078041] ? __pfx_ceph_write_iter+0x10/0x10 [ceph]
<6>[ 727.078324] ? __pfx_lock_acquire+0x10/0x10
<6>[ 727.078358] ? __might_resched+0x213/0x300
<6>[ 727.078413] ? inode_security+0x6d/0x90
<6>[ 727.078454] ? selinux_file_permission+0x1d5/0x210
<6>[ 727.078558] vfs_write+0x567/0x750
<6>[ 727.078608] ? __pfx_vfs_write+0x10/0x10
<6>[ 727.078786] ksys_write+0xc9/0x170
<6>[ 727.078824] ? __pfx_ksys_write+0x10/0x10
<6>[ 727.078857] ? ktime_get_coarse_real_ts64+0x100/0x110
<6>[ 727.078883] ? ktime_get_coarse_real_ts64+0xa4/0x110
<6>[ 727.079029] do_syscall_64+0x5c/0x90
<6>[ 727.079071] ? do_syscall_64+0x69/0x90
<6>[ 727.079105] ? lockdep_hardirqs_on_prepare.part.0+0xea/0x1b0
<6>[ 727.079151] ? do_syscall_64+0x69/0x90
<6>[ 727.079184] ? lockdep_hardirqs_on_prepare.part.0+0xea/0x1b0
<6>[ 727.079234] ? do_syscall_64+0x69/0x90
<6>[ 727.079257] ? lockdep_hardirqs_on_prepare.part.0+0xea/0x1b0
<6>[ 727.079303] entry_SYSCALL_64_after_hwframe+0x72/0xdc
<6>[ 727.079336] RIP: 0033:0x7fa52913ebcf
<6>[ 727.079361] RSP: 002b:00007fa522dfba30 EFLAGS: 00000293 ORIG_RAX:
0000000000000001
<6>[ 727.079396] RAX: ffffffffffffffda RBX: 0000000000001000 RCX:
00007fa52913ebcf
<6>[ 727.079418] RDX: 0000000000001000 RSI: 00007fa51c001450 RDI:
0000000000000004
<6>[ 727.079438] RBP: 0000000000000004 R08: 0000000000000000 R09:
0000000001cc3580
<6>[ 727.079456] R10: 00000000000001c0 R11: 0000000000000293 R12:
0000000000000000
<6>[ 727.079474] R13: 0000000001cc3580 R14: 0000000000001000 R15:
00007fa51c001450
<6>[ 727.079614] </TASK>
<6>[ 727.079633] Future hung task reports are suppressed, see sysctl
kernel.hung_task_warnings
<4>[ 727.079652]
<4>[ 727.079652] Showing all locks held in the system:
<4>[ 727.079671] 1 lock held by rcu_tasks_kthre/12:
<4>[ 727.079691] #0: ffffffff8bf1fc80
(rcu_tasks.tasks_gp_mutex){+.+.}-{4:4}, at: rcu_tasks_one_gp+0x32/0x280
<4>[ 727.079788] 1 lock held by rcu_tasks_rude_/13:
<4>[ 727.079805] #0: ffffffff8bf1f9a0
(rcu_tasks_rude.tasks_gp_mutex){+.+.}-{4:4}, at: rcu_tasks_one_gp+0x32/0x280
<4>[ 727.079896] 1 lock held by rcu_tasks_trace/14:
<4>[ 727.079976] #0: ffffffff8bf1f660
(rcu_tasks_trace.tasks_gp_mutex){+.+.}-{4:4}, at:
rcu_tasks_one_gp+0x32/0x280
<4>[ 727.080085] 6 locks held by kworker/u20:4/78:
<4>[ 727.080105] #0: ffff8884644d3148
((wq_completion)writeback){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.080192] #1: ffff888101027de8
((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.080282] #2: ffff8881022740e8
(&type->s_umount_key#62){.+.+}-{4:4}, at: trylock_super+0x16/0x70
<4>[ 727.080385] #3: ffff88811e378d70 (&osdc->lock){++++}-{4:4}, at:
ceph_osdc_start_request+0x17/0x40 [libceph]
<4>[ 727.080760] #4: ffff88810c116980 (&osd->lock){+.+.}-{4:4}, at:
__submit_request+0xfa/0x4e0 [libceph]
<4>[ 727.081139] #5: ffff88810c116178 (&con->mutex){+.+.}-{4:4}, at:
ceph_con_send+0xa4/0x310 [libceph]
<4>[ 727.081458] 3 locks held by kworker/3:1/90:
<4>[ 727.081517] #0: ffff888130cced48
((wq_completion)ceph-msgr){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.081593] #1: ffff888101117de8
((work_completion)(&(&con->work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.081668] #2: ffff88810c116178 (&con->mutex){+.+.}-{4:4}, at:
ceph_con_workfn+0x3c/0x5c0 [libceph]
<4>[ 727.082041] 1 lock held by khungtaskd/96:
<4>[ 727.082057] #0: ffffffff8bf20840 (rcu_read_lock){....}-{1:3}, at:
debug_show_all_locks+0x29/0x230
<4>[ 727.082149] 1 lock held by systemd-journal/687:
<4>[ 727.082179] 5 locks held by in:imjournal/882:
<4>[ 727.082270] 1 lock held by sshd/3493:
<4>[ 727.082320] 3 locks held by kworker/3:6/8369:
<4>[ 727.082337] #0: ffff888130cced48
((wq_completion)ceph-msgr){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.082413] #1: ffff8881105cfde8
((work_completion)(&(&con->work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.082502] #2: ffff8881803ce098 (&s->s_mutex){+.+.}-{4:4}, at:
send_mds_reconnect+0x13e/0x7c0 [ceph]
<4>[ 727.082791] 3 locks held by kworker/9:6/8586:
<4>[ 727.082807] #0: ffff888100063548
((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.082927] #1: ffff88812a767de8
((work_completion)(&(&osdc->osds_timeout_work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.083002] #2: ffff88811e378d70 (&osdc->lock){++++}-{4:4}, at:
handle_osds_timeout+0x6f/0x1b0 [libceph]
<4>[ 727.083336] 4 locks held by kworker/6:6/8587:
<4>[ 727.083352] #0: ffff888100063548
((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.083424] #1: ffff8881248a7de8
((work_completion)(&(&mdsc->delayed_work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.083509] #2: ffff8881803ce098 (&s->s_mutex){+.+.}-{4:4}, at:
delayed_work+0x1d8/0x310 [ceph]
<4>[ 727.083794] #3: ffff88811e378d70 (&osdc->lock){++++}-{4:4}, at:
ceph_send_cap_releases+0xbe/0x6a0 [ceph]
<4>[ 727.084151] 3 locks held by kworker/8:16/8663:
<4>[ 727.084170] #0: ffff888130cced48
((wq_completion)ceph-msgr){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.084245] #1: ffff88810aac7de8
((work_completion)(&(&con->work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.084316] #2: ffff888464998178 (&con->mutex){+.+.}-{4:4}, at:
ceph_con_workfn+0x3c/0x5c0 [libceph]
<4>[ 727.084684] 3 locks held by kworker/9:18/9987:
<4>[ 727.084701] #0: ffff888100063548
((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x436/0x990
<4>[ 727.084776] #1: ffff888112b17de8
((work_completion)(&(&osdc->timeout_work)->work)){+.+.}-{0:0}, at:
process_one_work+0x436/0x990
<4>[ 727.084938] #2: ffff88811e378d70 (&osdc->lock){++++}-{4:4}, at:
handle_timeout+0x12a/0x6f0 [libceph]
<4>[ 727.085274] 2 locks held by less/11897:
<4>[ 727.085291] #0: ffff8881138800a0 (&tty->ldisc_sem){++++}-{0:0},
at: tty_ldisc_ref_wait+0x24/0x70
<4>[ 727.085370] #1: ffffc900019122f8
(&ldata->atomic_read_lock){+.+.}-{4:4}, at: n_tty_read+0x849/0xa70
<4>[ 727.085469] 3 locks held by cat/13283:
<4>[ 727.085484] 4 locks held by ffsb/13945:
<4>[ 727.085497] #0: ffff88812b95ba78 (&f->f_pos_lock){+.+.}-{4:4},
at: __fdget_pos+0x75/0x80
<4>[ 727.085558] #1: ffff888102274480 (sb_writers#12){.+.+}-{0:0}, at:
ksys_write+0xc9/0x170
<4>[ 727.085654] #2: ffff88812aeca1b8
(&sb->s_type->i_mutex_key#15){++++}-{4:4}, at:
ceph_start_io_write+0x15/0x30 [ceph]
<4>[ 727.085963] #3: ffff88811e378d70 (&osdc->lock){++++}-{4:4}, at:
ceph_write_iter+0x33c/0xad0 [ceph]
<4>[ 727.086198] 5 locks held by kworker/3:0/13962:
<4>[ 727.086213]
Thanks
- Xiubo
On 4/1/23 00:09, David Howells wrote:
Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data. For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Ilya Dryomov <idryomov@xxxxxxxxx>
cc: Xiubo Li <xiubli@xxxxxxxxxx>
cc: Jeff Layton <jlayton@xxxxxxxxxx>
cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
cc: Eric Dumazet <edumazet@xxxxxxxxxx>
cc: Jakub Kicinski <kuba@xxxxxxxxxx>
cc: Paolo Abeni <pabeni@xxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: ceph-devel@xxxxxxxxxxxxxxx
cc: netdev@xxxxxxxxxxxxxxx
---
net/ceph/messenger_v2.c | 89 +++++++++--------------------------------
1 file changed, 18 insertions(+), 71 deletions(-)
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 301a991dc6a6..1637a0c21126 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
return ret;
}
-static int do_sendmsg(struct socket *sock, struct iov_iter *it)
-{
- struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
- int ret;
-
- msg.msg_iter = *it;
- while (iov_iter_count(it)) {
- ret = sock_sendmsg(sock, &msg);
- if (ret <= 0) {
- if (ret == -EAGAIN)
- ret = 0;
- return ret;
- }
-
- iov_iter_advance(it, ret);
- }
-
- WARN_ON(msg_data_left(&msg));
- return 1;
-}
-
-static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
-{
- struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
- struct bio_vec bv;
- int ret;
-
- if (WARN_ON(!iov_iter_is_bvec(it)))
- return -EINVAL;
-
- while (iov_iter_count(it)) {
- /* iov_iter_iovec() for ITER_BVEC */
- bvec_set_page(&bv, it->bvec->bv_page,
- min(iov_iter_count(it),
- it->bvec->bv_len - it->iov_offset),
- it->bvec->bv_offset + it->iov_offset);
-
- /*
- * sendpage cannot properly handle pages with
- * page_count == 0, we need to fall back to sendmsg if
- * that's the case.
- *
- * Same goes for slab pages: skb_can_coalesce() allows
- * coalescing neighboring slab objects into a single frag
- * which triggers one of hardened usercopy checks.
- */
- if (sendpage_ok(bv.bv_page)) {
- ret = sock->ops->sendpage(sock, bv.bv_page,
- bv.bv_offset, bv.bv_len,
- CEPH_MSG_FLAGS);
- } else {
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
- ret = sock_sendmsg(sock, &msg);
- }
- if (ret <= 0) {
- if (ret == -EAGAIN)
- ret = 0;
- return ret;
- }
-
- iov_iter_advance(it, ret);
- }
-
- return 1;
-}
-
/*
* Write as much as possible. The socket is expected to be corked,
* so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
*
* Return:
- * 1 - done, nothing (else) to write
+ * >0 - done, nothing (else) to write
* 0 - socket is full, need to wait
* <0 - error
*/
static int ceph_tcp_send(struct ceph_connection *con)
{
+ struct msghdr msg = {
+ .msg_iter = con->v2.out_iter,
+ .msg_flags = CEPH_MSG_FLAGS,
+ };
int ret;
+ if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
+ return -EINVAL;
+
+ if (con->v2.out_iter_sendpage)
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
- if (con->v2.out_iter_sendpage)
- ret = do_try_sendpage(con->sock, &con->v2.out_iter);
- else
- ret = do_sendmsg(con->sock, &con->v2.out_iter);
+
+ ret = sock_sendmsg(con->sock, &msg);
+ if (ret > 0)
+ iov_iter_advance(&con->v2.out_iter, ret);
+ else if (ret == -EAGAIN)
+ ret = 0;
+
dout("%s con %p ret %d left %zu\n", __func__, con, ret,
iov_iter_count(&con->v2.out_iter));
return ret;