diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index 3d97b8d8f7354..4b30daee399af 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -215,3 +215,29 @@ following flags are defined: This flag causes nfsd to close any open files for this inode _before_ calling into the vfs to do an unlink or a rename that would replace an existing file. + + EXPORT_OP_REMOTE_FS - Backing storage for this filesystem is remote + PF_LOCAL_THROTTLE exists for loopback NFSD, where a thread needs to + write to one bdi (the final bdi) in order to free up writes queued + to another bdi (the client bdi). Such threads get a private balance + of dirty pages so that dirty pages for the client bdi do not impact + the daemon writing to the final bdi. For filesystems whose durable + storage is not local (such as exported NFS filesystems), this + constraint has negative consequences. EXPORT_OP_REMOTE_FS enables + an export to disable writeback throttling. + + EXPORT_OP_NOATOMIC_ATTR - Filesystem does not update attributes atomically + EXPORT_OP_NOATOMIC_ATTR indicates that the exported filesystem + cannot provide the semantics required by the "atomic" boolean in + NFSv4's change_info4. This boolean indicates to a client whether the + returned before and after change attributes were obtained atomically + with respect to the requested metadata operation (UNLINK, + OPEN/CREATE, MKDIR, etc). + + EXPORT_OP_FLUSH_ON_CLOSE - Filesystem flushes file data on close(2) + On most filesystems, inodes can remain under writeback after the + file is closed. NFSD relies on client activity or local flusher + threads to handle writeback. Certain filesystems, such as NFS, flush + all of an inode's dirty data on last close. Exports that behave this + way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip + waiting for writeback when closing such files. diff --git a/arch/arm/common/locomo.c b/arch/arm/common/locomo.c index 309b747834684..70480dd9e96db 100644 --- a/arch/arm/common/locomo.c +++ b/arch/arm/common/locomo.c @@ -350,19 +350,6 @@ static int locomo_resume(struct platform_device *dev) } #endif - -/** - * locomo_probe - probe for a single LoCoMo chip. - * @phys_addr: physical address of device. - * - * Probe for a LoCoMo chip. This must be called - * before any other locomo-specific code. - * - * Returns: - * %-ENODEV device not found. - * %-EBUSY physical address already marked in-use. - * %0 successful. - */ static int __locomo_probe(struct device *me, struct resource *mem, int irq) { @@ -479,6 +466,21 @@ static void __locomo_remove(struct locomo *lchip) kfree(lchip); } +/** + * locomo_probe - probe for a single LoCoMo chip. + * @dev: platform device + * + * Probe for a LoCoMo chip. This must be called + * before any other locomo-specific code. + * + * Returns: + * * %-EINVAL - device's IORESOURCE_MEM not found + * * %-ENXIO - could not allocate an IRQ for the device + * * %-ENODEV - device not found. + * * %-EBUSY - physical address already marked in-use. + * * %-ENOMEM - could not allocate or iomap memory. + * * %0 - successful.
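The three flags documented in the exporting.rst hunk above live in include/linux/exportfs.h and are advertised through the flags field of a filesystem's struct export_operations. A minimal sketch of how an export might opt in (the structure name and the elided members are illustrative only; NFS, for example, sets all three, since its durable storage is remote and it flushes dirty data on last close):

	#include <linux/exportfs.h>

	static const struct export_operations example_export_ops = {
		/* ... encode_fh, fh_to_dentry, etc. ... */
		.flags = EXPORT_OP_REMOTE_FS |		/* backing store is not local */
			 EXPORT_OP_NOATOMIC_ATTR |	/* change_info4 is never atomic */
			 EXPORT_OP_FLUSH_ON_CLOSE,	/* dirty data is flushed on last close */
	};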
+ */ static int locomo_probe(struct platform_device *dev) { struct resource *mem; diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 7f092cb55a417..943ffcf069d29 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -40,6 +40,7 @@ struct task_struct; DECLARE_PER_CPU(struct task_struct *, __entry_task); #include <asm/types.h> +#include <asm/traps.h> struct cpu_context_save { __u32 r4; @@ -66,7 +67,6 @@ struct thread_info { __u32 cpu_domain; /* cpu domain */ struct cpu_context_save cpu_context; /* cpu context */ __u32 abi_syscall; /* ABI type and syscall nr */ - __u8 used_cp[16]; /* thread used copro */ unsigned long tp_value[2]; /* TLS registers */ union fp_state fpstate __attribute__((aligned(8))); union vfp_state vfpstate; @@ -105,6 +105,21 @@ extern void iwmmxt_task_restore(struct thread_info *, void *); extern void iwmmxt_task_release(struct thread_info *); extern void iwmmxt_task_switch(struct thread_info *); +extern int iwmmxt_undef_handler(struct pt_regs *, u32); + +static inline void register_iwmmxt_undef_handler(void) +{ + static struct undef_hook iwmmxt_undef_hook = { + .instr_mask = 0x0c000e00, + .instr_val = 0x0c000000, + .cpsr_mask = MODE_MASK | PSR_T_BIT, + .cpsr_val = USR_MODE, + .fn = iwmmxt_undef_handler, + }; + + register_undef_hook(&iwmmxt_undef_hook); +} + extern void vfp_sync_hwstate(struct thread_info *); extern void vfp_flush_hwstate(struct thread_info *); diff --git a/arch/arm/include/asm/vfp.h b/arch/arm/include/asm/vfp.h index 5b57b8768bacc..157ea34261586 100644 --- a/arch/arm/include/asm/vfp.h +++ b/arch/arm/include/asm/vfp.h @@ -102,7 +102,6 @@ #ifndef __ASSEMBLY__ void vfp_disable(void); -void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs); #endif #endif /* __ASM_VFP_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 6a80d4be743b4..219cbc7e5d134 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -47,7 +47,6 @@ int main(void) DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); DEFINE(TI_ABI_SYSCALL, offsetof(struct thread_info, abi_syscall)); - DEFINE(TI_USED_CP, offsetof(struct thread_info, used_cp)); DEFINE(TI_TP_VALUE, offsetof(struct thread_info, tp_value)); DEFINE(TI_FPSTATE, offsetof(struct thread_info, fpstate)); #ifdef CONFIG_VFP diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 76e8125d05d26..6150a716828c3 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -446,258 +446,26 @@ ENDPROC(__irq_usr) __und_usr: usr_entry uaccess=0 - mov r2, r4 - mov r3, r5 - - @ r2 = regs->ARM_pc, which is either 2 or 4 bytes ahead of the - @ faulting instruction depending on Thumb mode. - @ r3 = regs->ARM_cpsr - @ - @ The emulation code returns using r9 if it has emulated the - @ instruction, or the more conventional lr if we are to treat - @ this as a real undefined instruction - @ - badr r9, ret_from_exception - @ IRQs must be enabled before attempting to read the instruction from @ user space since that could cause a page/translation fault if the @ page table was modified by another CPU. enable_irq - tst r3, #PSR_T_BIT @ Thumb mode? 
- bne __und_usr_thumb - sub r4, r2, #4 @ ARM instr at LR - 4 -1: ldrt r0, [r4] - ARM_BE8(rev r0, r0) @ little endian instruction - - uaccess_disable ip - - @ r0 = 32-bit ARM instruction which caused the exception - @ r2 = PC value for the following instruction (:= regs->ARM_pc) - @ r4 = PC value for the faulting instruction - @ lr = 32-bit undefined instruction function - badr lr, __und_usr_fault_32 - b call_fpe - -__und_usr_thumb: - @ Thumb instruction - sub r4, r2, #2 @ First half of thumb instr at LR - 2 -#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7 -/* - * Thumb-2 instruction handling. Note that because pre-v6 and >= v6 platforms - * can never be supported in a single kernel, this code is not applicable at - * all when __LINUX_ARM_ARCH__ < 6. This allows simplifying assumptions to be - * made about .arch directives. - */ -#if __LINUX_ARM_ARCH__ < 7 -/* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */ - ldr_va r5, cpu_architecture - cmp r5, #CPU_ARCH_ARMv7 - blo __und_usr_fault_16 @ 16bit undefined instruction -/* - * The following code won't get run unless the running CPU really is v7, so - * coding round the lack of ldrht on older arches is pointless. Temporarily - * override the assembler target arch with the minimum required instead: - */ - .arch armv6t2 + tst r5, #PSR_T_BIT @ Thumb mode? + mov r1, #2 @ set insn size to 2 for Thumb + bne 0f @ handle as Thumb undef exception +#ifdef CONFIG_FPE_NWFPE + adr r9, ret_from_exception + bl call_fpe @ returns via R9 on success #endif -2: ldrht r5, [r4] -ARM_BE8(rev16 r5, r5) @ little endian instruction - cmp r5, #0xe800 @ 32bit instruction if xx != 0 - blo __und_usr_fault_16_pan @ 16bit undefined instruction -3: ldrht r0, [r2] -ARM_BE8(rev16 r0, r0) @ little endian instruction + mov r1, #4 @ set insn size to 4 for ARM +0: mov r0, sp uaccess_disable ip - add r2, r2, #2 @ r2 is PC + 2, make it PC + 4 - str r2, [sp, #S_PC] @ it's a 2x16bit instr, update - orr r0, r0, r5, lsl #16 - badr lr, __und_usr_fault_32 - @ r0 = the two 16-bit Thumb instructions which caused the exception - @ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc) - @ r4 = PC value for the first 16-bit Thumb instruction - @ lr = 32bit undefined instruction function - -#if __LINUX_ARM_ARCH__ < 7 -/* If the target arch was overridden, change it back: */ -#ifdef CONFIG_CPU_32v6K - .arch armv6k -#else - .arch armv6 -#endif -#endif /* __LINUX_ARM_ARCH__ < 7 */ -#else /* !(CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7) */ - b __und_usr_fault_16 -#endif + bl __und_fault + b ret_from_exception UNWIND(.fnend) ENDPROC(__und_usr) -/* - * The out of line fixup for the ldrt instructions above. - */ - .pushsection .text.fixup, "ax" - .align 2 -4: str r4, [sp, #S_PC] @ retry current instruction - ret r9 - .popsection - .pushsection __ex_table,"a" - .long 1b, 4b -#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7 - .long 2b, 4b - .long 3b, 4b -#endif - .popsection - -/* - * Check whether the instruction is a co-processor instruction. - * If yes, we need to call the relevant co-processor handler. - * - * Note that we don't do a full check here for the co-processor - * instructions; all instructions with bit 27 set are well - * defined. The only instructions that should fault are the - * co-processor instructions. However, we have to watch out - * for the ARM6/ARM7 SWI bug. - * - * NEON is a special case that has to be handled here. 
Not all - * NEON instructions are co-processor instructions, so we have - * to make a special case of checking for them. Plus, there's - * five groups of them, so we have a table of mask/opcode pairs - * to check against, and if any match then we branch off into the - * NEON handler code. - * - * Emulators may wish to make use of the following registers: - * r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) - * r2 = PC value to resume execution after successful emulation - * r9 = normal "successful" return address - * r10 = this threads thread_info structure - * lr = unrecognised instruction return address - * IRQs enabled, FIQs enabled. - */ - @ - @ Fall-through from Thumb-2 __und_usr - @ -#ifdef CONFIG_NEON - get_thread_info r10 @ get current thread - adr r6, .LCneon_thumb_opcodes - b 2f -#endif -call_fpe: - get_thread_info r10 @ get current thread -#ifdef CONFIG_NEON - adr r6, .LCneon_arm_opcodes -2: ldr r5, [r6], #4 @ mask value - ldr r7, [r6], #4 @ opcode bits matching in mask - cmp r5, #0 @ end mask? - beq 1f - and r8, r0, r5 - cmp r8, r7 @ NEON instruction? - bne 2b - mov r7, #1 - strb r7, [r10, #TI_USED_CP + 10] @ mark CP#10 as used - strb r7, [r10, #TI_USED_CP + 11] @ mark CP#11 as used - b do_vfp @ let VFP handler handle this -1: -#endif - tst r0, #0x08000000 @ only CDP/CPRT/LDC/STC have bit 27 - tstne r0, #0x04000000 @ bit 26 set on both ARM and Thumb-2 - reteq lr - and r8, r0, #0x00000f00 @ mask out CP number - mov r7, #1 - add r6, r10, r8, lsr #8 @ add used_cp[] array offset first - strb r7, [r6, #TI_USED_CP] @ set appropriate used_cp[] -#ifdef CONFIG_IWMMXT - @ Test if we need to give access to iWMMXt coprocessors - ldr r5, [r10, #TI_FLAGS] - rsbs r7, r8, #(1 << 8) @ CP 0 or 1 only - movscs r7, r5, lsr #(TIF_USING_IWMMXT + 1) - bcs iwmmxt_task_enable -#endif - ARM( add pc, pc, r8, lsr #6 ) - THUMB( lsr r8, r8, #6 ) - THUMB( add pc, r8 ) - nop - - ret.w lr @ CP#0 - W(b) do_fpe @ CP#1 (FPE) - W(b) do_fpe @ CP#2 (FPE) - ret.w lr @ CP#3 - ret.w lr @ CP#4 - ret.w lr @ CP#5 - ret.w lr @ CP#6 - ret.w lr @ CP#7 - ret.w lr @ CP#8 - ret.w lr @ CP#9 -#ifdef CONFIG_VFP - W(b) do_vfp @ CP#10 (VFP) - W(b) do_vfp @ CP#11 (VFP) -#else - ret.w lr @ CP#10 (VFP) - ret.w lr @ CP#11 (VFP) -#endif - ret.w lr @ CP#12 - ret.w lr @ CP#13 - ret.w lr @ CP#14 (Debug) - ret.w lr @ CP#15 (Control) - -#ifdef CONFIG_NEON - .align 6 - -.LCneon_arm_opcodes: - .word 0xfe000000 @ mask - .word 0xf2000000 @ opcode - - .word 0xff100000 @ mask - .word 0xf4000000 @ opcode - - .word 0x00000000 @ mask - .word 0x00000000 @ opcode - -.LCneon_thumb_opcodes: - .word 0xef000000 @ mask - .word 0xef000000 @ opcode - - .word 0xff100000 @ mask - .word 0xf9000000 @ opcode - - .word 0x00000000 @ mask - .word 0x00000000 @ opcode -#endif - -do_fpe: - add r10, r10, #TI_FPSTATE @ r10 = workspace - ldr_va pc, fp_enter, tmp=r4 @ Call FP module USR entry point - -/* - * The FP module is called with these registers set: - * r0 = instruction - * r2 = PC+4 - * r9 = normal "successful" return address - * r10 = FP workspace - * lr = unrecognised FP instruction return address - */ - - .pushsection .data - .align 2 -ENTRY(fp_enter) - .word no_fp - .popsection - -ENTRY(no_fp) - ret lr -ENDPROC(no_fp) - -__und_usr_fault_32: - mov r1, #4 - b 1f -__und_usr_fault_16_pan: - uaccess_disable ip -__und_usr_fault_16: - mov r1, #2 -1: mov r0, sp - badr lr, ret_from_exception - b __und_fault -ENDPROC(__und_usr_fault_32) -ENDPROC(__und_usr_fault_16) - .align 5 __pabt_usr: usr_entry diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S 
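The per-coprocessor dispatch removed above (with its NEON and iWMMXt special cases) is replaced throughout this series by C-level undefined-instruction hooks: register_iwmmxt_undef_handler() in the thread_info.h hunk earlier and the VFP/NEON hooks in the vfpmodule.c hunks below. A rough sketch of the mechanism, assuming only what asm/traps.h provides (the handler name and body here are hypothetical; the mask/value pair is the coprocessor-access pattern the iwmmxt hook uses): a hook is matched against the trapped opcode and CPSR, and its handler returns 0 once the instruction has been handled or emulated, or nonzero to pass it on.

	#include <asm/ptrace.h>	/* MODE_MASK, PSR_T_BIT, USR_MODE */
	#include <asm/traps.h>	/* struct undef_hook, register_undef_hook() */

	/*
	 * Hypothetical handler: return 0 if the instruction was handled
	 * (e.g. the coprocessor was enabled and the faulting instruction
	 * will be retried), nonzero to fall through to the next hook or
	 * to the default SIGILL path.
	 */
	static int example_cp_handler(struct pt_regs *regs, unsigned int instr)
	{
		return 0;
	}

	static struct undef_hook example_cp_hook = {
		.instr_mask	= 0x0c000e00,	/* coprocessor load/store/move space */
		.instr_val	= 0x0c000000,
		.cpsr_mask	= MODE_MASK | PSR_T_BIT,
		.cpsr_val	= USR_MODE,	/* ARM-state user mode only */
		.fn		= example_cp_handler,
	};

	static int __init example_hook_init(void)
	{
		register_undef_hook(&example_cp_hook);
		return 0;
	}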
index d2b4ac06e4ed8..a0218c4867b9b 100644 --- a/arch/arm/kernel/iwmmxt.S +++ b/arch/arm/kernel/iwmmxt.S @@ -58,9 +58,19 @@ .text .arm +ENTRY(iwmmxt_undef_handler) + push {r9, r10, lr} + get_thread_info r10 + mov r9, pc + b iwmmxt_task_enable + mov r0, #0 + pop {r9, r10, pc} +ENDPROC(iwmmxt_undef_handler) + /* * Lazy switching of Concan coprocessor context * + * r0 = struct pt_regs pointer * r10 = struct thread_info pointer * r9 = ret_from_exception * lr = undefined instr exit @@ -84,12 +94,12 @@ ENTRY(iwmmxt_task_enable) PJ4(mcr p15, 0, r2, c1, c0, 2) ldr r3, =concan_owner - add r0, r10, #TI_IWMMXT_STATE @ get task Concan save area - ldr r2, [sp, #60] @ current task pc value + ldr r2, [r0, #S_PC] @ current task pc value ldr r1, [r3] @ get current Concan owner - str r0, [r3] @ this task now owns Concan regs sub r2, r2, #4 @ adjust pc back - str r2, [sp, #60] + str r2, [r0, #S_PC] + add r0, r10, #TI_IWMMXT_STATE @ get task Concan save area + str r0, [r3] @ this task now owns Concan regs mrc p15, 0, r2, c2, c0, 0 mov r2, r2 @ cpwait diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 46364b699cc30..5d07cf9e0044d 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -94,16 +94,28 @@ static void machine_crash_nonpanic_core(void *unused) } } +static DEFINE_PER_CPU(call_single_data_t, cpu_stop_csd) = + CSD_INIT(machine_crash_nonpanic_core, NULL); + void crash_smp_send_stop(void) { static int cpus_stopped; unsigned long msecs; + call_single_data_t *csd; + int cpu, this_cpu = raw_smp_processor_id(); if (cpus_stopped) return; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - smp_call_function(machine_crash_nonpanic_core, NULL, false); + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + + csd = &per_cpu(cpu_stop_csd, cpu); + smp_call_function_single_async(cpu, csd); + } + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); diff --git a/arch/arm/kernel/pj4-cp0.c b/arch/arm/kernel/pj4-cp0.c index 1d1fb22f44f37..4bca8098c4ff5 100644 --- a/arch/arm/kernel/pj4-cp0.c +++ b/arch/arm/kernel/pj4-cp0.c @@ -126,6 +126,7 @@ static int __init pj4_cp0_init(void) pr_info("PJ4 iWMMXt v%d coprocessor enabled.\n", vers); elf_hwcap |= HWCAP_IWMMXT; thread_register_notifier(&iwmmxt_notifier_block); + register_iwmmxt_undef_handler(); #endif return 0; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e8ff85890ade..e16ed102960cb 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -222,7 +222,6 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); - memset(thread->used_cp, 0, sizeof(thread->used_cp)); memset(&tsk->thread.debug, 0, sizeof(struct debug_info)); memset(&thread->fpstate, 0, sizeof(union fp_state)); diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index fef32d73f9120..c421a899fc84c 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -584,8 +584,6 @@ static int fpa_set(struct task_struct *target, { struct thread_info *thread = task_thread_info(target); - thread->used_cp[1] = thread->used_cp[2] = 1; - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &thread->fpstate, 0, sizeof(struct user_fp)); diff --git a/arch/arm/kernel/xscale-cp0.c b/arch/arm/kernel/xscale-cp0.c index ed4f6e77616da..00d00d3aae972 100644 --- a/arch/arm/kernel/xscale-cp0.c +++ b/arch/arm/kernel/xscale-cp0.c @@ -166,6 +166,7 @@ static int __init xscale_cp0_init(void) 
pr_info("XScale iWMMXt coprocessor detected.\n"); elf_hwcap |= HWCAP_IWMMXT; thread_register_notifier(&iwmmxt_notifier_block); + register_iwmmxt_undef_handler(); #endif } else { pr_info("XScale DSP coprocessor detected.\n"); diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index 61ce82aca6f0d..072ff9b451f84 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -56,6 +56,10 @@ ENTRY(cpu_feroceon_proc_init) movne r2, r2, lsr #2 @ turned into # of sets sub r2, r2, #(1 << 5) stmia r1, {r2, r3} +#ifdef CONFIG_VFP + mov r1, #1 @ disable quirky VFP + str_l r1, VFP_arch_feroceon, r2 +#endif ret lr /* diff --git a/arch/arm/nwfpe/entry.S b/arch/arm/nwfpe/entry.S index d8f9915566e15..354d297a193bb 100644 --- a/arch/arm/nwfpe/entry.S +++ b/arch/arm/nwfpe/entry.S @@ -7,6 +7,7 @@ Direct questions, comments to Scott Bambrough */ +#include #include #include @@ -104,6 +105,7 @@ next: @ plain LDR instruction. Weird, but it seems harmless. .pushsection .text.fixup,"ax" .align 2 +.Lrep: str r4, [sp, #S_PC] @ retry current instruction .Lfix: ret r9 @ let the user eat segfaults .popsection @@ -111,3 +113,78 @@ next: .align 3 .long .Lx1, .Lfix .popsection + + @ + @ Check whether the instruction is a co-processor instruction. + @ If yes, we need to call the relevant co-processor handler. + @ Only FPE instructions are dispatched here, everything else + @ is handled by undef hooks. + @ + @ Emulators may wish to make use of the following registers: + @ r4 = PC value to resume execution after successful emulation + @ r9 = normal "successful" return address + @ lr = unrecognised instruction return address + @ IRQs enabled, FIQs enabled. + @ +ENTRY(call_fpe) + mov r2, r4 + sub r4, r4, #4 @ ARM instruction at user PC - 4 +USERL( .Lrep, ldrt r0, [r4]) @ load opcode from user space +ARM_BE8(rev r0, r0) @ little endian instruction + + uaccess_disable ip + + get_thread_info r10 @ get current thread + tst r0, #0x08000000 @ only CDP/CPRT/LDC/STC have bit 27 + reteq lr + and r8, r0, #0x00000f00 @ mask out CP number +#ifdef CONFIG_IWMMXT + @ Test if we need to give access to iWMMXt coprocessors + ldr r5, [r10, #TI_FLAGS] + rsbs r7, r8, #(1 << 8) @ CP 0 or 1 only + movscs r7, r5, lsr #(TIF_USING_IWMMXT + 1) + movcs r0, sp @ pass struct pt_regs + bcs iwmmxt_task_enable +#endif + add pc, pc, r8, lsr #6 + nop + + ret lr @ CP#0 + b do_fpe @ CP#1 (FPE) + b do_fpe @ CP#2 (FPE) + ret lr @ CP#3 + ret lr @ CP#4 + ret lr @ CP#5 + ret lr @ CP#6 + ret lr @ CP#7 + ret lr @ CP#8 + ret lr @ CP#9 + ret lr @ CP#10 (VFP) + ret lr @ CP#11 (VFP) + ret lr @ CP#12 + ret lr @ CP#13 + ret lr @ CP#14 (Debug) + ret lr @ CP#15 (Control) + +do_fpe: + add r10, r10, #TI_FPSTATE @ r10 = workspace + ldr_va pc, fp_enter, tmp=r4 @ Call FP module USR entry point + + @ + @ The FP module is called with these registers set: + @ r0 = instruction + @ r2 = PC+4 + @ r9 = normal "successful" return address + @ r10 = FP workspace + @ lr = unrecognised FP instruction return address + @ + + .pushsection .data + .align 2 +ENTRY(fp_enter) + .word no_fp + .popsection + +no_fp: + ret lr +ENDPROC(no_fp) diff --git a/arch/arm/vfp/Makefile b/arch/arm/vfp/Makefile index 749901a72d6dc..dfd64bc2b2fbd 100644 --- a/arch/arm/vfp/Makefile +++ b/arch/arm/vfp/Makefile @@ -8,4 +8,4 @@ # ccflags-y := -DDEBUG # asflags-y := -DDEBUG -obj-y += vfpmodule.o entry.o vfphw.o vfpsingle.o vfpdouble.o +obj-y += vfpmodule.o vfphw.o vfpsingle.o vfpdouble.o diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S deleted file mode 100644 index 
62206ef250371..0000000000000 --- a/arch/arm/vfp/entry.S +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/arm/vfp/entry.S - * - * Copyright (C) 2004 ARM Limited. - * Written by Deep Blue Solutions Limited. - */ -#include -#include -#include -#include -#include -#include - -@ VFP entry point. -@ -@ r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) -@ r2 = PC value to resume execution after successful emulation -@ r9 = normal "successful" return address -@ r10 = this threads thread_info structure -@ lr = unrecognised instruction return address -@ IRQs enabled. -@ -ENTRY(do_vfp) - mov r1, r10 - str lr, [sp, #-8]! - add r3, sp, #4 - str r9, [r3] - bl vfp_entry - ldr pc, [sp], #8 -ENDPROC(do_vfp) diff --git a/arch/arm/vfp/vfp.h b/arch/arm/vfp/vfp.h index 5cd6d50532717..e43a630f8a164 100644 --- a/arch/arm/vfp/vfp.h +++ b/arch/arm/vfp/vfp.h @@ -375,3 +375,4 @@ struct op { }; asmlinkage void vfp_save_state(void *location, u32 fpexc); +asmlinkage u32 vfp_load_state(const void *location); diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S index a4610d0f32152..d5a03f3c10c50 100644 --- a/arch/arm/vfp/vfphw.S +++ b/arch/arm/vfp/vfphw.S @@ -4,12 +4,6 @@ * * Copyright (C) 2004 ARM Limited. * Written by Deep Blue Solutions Limited. - * - * This code is called from the kernel's undefined instruction trap. - * r1 holds the thread_info pointer - * r3 holds the return address for successful handling. - * lr holds the return address for unrecognised instructions. - * sp points to a struct pt_regs (as defined in include/asm/proc/ptrace.h) */ #include #include @@ -19,20 +13,6 @@ #include #include - .macro DBGSTR, str -#ifdef DEBUG - stmfd sp!, {r0-r3, ip, lr} - ldr r0, =1f - bl _printk - ldmfd sp!, {r0-r3, ip, lr} - - .pushsection .rodata, "a" -1: .ascii KERN_DEBUG "VFP: \str\n" - .byte 0 - .previous -#endif - .endm - .macro DBGSTR1, str, arg #ifdef DEBUG stmfd sp!, {r0-r3, ip, lr} @@ -48,181 +28,25 @@ #endif .endm - .macro DBGSTR3, str, arg1, arg2, arg3 -#ifdef DEBUG - stmfd sp!, {r0-r3, ip, lr} - mov r3, \arg3 - mov r2, \arg2 - mov r1, \arg1 - ldr r0, =1f - bl _printk - ldmfd sp!, {r0-r3, ip, lr} - - .pushsection .rodata, "a" -1: .ascii KERN_DEBUG "VFP: \str\n" - .byte 0 - .previous -#endif - .endm - - -@ VFP hardware support entry point. -@ -@ r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) -@ r1 = thread_info pointer -@ r2 = PC value to resume execution after successful emulation -@ r3 = normal "successful" return address -@ lr = unrecognised instruction return address -@ IRQs enabled. -ENTRY(vfp_support_entry) - ldr r11, [r1, #TI_CPU] @ CPU number - add r10, r1, #TI_VFPSTATE @ r10 = workspace - - DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 - - .fpu vfpv2 - VFPFMRX r1, FPEXC @ Is the VFP enabled? - DBGSTR1 "fpexc %08x", r1 - tst r1, #FPEXC_EN - bne look_for_VFP_exceptions @ VFP is already enabled - - DBGSTR1 "enable %x", r10 - ldr r9, vfp_current_hw_state_address - orr r1, r1, #FPEXC_EN @ user FPEXC has the enable bit set - ldr r4, [r9, r11, lsl #2] @ vfp_current_hw_state pointer - bic r5, r1, #FPEXC_EX @ make sure exceptions are disabled - cmp r4, r10 @ this thread owns the hw context? -#ifndef CONFIG_SMP - @ For UP, checking that this thread owns the hw context is - @ sufficient to determine that the hardware state is valid. - beq vfp_hw_state_valid - - @ On UP, we lazily save the VFP context. As a different - @ thread wants ownership of the VFP hardware, save the old - @ state if there was a previous (valid) owner. 
- - VFPFMXR FPEXC, r5 @ enable VFP, disable any pending - @ exceptions, so we can get at the - @ rest of it - - DBGSTR1 "save old state %p", r4 - cmp r4, #0 @ if the vfp_current_hw_state is NULL - beq vfp_reload_hw @ then the hw state needs reloading - VFPFSTMIA r4, r5 @ save the working registers - VFPFMRX r5, FPSCR @ current status -#ifndef CONFIG_CPU_FEROCEON - tst r1, #FPEXC_EX @ is there additional state to save? - beq 1f - VFPFMRX r6, FPINST @ FPINST (only if FPEXC.EX is set) - tst r1, #FPEXC_FP2V @ is there an FPINST2 to read? - beq 1f - VFPFMRX r8, FPINST2 @ FPINST2 if needed (and present) -1: -#endif - stmia r4, {r1, r5, r6, r8} @ save FPEXC, FPSCR, FPINST, FPINST2 -vfp_reload_hw: - -#else - @ For SMP, if this thread does not own the hw context, then we - @ need to reload it. No need to save the old state as on SMP, - @ we always save the state when we switch away from a thread. - bne vfp_reload_hw - - @ This thread has ownership of the current hardware context. - @ However, it may have been migrated to another CPU, in which - @ case the saved state is newer than the hardware context. - @ Check this by looking at the CPU number which the state was - @ last loaded onto. - ldr ip, [r10, #VFP_CPU] - teq ip, r11 - beq vfp_hw_state_valid - -vfp_reload_hw: - @ We're loading this threads state into the VFP hardware. Update - @ the CPU number which contains the most up to date VFP context. - str r11, [r10, #VFP_CPU] - - VFPFMXR FPEXC, r5 @ enable VFP, disable any pending - @ exceptions, so we can get at the - @ rest of it -#endif - - DBGSTR1 "load state %p", r10 - str r10, [r9, r11, lsl #2] @ update the vfp_current_hw_state pointer +ENTRY(vfp_load_state) + @ Load the current VFP state + @ r0 - load location + @ returns FPEXC + DBGSTR1 "load VFP state %p", r0 @ Load the saved state back into the VFP - VFPFLDMIA r10, r5 @ reload the working registers while + VFPFLDMIA r0, r1 @ reload the working registers while @ FPEXC is in a safe state - ldmia r10, {r1, r5, r6, r8} @ load FPEXC, FPSCR, FPINST, FPINST2 -#ifndef CONFIG_CPU_FEROCEON - tst r1, #FPEXC_EX @ is there additional state to restore? + ldmia r0, {r0-r3} @ load FPEXC, FPSCR, FPINST, FPINST2 + tst r0, #FPEXC_EX @ is there additional state to restore? beq 1f - VFPFMXR FPINST, r6 @ restore FPINST (only if FPEXC.EX is set) - tst r1, #FPEXC_FP2V @ is there an FPINST2 to write? + VFPFMXR FPINST, r2 @ restore FPINST (only if FPEXC.EX is set) + tst r0, #FPEXC_FP2V @ is there an FPINST2 to write? beq 1f - VFPFMXR FPINST2, r8 @ FPINST2 if needed (and present) + VFPFMXR FPINST2, r3 @ FPINST2 if needed (and present) 1: -#endif - VFPFMXR FPSCR, r5 @ restore status - -@ The context stored in the VFP hardware is up to date with this thread -vfp_hw_state_valid: - tst r1, #FPEXC_EX - bne process_exception @ might as well handle the pending - @ exception before retrying branch - @ out before setting an FPEXC that - @ stops us reading stuff - VFPFMXR FPEXC, r1 @ Restore FPEXC last - mov sp, r3 @ we think we have handled things - pop {lr} - sub r2, r2, #4 @ Retry current instruction - if Thumb - str r2, [sp, #S_PC] @ mode it's two 16-bit instructions, - @ else it's one 32-bit instruction, so - @ always subtract 4 from the following - @ instruction address. - -local_bh_enable_and_ret: - adr r0, . 
- mov r1, #SOFTIRQ_DISABLE_OFFSET - b __local_bh_enable_ip @ tail call - -look_for_VFP_exceptions: - @ Check for synchronous or asynchronous exception - tst r1, #FPEXC_EX | FPEXC_DEX - bne process_exception - @ On some implementations of the VFP subarch 1, setting FPSCR.IXE - @ causes all the CDP instructions to be bounced synchronously without - @ setting the FPEXC.EX bit - VFPFMRX r5, FPSCR - tst r5, #FPSCR_IXE - bne process_exception - - tst r5, #FPSCR_LENGTH_MASK - beq skip - orr r1, r1, #FPEXC_DEX - b process_exception -skip: - - @ Fall into hand on to next handler - appropriate coproc instr - @ not recognised by VFP - - DBGSTR "not VFP" - b local_bh_enable_and_ret - -process_exception: - DBGSTR "bounce" - mov sp, r3 @ setup for a return to the user code. - pop {lr} - mov r2, sp @ nothing stacked - regdump is at TOS - - @ Now call the C code to package up the bounce to the support code - @ r0 holds the trigger instruction - @ r1 holds the FPEXC value - @ r2 pointer to register dump - b VFP_bounce @ we have handled this - the support - @ code will raise an exception if - @ required. If not, the user code will - @ retry the faulted instruction -ENDPROC(vfp_support_entry) + VFPFMXR FPSCR, r1 @ restore status + ret lr +ENDPROC(vfp_load_state) ENTRY(vfp_save_state) @ Save the current VFP state @@ -242,10 +66,6 @@ ENTRY(vfp_save_state) ret lr ENDPROC(vfp_save_state) - .align -vfp_current_hw_state_address: - .word vfp_current_hw_state - .macro tbl_branch, base, tmp, shift #ifdef CONFIG_THUMB2_KERNEL adr \tmp, 1f diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 1ba5078c10258..7e8773a2d99d0 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <linux/user.h> #include <linux/export.h> +#include <linux/perf_event.h> #include <asm/cp15.h> #include <asm/cputype.h> @@ -30,11 +31,6 @@ #include "vfpinstr.h" #include "vfp.h" -/* - * Our undef handlers (in entry.S) - */ -asmlinkage void vfp_support_entry(u32, void *, u32, u32); - static bool have_vfp __ro_after_init; /* @@ -42,7 +38,11 @@ static bool have_vfp __ro_after_init; * Used in startup: set to non-zero if VFP checks fail * After startup, holds VFP architecture */ -static unsigned int __initdata VFP_arch; +static unsigned int VFP_arch; + +#ifdef CONFIG_CPU_FEROCEON +extern unsigned int VFP_arch_feroceon __alias(VFP_arch); +#endif /* * The pointer to the vfpstate structure of the thread which currently @@ -314,13 +314,14 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) * emulate it. */ } + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->ARM_pc); return exceptions & ~VFP_NAN_FLAG; } /* * Package up a bounce condition. */ -void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) +static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) { u32 fpscr, orig_fpscr, fpsid, exceptions; @@ -356,14 +357,12 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) } if (fpexc & FPEXC_EX) { -#ifndef CONFIG_CPU_FEROCEON /* * Asynchronous exception. The instruction is read from FPINST * and the interrupted instruction has to be restarted. */ trigger = fmrx(FPINST); regs->ARM_pc -= 4; -#endif } else if (!(fpexc & FPEXC_DEX)) { /* * Illegal combination of bits. It can be caused by an * @@ -371,7 +370,7 @@ * on VFP subarch 1. 
*/ vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); - goto exit; + return; } /* @@ -402,7 +401,7 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. */ if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) - goto exit; + return; /* * The barrier() here prevents fpinst2 being read @@ -415,8 +414,6 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); if (exceptions) vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); - exit: - local_bh_enable(); } static void vfp_enable(void *unused) @@ -645,27 +642,6 @@ static int vfp_starting_cpu(unsigned int unused) return 0; } -/* - * Entered with: - * - * r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) - * r1 = thread_info pointer - * r2 = PC value to resume execution after successful emulation - * r3 = normal "successful" return address - * lr = unrecognised instruction return address - */ -asmlinkage void vfp_entry(u32 trigger, struct thread_info *ti, u32 resume_pc, - u32 resume_return_address) -{ - if (unlikely(!have_vfp)) - return; - - local_bh_disable(); - vfp_support_entry(trigger, ti, resume_pc, resume_return_address); -} - -#ifdef CONFIG_KERNEL_MODE_NEON - static int vfp_kmode_exception(struct pt_regs *regs, unsigned int instr) { /* @@ -688,47 +664,151 @@ static int vfp_kmode_exception(struct pt_regs *regs, unsigned int instr) return 1; } -static struct undef_hook vfp_kmode_exception_hook[] = {{ +/* + * vfp_support_entry - Handle VFP exception + * + * @regs: pt_regs structure holding the register state at exception entry + * @trigger: The opcode of the instruction that triggered the exception + * + * Returns 0 if the exception was handled, or an error code otherwise. + */ +static int vfp_support_entry(struct pt_regs *regs, u32 trigger) +{ + struct thread_info *ti = current_thread_info(); + u32 fpexc; + + if (unlikely(!have_vfp)) + return -ENODEV; + + if (!user_mode(regs)) + return vfp_kmode_exception(regs, trigger); + + local_bh_disable(); + fpexc = fmrx(FPEXC); + + /* + * If the VFP unit was not enabled yet, we have to check whether the + * VFP state in the CPU's registers is the most recent VFP state + * associated with the process. On UP systems, we don't save the VFP + * state eagerly on a context switch, so we may need to save the + * VFP state to memory first, as it may belong to another process. + */ + if (!(fpexc & FPEXC_EN)) { + /* + * Enable the VFP unit but mask the FP exception flag for the + * time being, so we can access all the registers. + */ + fpexc |= FPEXC_EN; + fmxr(FPEXC, fpexc & ~FPEXC_EX); + + /* + * Check whether or not the VFP state in the CPU's registers is + * the most recent VFP state associated with this task. On SMP, + * migration may result in multiple CPUs holding VFP states + * that belong to the same task, but only the most recent one + * is valid. + */ + if (!vfp_state_in_hw(ti->cpu, ti)) { + if (!IS_ENABLED(CONFIG_SMP) && + vfp_current_hw_state[ti->cpu] != NULL) { + /* + * This CPU is currently holding the most + * recent VFP state associated with another + * task, and we must save that to memory first. + */ + vfp_save_state(vfp_current_hw_state[ti->cpu], + fpexc); + } + + /* + * We can now proceed with loading the task's VFP state + * from memory into the CPU registers. 
+ */ + fpexc = vfp_load_state(&ti->vfpstate); + vfp_current_hw_state[ti->cpu] = &ti->vfpstate; +#ifdef CONFIG_SMP + /* + * Record that this CPU is now the one holding the most + * recent VFP state of the task. + */ + ti->vfpstate.hard.cpu = ti->cpu; +#endif + } + + if (fpexc & FPEXC_EX) + /* + * Might as well handle the pending exception before + * retrying branch out before setting an FPEXC that + * stops us reading stuff. + */ + goto bounce; + + /* + * No FP exception is pending: just enable the VFP and + * replay the instruction that trapped. + */ + fmxr(FPEXC, fpexc); + } else { + /* Check for synchronous or asynchronous exceptions */ + if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { + u32 fpscr = fmrx(FPSCR); + + /* + * On some implementations of the VFP subarch 1, + * setting FPSCR.IXE causes all the CDP instructions to + * be bounced synchronously without setting the + * FPEXC.EX bit + */ + if (!(fpscr & FPSCR_IXE)) { + if (!(fpscr & FPSCR_LENGTH_MASK)) { + pr_debug("not VFP\n"); + local_bh_enable(); + return -ENOEXEC; + } + fpexc |= FPEXC_DEX; + } + } +bounce: regs->ARM_pc += 4; + VFP_bounce(trigger, fpexc, regs); + } + + local_bh_enable(); + return 0; +} + +static struct undef_hook neon_support_hook[] = {{ .instr_mask = 0xfe000000, .instr_val = 0xf2000000, - .cpsr_mask = MODE_MASK | PSR_T_BIT, - .cpsr_val = SVC_MODE, - .fn = vfp_kmode_exception, + .cpsr_mask = PSR_T_BIT, + .cpsr_val = 0, + .fn = vfp_support_entry, }, { .instr_mask = 0xff100000, .instr_val = 0xf4000000, - .cpsr_mask = MODE_MASK | PSR_T_BIT, - .cpsr_val = SVC_MODE, - .fn = vfp_kmode_exception, + .cpsr_mask = PSR_T_BIT, + .cpsr_val = 0, + .fn = vfp_support_entry, }, { .instr_mask = 0xef000000, .instr_val = 0xef000000, - .cpsr_mask = MODE_MASK | PSR_T_BIT, - .cpsr_val = SVC_MODE | PSR_T_BIT, - .fn = vfp_kmode_exception, + .cpsr_mask = PSR_T_BIT, + .cpsr_val = PSR_T_BIT, + .fn = vfp_support_entry, }, { .instr_mask = 0xff100000, .instr_val = 0xf9000000, - .cpsr_mask = MODE_MASK | PSR_T_BIT, - .cpsr_val = SVC_MODE | PSR_T_BIT, - .fn = vfp_kmode_exception, -}, { - .instr_mask = 0x0c000e00, - .instr_val = 0x0c000a00, - .cpsr_mask = MODE_MASK, - .cpsr_val = SVC_MODE, - .fn = vfp_kmode_exception, + .cpsr_mask = PSR_T_BIT, + .cpsr_val = PSR_T_BIT, + .fn = vfp_support_entry, }}; -static int __init vfp_kmode_exception_hook_init(void) -{ - int i; +static struct undef_hook vfp_support_hook = { + .instr_mask = 0x0c000e00, + .instr_val = 0x0c000a00, + .fn = vfp_support_entry, +}; - for (i = 0; i < ARRAY_SIZE(vfp_kmode_exception_hook); i++) - register_undef_hook(&vfp_kmode_exception_hook[i]); - return 0; -} -subsys_initcall(vfp_kmode_exception_hook_init); +#ifdef CONFIG_KERNEL_MODE_NEON /* * Kernel-side NEON support functions @@ -833,8 +913,11 @@ static int __init vfp_init(void) * for NEON if the hardware has the MVFR registers. 
*/ if (IS_ENABLED(CONFIG_NEON) && - (fmrx(MVFR1) & 0x000fff00) == 0x00011100) + (fmrx(MVFR1) & 0x000fff00) == 0x00011100) { elf_hwcap |= HWCAP_NEON; + for (int i = 0; i < ARRAY_SIZE(neon_support_hook); i++) + register_undef_hook(&neon_support_hook[i]); + } if (IS_ENABLED(CONFIG_VFPv3)) { u32 mvfr0 = fmrx(MVFR0); @@ -903,6 +986,7 @@ static int __init vfp_init(void) have_vfp = true; + register_undef_hook(&vfp_support_hook); thread_register_notifier(&vfp_notifier_block); vfp_pm_init(); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 136232a897392..5c497c862d757 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -931,7 +931,7 @@ static inline pte_t *pudp_ptep(pud_t *pud) #define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) #define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) -#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index 5ff001140ef4a..b7e3763c47f0c 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -19,38 +19,6 @@ #include #include "mtdcore.h" -/* - * compare superblocks to see if they're equivalent - * - they are if the underlying MTD device is the same - */ -static int mtd_test_super(struct super_block *sb, struct fs_context *fc) -{ - struct mtd_info *mtd = fc->sget_key; - - if (sb->s_mtd == fc->sget_key) { - pr_debug("MTDSB: Match on device %d (\"%s\")\n", - mtd->index, mtd->name); - return 1; - } - - pr_debug("MTDSB: No match, device %d (\"%s\"), device %d (\"%s\")\n", - sb->s_mtd->index, sb->s_mtd->name, mtd->index, mtd->name); - return 0; -} - -/* - * mark the superblock by the MTD device it is using - * - set the device number to be the correct MTD block device for pesuperstence - * of NFS exports - */ -static int mtd_set_super(struct super_block *sb, struct fs_context *fc) -{ - sb->s_mtd = fc->sget_key; - sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); - sb->s_bdi = bdi_get(mtd_bdi); - return 0; -} - /* * get a superblock on an MTD-backed filesystem */ @@ -62,8 +30,7 @@ static int mtd_get_sb(struct fs_context *fc, struct super_block *sb; int ret; - fc->sget_key = mtd; - sb = sget_fc(fc, mtd_test_super, mtd_set_super); + sb = sget_dev(fc, MKDEV(MTD_BLOCK_MAJOR, mtd->index)); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -77,6 +44,16 @@ static int mtd_get_sb(struct fs_context *fc, pr_debug("MTDSB: New superblock for device %d (\"%s\")\n", mtd->index, mtd->name); + /* + * Would usually have been set with @sb_lock held but in + * contrast to sb->s_bdev that's checked with only + * @sb_lock held, nothing checks sb->s_mtd without also + * holding sb->s_umount and we're holding sb->s_umount + * here. 
+ */ + sb->s_mtd = mtd; + sb->s_bdi = bdi_get(mtd_bdi); + ret = fill_super(sb, fc); if (ret < 0) goto error_sb; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 2beceff024e39..e55e0a2cd2e86 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -664,7 +664,7 @@ static ssize_t comm_addr_store(struct config_item *item, const char *buf, memcpy(addr, buf, len); - rv = dlm_lowcomms_addr(cm->nodeid, addr, len); + rv = dlm_midcomms_addr(cm->nodeid, addr, len); if (rv) { kfree(addr); return rv; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index a1aca41c49d06..5aabcb6f0f157 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -18,6 +18,7 @@ #include "dlm_internal.h" #include "midcomms.h" #include "lock.h" +#include "ast.h" #define DLM_DEBUG_BUF_LEN 4096 static char debug_buf[DLM_DEBUG_BUF_LEN]; @@ -365,6 +366,52 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) unlock_rsb(r); } +static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb) +{ + struct dlm_callback *cb; + + /* lkb_id lkb_flags mode flags sb_status sb_flags */ + + spin_lock(&lkb->lkb_cb_lock); + list_for_each_entry(cb, &lkb->lkb_callbacks, list) { + seq_printf(s, "%x %x %d %x %d %x\n", + lkb->lkb_id, + dlm_iflags_val(lkb), + cb->mode, + cb->flags, + cb->sb_status, + cb->sb_flags); + } + spin_unlock(&lkb->lkb_cb_lock); +} + +static void print_format5(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + struct rsbtbl_iter { struct dlm_rsb *rsb; unsigned bucket; @@ -408,6 +455,13 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) } print_format4(ri->rsb, seq); break; + case 5: + if (ri->header) { + seq_puts(seq, "lkb_id lkb_flags mode flags sb_status sb_flags\n"); + ri->header = 0; + } + print_format5(ri->rsb, seq); + break; } return 0; @@ -417,6 +471,7 @@ static const struct seq_operations format1_seq_ops; static const struct seq_operations format2_seq_ops; static const struct seq_operations format3_seq_ops; static const struct seq_operations format4_seq_ops; +static const struct seq_operations format5_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { @@ -448,6 +503,8 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; if (seq->op == &format4_seq_ops) ri->format = 4; + if (seq->op == &format5_seq_ops) + ri->format = 5; tree = toss ? 
&ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; @@ -602,10 +659,18 @@ static const struct seq_operations format4_seq_ops = { .show = table_seq_show, }; +static const struct seq_operations format5_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + static const struct file_operations format1_fops; static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; +static const struct file_operations format5_fops; static int table_open1(struct inode *inode, struct file *file) { @@ -683,7 +748,21 @@ static int table_open4(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format4_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open5(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format5_seq_ops); if (ret) return ret; @@ -725,6 +804,14 @@ static const struct file_operations format4_fops = { .release = seq_release }; +static const struct file_operations format5_fops = { + .owner = THIS_MODULE, + .open = table_open5, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + /* * dump lkb's on the ls_waiters list */ @@ -793,6 +880,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_locks_dentry); debugfs_remove(ls->ls_debug_all_dentry); debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_queued_asts_dentry); } static int dlm_state_show(struct seq_file *file, void *offset) @@ -936,6 +1024,17 @@ void dlm_create_debug_file(struct dlm_ls *ls) dlm_root, ls, &waiters_fops); + + /* format 5 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_queued_asts", ls->ls_name); + + ls->ls_debug_queued_asts_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &format5_fops); } void __init dlm_register_debugfs(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index fb1981654bb24..f6acba4310a7b 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -58,7 +58,7 @@ void dlm_recover_dir_nodeid(struct dlm_ls *ls) up_read(&ls->ls_root_sem); } -int dlm_recover_directory(struct dlm_ls *ls) +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; char *b, *last_name = NULL; @@ -90,7 +90,7 @@ int dlm_recover_directory(struct dlm_ls *ls) } error = dlm_rcom_names(ls, memb->nodeid, - last_name, last_len); + last_name, last_len, seq); if (error) goto out_free; @@ -196,7 +196,8 @@ int dlm_recover_directory(struct dlm_ls *ls) return error; } -static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name, + int len) { struct dlm_rsb *r; uint32_t hash, bucket; @@ -232,7 +233,7 @@ static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) for rsb's we're master of and whose directory node matches the requesting node. 
inbuf is the rsb name last sent, inlen is the name's length */ -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; @@ -245,9 +246,8 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if (inlen > 1) { r = find_rsb_root(ls, inbuf, inlen); if (!r) { - inbuf[inlen - 1] = '\0'; - log_error(ls, "copy_master_names from %d start %d %s", - nodeid, inlen, inbuf); + log_error(ls, "copy_master_names from %d start %d %.*s", + nodeid, inlen, inlen, inbuf); goto out; } list = r->res_root_list.next; diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h index 03844d086be23..39ecb69d7ef36 100644 --- a/fs/dlm/dir.h +++ b/fs/dlm/dir.h @@ -15,9 +15,9 @@ int dlm_dir_nodeid(struct dlm_rsb *rsb); int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); void dlm_recover_dir_nodeid(struct dlm_ls *ls); -int dlm_recover_directory(struct dlm_ls *ls); -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, - char *outbuf, int outlen, int nodeid); +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq); +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); #endif /* __DIR_DOT_H__ */ diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index c8156770205e6..dfc444dad3298 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -598,6 +598,7 @@ struct dlm_ls { struct dentry *ls_debug_locks_dentry; /* debugfs */ struct dentry *ls_debug_all_dentry; /* debugfs */ struct dentry *ls_debug_toss_dentry; /* debugfs */ + struct dentry *ls_debug_queued_asts_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f511a9d7d416e..652c51fbbf768 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -86,8 +86,8 @@ static int send_remove(struct dlm_rsb *r); static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local); -static int receive_extralen(struct dlm_message *ms); + const struct dlm_message *ms, bool local); +static int receive_extralen(const struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void toss_rsb(struct kref *kref); @@ -984,8 +984,8 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no * . 
dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0) */ -int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result) +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1106,7 +1106,7 @@ static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) } } -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1459,7 +1459,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) set RESEND and dlm_recover_waiters_post() */ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, - struct dlm_message *ms) + const struct dlm_message *ms) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int overlap_done = 0; @@ -1557,8 +1557,8 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) /* Handles situations where we might be processing a "fake" or "local" reply in which we can't try to take waiters_mutex again. */ -static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static int remove_from_waiters_ms(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; @@ -1800,7 +1800,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) /* lkb is process copy (pc) */ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int b; @@ -1907,7 +1907,7 @@ static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { set_lvb_lock_pc(r, lkb, ms); _grant_lock(r, lkb); @@ -1945,7 +1945,7 @@ static void munge_demoted(struct dlm_lkb *lkb) lkb->lkb_grmode = DLM_LOCK_NL; } -static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms) { if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) && ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) { @@ -3641,8 +3641,9 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); } -static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, - int ret_nodeid, int rv) +static int send_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms_in, int ret_nodeid, + int rv) { struct dlm_rsb *r = &ls->ls_local_rsb; struct dlm_message *ms; @@ -3667,14 +3668,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, of message, unlike the send side where we can safely send everything about the lkb for any type of message */ -static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms) { lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, +static void receive_flags_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { if (local) @@ -3684,14 +3686,14 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct 
dlm_message *ms, dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static int receive_extralen(struct dlm_message *ms) +static int receive_extralen(const struct dlm_message *ms) { return (le16_to_cpu(ms->m_header.h_length) - sizeof(struct dlm_message)); } static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int len; @@ -3719,7 +3721,7 @@ static void fake_astfn(void *astparam) } static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); lkb->lkb_ownpid = le32_to_cpu(ms->m_pid); @@ -3741,7 +3743,7 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3756,7 +3758,7 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (receive_lvb(ls, lkb, ms)) return -ENOMEM; @@ -3766,7 +3768,7 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, /* We fill in the local-lkb fields with the info that send_xxxx_reply() uses to send a reply and that the remote end uses to process the reply. */ -static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) +static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb = &ls->ls_local_lkb; lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); @@ -3776,7 +3778,7 @@ static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) /* This is called after the rsb is locked so that we can safely inspect fields in the lkb. 
*/ -static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms) { int from = le32_to_cpu(ms->m_header.h_nodeid); int error = 0; @@ -3828,7 +3830,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3907,7 +3909,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3963,7 +3965,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4015,7 +4017,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4051,7 +4053,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4082,7 +4084,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4110,7 +4112,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms) { int len, error, ret_nodeid, from_nodeid, our_nodeid; @@ -4130,7 +4132,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) send_lookup_reply(ls, ms, ret_nodeid, error); } -static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) { char name[DLM_RESNAME_MAXLEN+1]; struct dlm_rsb *r; @@ -4218,12 +4220,13 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) } } -static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms) { do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid)); } -static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4345,7 +4348,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local) + const struct dlm_message *ms, bool local) { /* this is the value returned from do_convert() on the master */ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { @@ -4388,8 +4391,8 @@ 
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, } } -static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_convert_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4412,7 +4415,8 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4426,8 +4430,8 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_unlock_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4463,7 +4467,8 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4477,8 +4482,8 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_cancel_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4515,7 +4520,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4529,7 +4535,8 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4608,7 +4615,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) dlm_put_lkb(lkb); } -static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { int error = 0, noent = 0; @@ -4744,7 +4751,7 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, requestqueue, to processing all the saved messages, to processing new messages as they arrive. */ -static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms, int nodeid) { if (dlm_locking_stopped(ls)) { @@ -4767,7 +4774,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, /* This is called by dlm_recoverd to process messages that were saved on the requestqueue. 
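+ * The saved_seq argument is the recovery sequence number that was + * recorded when the message was queued; it is passed unchanged to + * _receive_message().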
*/ -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { _receive_message(ls, ms, saved_seq); @@ -4778,9 +4785,9 @@ void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, standard locking activity) or an RCOM (recovery message sent as part of lockspace recovery). */ -void dlm_receive_buffer(union dlm_packet *p, int nodeid) +void dlm_receive_buffer(const union dlm_packet *p, int nodeid) { - struct dlm_header *hd = &p->header; + const struct dlm_header *hd = &p->header; struct dlm_ls *ls; int type = 0; @@ -5334,7 +5341,7 @@ static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, /* needs at least dlm_rcom + rcom_lock */ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_rsb *r, struct dlm_rcom *rc) + struct dlm_rsb *r, const struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; @@ -5384,7 +5391,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, back the rcom_lock struct we got but with the remid field filled in. */ /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5393,6 +5401,9 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); int error; + /* init rl_remid with rcom lock rl_remid */ + *rl_remid = rl->rl_remid; + if (rl->rl_parent_lkid) { error = -EOPNOTSUPP; goto out; @@ -5448,7 +5459,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ - rl->rl_remid = cpu_to_le32(lkb->lkb_id); + *rl_remid = cpu_to_le32(lkb->lkb_id); lkb->lkb_recover_seq = ls->ls_recover_seq; @@ -5459,12 +5470,13 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) if (error && error != -EEXIST) log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); - rl->rl_result = cpu_to_le32(error); + *rl_result = cpu_to_le32(error); return error; } /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5509,7 +5521,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, result); - dlm_send_rcom_lock(r, lkb); + dlm_send_rcom_lock(r, lkb, seq); goto out; case -EEXIST: case 0: diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index aa5ad44d902bf..b54e2cbbe6e27 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -12,11 +12,11 @@ #define __LOCK_DOT_H__ void dlm_dump_rsb(struct dlm_rsb *r); -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len); +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len); void dlm_print_lkb(struct dlm_lkb *lkb); -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq); -void dlm_receive_buffer(union dlm_packet *p, int nodeid); +void 
dlm_receive_buffer(const union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); @@ -25,8 +25,8 @@ void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result); +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result); int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); @@ -36,8 +36,10 @@ void dlm_purge_mstcpy_locks(struct dlm_rsb *r); void dlm_recover_grant(struct dlm_ls *ls); int dlm_recover_waiters_post(struct dlm_ls *ls); void dlm_recover_waiters_pre(struct dlm_ls *ls); -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result); +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq); int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f14ea9f63224..f7bc22e74db27 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -863,7 +863,6 @@ struct dlm_processed_nodes { static void process_dlm_messages(struct work_struct *work) { struct processqueue_entry *pentry; - LIST_HEAD(processed_nodes); spin_lock(&processqueue_lock); pentry = list_first_entry_or_null(&processqueue, diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 77d202e4a02a4..be7909ead71b4 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -18,7 +18,7 @@ #include "midcomms.h" #include "lowcomms.h" -int dlm_slots_version(struct dlm_header *h) +int dlm_slots_version(const struct dlm_header *h) { if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) return 0; @@ -393,14 +393,9 @@ static void remove_remote_member(int nodeid) dlm_midcomms_remove_member(nodeid); } -static void clear_members_cb(int nodeid) -{ - remove_remote_member(nodeid); -} - void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes, clear_members_cb); + clear_memb_list(&ls->ls_nodes, remove_remote_member); ls->ls_num_nodes = 0; } @@ -454,7 +449,7 @@ static void make_member_array(struct dlm_ls *ls) /* send a status request to all members just to establish comms connections */ -static int ping_members(struct dlm_ls *ls) +static int ping_members(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; int error = 0; @@ -464,7 +459,7 @@ static int ping_members(struct dlm_ls *ls) error = -EINTR; break; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) break; } @@ -612,7 +607,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) make_member_array(ls); *neg_out = neg; - error = ping_members(ls); + error = ping_members(ls, rv->seq); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 433b2fac9f4a7..f61cfde463140 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -18,7 +18,7 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int 
*neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); -int dlm_slots_version(struct dlm_header *h); +int dlm_slots_version(const struct dlm_header *h); void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, struct dlm_member *memb); void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e1a0df67b5669..f641b36a36db0 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -330,18 +330,23 @@ static void midcomms_node_reset(struct midcomms_node *node) wake_up(&node->shutdown_wait); } -static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +static struct midcomms_node *nodeid2node(int nodeid) { - struct midcomms_node *node, *tmp; - int r = nodeid_hash(nodeid); + return __find_node(nodeid, nodeid_hash(nodeid)); +} + +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + int ret, r = nodeid_hash(nodeid); + struct midcomms_node *node; - node = __find_node(nodeid, r); - if (node || !alloc) - return node; + ret = dlm_lowcomms_addr(nodeid, addr, len); + if (ret) + return ret; - node = kmalloc(sizeof(*node), alloc); + node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) - return NULL; + return -ENOMEM; node->nodeid = nodeid; spin_lock_init(&node->state_lock); @@ -353,21 +358,11 @@ static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) midcomms_node_reset(node); spin_lock(&nodes_lock); - /* check again if there was somebody else - * earlier here to add the node - */ - tmp = __find_node(nodeid, r); - if (tmp) { - spin_unlock(&nodes_lock); - kfree(node); - return tmp; - } - hlist_add_head_rcu(&node->hlist, &node_hash[r]); spin_unlock(&nodes_lock); node->debugfs = dlm_create_debug_comms_file(nodeid, node); - return node; + return 0; } static int dlm_send_ack(int nodeid, uint32_t seq) @@ -499,7 +494,8 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); } -static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +static void dlm_receive_buffer_3_2_trace(uint32_t seq, + const union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: @@ -513,7 +509,7 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) } } -static void dlm_midcomms_receive_buffer(union dlm_packet *p, +static void dlm_midcomms_receive_buffer(const union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { @@ -602,113 +598,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } } -static struct midcomms_node * -dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, - uint16_t msglen, int (*cb)(struct midcomms_node *node)) -{ - struct midcomms_node *node = NULL; - gfp_t allocation = 0; - int ret; - - switch (p->header.h_cmd) { - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("rcom msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - switch (p->rcom.rc_type) { - case cpu_to_le32(DLM_RCOM_NAMES): - fallthrough; - case cpu_to_le32(DLM_RCOM_NAMES_REPLY): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS_REPLY): - node = nodeid2node(nodeid, 0); - if (node) { - spin_lock(&node->state_lock); - if (node->state != DLM_ESTABLISHED) - pr_debug("receive begin RCOM msg from node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - - switch (node->state) { - case DLM_CLOSED: - node->state = DLM_ESTABLISHED; - pr_debug("switch 
node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); - break; - case DLM_ESTABLISHED: - break; - default: - spin_unlock(&node->state_lock); - return NULL; - } - spin_unlock(&node->state_lock); - } - - allocation = GFP_NOFS; - break; - default: - break; - } - - break; - default: - break; - } - - node = nodeid2node(nodeid, allocation); - if (!node) { - switch (p->header.h_cmd) { - case DLM_OPTS: - if (msglen < sizeof(struct dlm_opts)) { - log_print("opts msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", - p->opts.o_nextcmd, nodeid); - break; - default: - log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", - p->header.h_cmd, nodeid); - break; - } - - return NULL; - } - - ret = cb(node); - if (ret < 0) - return NULL; - - return node; -} - -static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) -{ - switch (node->version) { - case DLM_VERSION_NOT_SET: - node->version = DLM_VERSION_3_2; - wake_up(&node->shutdown_wait); - log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, - node->nodeid); - break; - case DLM_VERSION_3_2: - break; - default: - log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", - DLM_VERSION_3_2, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, + int nodeid) { int len = msglen; @@ -757,7 +648,7 @@ static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodei return 0; } -static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) { uint16_t msglen = le16_to_cpu(p->header.h_length); struct midcomms_node *node; @@ -765,10 +656,37 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) int ret, idx; idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_2); - if (!node) + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) + goto out; + + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + + spin_lock(&node->state_lock); + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + break; + } + spin_unlock(&node->state_lock); + + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); goto out; + } switch (p->header.h_cmd) { case DLM_RCOM: @@ -858,8 +776,19 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) srcu_read_unlock(&nodes_srcu, idx); } -static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) +static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) { + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + 
srcu_read_unlock(&nodes_srcu, idx); + return; + } + switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; @@ -872,22 +801,6 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) default: log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", DLM_VERSION_3_1, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) -{ - uint16_t msglen = le16_to_cpu(p->header.h_length); - struct midcomms_node *node; - int idx; - - idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_1); - if (!node) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -977,10 +890,10 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): - dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid); break; case cpu_to_le32(DLM_VERSION_3_2): - dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid); break; default: log_print("received invalid version header: %u from node %d, will skip this message", @@ -1003,8 +916,8 @@ void dlm_midcomms_unack_msg_resend(int nodeid) int idx, ret; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1090,11 +1003,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { - WARN_ON_ONCE(1); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) goto err; - } /* this is a bug, however we going on and hope it will be resolved */ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); @@ -1235,8 +1146,34 @@ void dlm_midcomms_init(void) dlm_lowcomms_init(); } +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); + kfree(node); +} + void dlm_midcomms_exit(void) { + struct midcomms_node *node; + int i, idx; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + dlm_lowcomms_exit(); } @@ -1277,8 +1214,8 @@ void dlm_midcomms_add_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, GFP_NOFS); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1322,8 +1259,8 @@ void dlm_midcomms_remove_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1367,15 +1304,6 @@ void dlm_midcomms_remove_member(int nodeid) srcu_read_unlock(&nodes_srcu, idx); } -static void midcomms_node_release(struct rcu_head *rcu) -{ - struct midcomms_node *node = 
container_of(rcu, struct midcomms_node, rcu); - - WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); - dlm_send_queue_flush(node); - kfree(node); -} - void dlm_midcomms_version_wait(void) { struct midcomms_node *node; @@ -1438,7 +1366,7 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + if (!ret) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); else @@ -1456,14 +1384,6 @@ void dlm_midcomms_shutdown(void) for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { midcomms_shutdown(node); - - dlm_delete_debug_comms_file(node->debugfs); - - spin_lock(&nodes_lock); - hlist_del_rcu(&node->hlist); - spin_unlock(&nodes_lock); - - call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); } } srcu_read_unlock(&nodes_srcu, idx); @@ -1479,7 +1399,7 @@ int dlm_midcomms_close(int nodeid) idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ - node = nodeid2node(nodeid, 0); + node = nodeid2node(nodeid); if (node) { /* let shutdown waiters leave */ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); @@ -1489,20 +1409,32 @@ synchronize_srcu(&nodes_srcu); - idx = srcu_read_lock(&nodes_srcu); mutex_lock(&close_lock); - node = nodeid2node(nodeid, 0); + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); if (!node) { - mutex_unlock(&close_lock); srcu_read_unlock(&nodes_srcu, idx); + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); return dlm_lowcomms_close(nodeid); } ret = dlm_lowcomms_close(nodeid); - spin_lock(&node->state_lock); - midcomms_node_reset(node); - spin_unlock(&node->state_lock); + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); srcu_read_unlock(&nodes_srcu, idx); + + /* wait until all readers have left before flushing the send queue */ + synchronize_srcu(&nodes_srcu); + + /* drop all pending dlm messages, this is fine as + * this function gets called when the node is fenced + */ + dlm_send_queue_flush(node); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); mutex_unlock(&close_lock); return ret; diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 9f8c9605013d0..e7246fb3ef577 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 70a4752ed913a..e6b4c1a214466 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -11,6 +11,8 @@ #include #include +#include + #include "dlm_internal.h" #include "lockspace.h" @@ -42,6 +44,27 @@ static inline void set_version(struct dlm_plock_info *info) info->version[2] = DLM_PLOCK_VERSION_PATCH; } +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) +{ + struct plock_op *op = NULL, *iter; + + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info->fsid && + iter->info.number == info->number && + iter->info.owner == info->owner && + iter->info.pid == info->pid && + iter->info.start == info->start && +
iter->info.end == info->end && + iter->info.ex == info->ex && + iter->info.wait) { + op = iter; + break; + } + } + + return op; +} + static int check_version(struct dlm_plock_info *info) { if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || @@ -74,30 +97,26 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } -/* If a process was killed while waiting for the only plock on a file, locks_remove_posix will not see any lock on the file so it won't send an unlock-close to us to pass on to userspace to clean up the abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ - -static void do_unlock_close(const struct dlm_plock_info *info) +static int do_lock_cancel(const struct dlm_plock_info *orig_info) { struct plock_op *op; + int rv; op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) - return; + return -ENOMEM; + + op->info = *orig_info; + op->info.optype = DLM_PLOCK_OP_CANCEL; + op->info.wait = 0; - op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = info->pid; - op->info.fsid = info->fsid; - op->info.number = info->number; - op->info.start = 0; - op->info.end = OFFSET_MAX; - op->info.owner = info->owner; - - op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); + wait_event(recv_wq, (op->done != 0)); + + rv = op->info.rv; + + dlm_release_plock_op(op); + return rv; } int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, @@ -156,7 +175,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); if (op->info.wait) { - rv = wait_event_killable(recv_wq, (op->done != 0)); + rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); /* recheck under ops_lock if we got a done != 0, @@ -166,17 +185,37 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, spin_unlock(&ops_lock); goto do_lock_wait; } - list_del(&op->list); spin_unlock(&ops_lock); + rv = do_lock_cancel(&op->info); + switch (rv) { + case 0: + /* waiter was deleted in user space, the answer will never come; + * remove the original request. The original request must be + * on recv_list because the answer of do_lock_cancel() + * synchronized it. + */ + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + rv = -EINTR; + break; + case -ENOENT: + /* cancellation wasn't successful but op should be done */ + fallthrough; + default: + /* internal error doing cancel, we need to wait */ + goto wait; + } + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, (unsigned long long)number, op->info.pid); - do_unlock_close(&op->info); dlm_release_plock_op(op); goto out; } } else { +wait: wait_event(recv_wq, (op->done != 0)); } @@ -240,8 +279,8 @@ static int dlm_plock_callback(struct plock_op *op) rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ - log_print("dlm_plock_callback: lock granted after lock request " - "failed; dangling lock!\n"); + log_print("%s: lock granted after lock request failed; dangling lock!", + __func__); goto out; } @@ -318,6 +357,75 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } EXPORT_SYMBOL_GPL(dlm_posix_unlock); +/* + * NOTE: This implementation can only handle async lock requests as NFS + * does. It cannot handle cancellation of a pending lock request sitting + * in wait_event(), but for now NFS is the only local kernel user.
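+ * A cancel is implemented by sending DLM_PLOCK_OP_CANCEL to user space + * and waiting for the answer: 0 means the waiter was removed there, + * -ENOENT means the original request already completed.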
+ */ +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_plock_info info; + struct plock_op *op; + struct dlm_ls *ls; + int rv; + + /* this only works for async requests for now, and nfs is the + * only kernel user right now. + */ + if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) + return -EOPNOTSUPP; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + info.pid = fl->fl_pid; + info.ex = (fl->fl_type == F_WRLCK); + info.fsid = ls->ls_global_id; + dlm_put_lockspace(ls); + info.number = number; + info.start = fl->fl_start; + info.end = fl->fl_end; + info.owner = (__u64)fl->fl_pid; + + rv = do_lock_cancel(&info); + switch (rv) { + case 0: + spin_lock(&ops_lock); + /* lock request to cancel must be on recv_list because + * do_lock_cancel() synchronizes it. + */ + op = plock_lookup_waiter(&info); + if (WARN_ON_ONCE(!op)) { + spin_unlock(&ops_lock); + rv = -ENOLCK; + break; + } + + list_del(&op->list); + spin_unlock(&ops_lock); + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + op->data->callback(op->data->fl, -EINTR); + dlm_release_plock_op(op); + rv = -EINTR; + break; + case -ENOENT: + /* if cancel wasn't successful we probably were too late + * or it was a non-blocking lock request, so just unlock it. + */ + rv = dlm_posix_unlock(lockspace, number, file, fl); + break; + default: + break; + } + + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_cancel); + int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct file_lock *fl) { @@ -403,6 +511,8 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (!op) return -EAGAIN; + trace_dlm_plock_read(&info); + /* there is no need to get a reply from userspace for unlocks that were generated by the vfs cleaning up for a close (the process did not make an unlock call).
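+ * Such ops carry DLM_PLOCK_FL_CLOSE and are released right after being + * copied to user space, instead of being moved to recv_list to wait + * for a reply.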
*/ @@ -430,6 +540,8 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (copy_from_user(&info, u, sizeof(info))) return -EFAULT; + trace_dlm_plock_write(&info); + if (check_version(&info)) return -EINVAL; @@ -441,22 +553,11 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, */ spin_lock(&ops_lock); if (info.wait) { - list_for_each_entry(iter, &recv_list, list) { - if (iter->info.fsid == info.fsid && - iter->info.number == info.number && - iter->info.owner == info.owner && - iter->info.pid == info.pid && - iter->info.start == info.start && - iter->info.end == info.end && - iter->info.ex == info.ex && - iter->info.wait) { - op = iter; - break; - } - } + op = plock_lookup_waiter(&info); } else { list_for_each_entry(iter, &recv_list, list) { - if (!iter->info.wait) { + if (!iter->info.wait && + iter->info.fsid == info.fsid) { op = iter; break; } @@ -468,8 +569,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (info.wait) WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); else - WARN_ON(op->info.fsid != info.fsid || - op->info.number != info.number || + WARN_ON(op->info.number != info.number || op->info.owner != info.owner || op->info.optype != info.optype); @@ -534,5 +634,7 @@ int dlm_plock_init(void) void dlm_plock_exit(void) { misc_deregister(&plock_dev_misc); + WARN_ON(!list_empty(&send_list)); + WARN_ON(!list_empty(&recv_list)); } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f4afdf892f785..3b734aed26b54 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -28,7 +28,8 @@ static int rcom_response(struct dlm_ls *ls) } static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, char *mb, int mb_len) + struct dlm_rcom **rc_ret, char *mb, int mb_len, + uint64_t seq) { struct dlm_rcom *rc; @@ -41,16 +42,14 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_header.h_cmd = DLM_RCOM; rc->rc_type = cpu_to_le32(type); - - spin_lock(&ls->ls_recover_lock); - rc->rc_seq = cpu_to_le64(ls->ls_recover_seq); - spin_unlock(&ls->ls_recover_lock); + rc->rc_seq = cpu_to_le64(seq); *rc_ret = rc; } static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret, + uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_mhandle *mh; @@ -63,14 +62,14 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *mh_ret = mh; return 0; } static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, int len, struct dlm_rcom **rc_ret, - struct dlm_msg **msg_ret) + struct dlm_msg **msg_ret, uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_msg *msg; @@ -84,7 +83,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *msg_ret = msg; return 0; } @@ -170,7 +169,8 @@ static void disallow_sync_reply(struct dlm_ls *ls) * node's rcom_config. 
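+ * The caller now supplies the recovery sequence number to stamp into + * the rcom, so _create_rcom() no longer samples ls_recover_seq under + * ls_recover_lock.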
*/ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq) { struct dlm_rcom *rc; struct dlm_msg *msg; @@ -186,7 +186,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) retry: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &msg); + sizeof(struct rcom_status), &rc, &msg, + seq); if (error) goto out; @@ -220,7 +221,9 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) return error; } -static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_status(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, + uint64_t seq) { struct dlm_rcom *rc; struct rcom_status *rs; @@ -251,7 +254,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) do_create: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &msg); + len, &rc, &msg, seq); if (error) return; @@ -281,7 +284,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom_stateless(msg, rc); } -static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in) { spin_lock(&ls->ls_rcom_spin); if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || @@ -302,17 +305,18 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; - struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); @@ -320,7 +324,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -330,19 +334,20 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) return error; } -static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; int error, inlen, outlen, nodeid; - struct dlm_msg *msg; nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); inlen = le16_to_cpu(rc_in->rc_header.h_length) - sizeof(struct dlm_rcom); outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &mh, seq); if (error) return; rc->rc_id = rc_in->rc_id; @@ -350,10 +355,10 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); } -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq) { struct 
dlm_rcom *rc; struct dlm_mhandle *mh; @@ -361,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) int error; error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, r->res_name, r->res_length); @@ -372,7 +377,8 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, uint64_t seq) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -387,7 +393,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh, + seq); if (error) return; @@ -402,7 +409,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(mh, rc); } -static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup_reply(struct dlm_ls *ls, + const struct dlm_rcom *rc_in) { dlm_recover_master_reply(ls, rc_in); } @@ -437,7 +445,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); } -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq) { struct dlm_ls *ls = r->res_ls; struct dlm_rcom *rc; @@ -448,7 +456,8 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) if (lkb->lkb_lvbptr) len += ls->ls_lvblen; - error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh, + seq); if (error) goto out; @@ -462,23 +471,28 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } /* needs at least dlm_rcom + rcom_lock */ -static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + __le32 rl_remid, rl_result; + struct rcom_lock *rl; struct dlm_rcom *rc; struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); - dlm_recover_master_copy(ls, rc_in); + dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result); error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, - sizeof(struct rcom_lock), &rc, &mh); + sizeof(struct rcom_lock), &rc, &mh, seq); if (error) return; - /* We send back the same rcom_lock struct we received, but - dlm_recover_master_copy() has filled in rl_remid and rl_result */ - memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rl = (struct rcom_lock *)rc->rc_buf; + /* set rl_remid and rl_result from dlm_recover_master_copy() */ + rl->rl_remid = rl_remid; + rl->rl_result = rl_result; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; @@ -488,7 +502,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. 
*/ -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct rcom_config *rf; @@ -566,7 +580,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) /* Called by dlm_recv; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid) { int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); int stop, reply = 0, names = 0, lookup = 0, lock = 0; @@ -620,21 +634,21 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) switch (rc->rc_type) { case cpu_to_le32(DLM_RCOM_STATUS): - receive_rcom_status(ls, rc); + receive_rcom_status(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_NAMES): - receive_rcom_names(ls, rc); + receive_rcom_names(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOOKUP): - receive_rcom_lookup(ls, rc); + receive_rcom_lookup(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOCK): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - receive_rcom_lock(ls, rc); + receive_rcom_lock(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_STATUS_REPLY): @@ -652,7 +666,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) case cpu_to_le32(DLM_RCOM_LOCK_REPLY): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - dlm_recover_process_copy(ls, rc); + dlm_recover_process_copy(ls, rc, seq); break; default: diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index 454d3c4814abe..765926ae0020d 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -12,12 +12,15 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq); +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, + int nodeid); +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in); #endif diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 29d71a5018d43..53917c0aa3c07 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -93,7 +93,7 @@ void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) } static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, - int save_slots) + int save_slots, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -107,7 +107,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) goto out; @@ -126,7 +126,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, } static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, - uint32_t status_flags) + uint32_t 
status_flags, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -137,7 +137,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, nodeid, status_flags); + error = dlm_rcom_status(ls, nodeid, status_flags, seq); if (error) break; @@ -151,22 +151,22 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, return error; } -static int wait_status(struct dlm_ls *ls, uint32_t status) +static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq) { uint32_t status_all = status << 1; int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status, 0); + error = wait_status_all(ls, status, 0, seq); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all, 0); + error = wait_status_low(ls, status_all, 0, seq); return error; } -int dlm_recover_members_wait(struct dlm_ls *ls) +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; struct dlm_slot *slots; @@ -180,7 +180,7 @@ int dlm_recover_members_wait(struct dlm_ls *ls) } if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, DLM_RS_NODES, 1); + error = wait_status_all(ls, DLM_RS_NODES, 1, seq); if (error) goto out; @@ -199,7 +199,8 @@ int dlm_recover_members_wait(struct dlm_ls *ls) dlm_set_recover_status(ls, DLM_RS_NODES_ALL); } } else { - error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + error = wait_status_low(ls, DLM_RS_NODES_ALL, + DLM_RSF_NEED_SLOTS, seq); if (error) goto out; @@ -209,19 +210,19 @@ int dlm_recover_members_wait(struct dlm_ls *ls) return error; } -int dlm_recover_directory_wait(struct dlm_ls *ls) +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DIR); + return wait_status(ls, DLM_RS_DIR, seq); } -int dlm_recover_locks_wait(struct dlm_ls *ls) +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_LOCKS); + return wait_status(ls, DLM_RS_LOCKS, seq); } -int dlm_recover_done_wait(struct dlm_ls *ls) +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DONE); + return wait_status(ls, DLM_RS_DONE, seq); } /* @@ -441,7 +442,7 @@ static void set_new_master(struct dlm_rsb *r) * equals our_nodeid below). */ -static int recover_master(struct dlm_rsb *r, unsigned int *count) +static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq) { struct dlm_ls *ls = r->res_ls; int our_nodeid, dir_nodeid; @@ -472,7 +473,7 @@ static int recover_master(struct dlm_rsb *r, unsigned int *count) error = 0; } else { recover_idr_add(r); - error = dlm_send_rcom_lookup(r, dir_nodeid); + error = dlm_send_rcom_lookup(r, dir_nodeid, seq); } (*count)++; @@ -520,7 +521,7 @@ static int recover_master_static(struct dlm_rsb *r, unsigned int *count) * the correct dir node. 
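+ * Remote lookups sent via dlm_send_rcom_lookup() are tagged with the + * sequence number of the current recovery pass.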
*/ -int dlm_recover_masters(struct dlm_ls *ls) +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; unsigned int total = 0; @@ -542,7 +543,7 @@ int dlm_recover_masters(struct dlm_ls *ls) if (nodir) error = recover_master_static(r, &count); else - error = recover_master(r, &count); + error = recover_master(r, &count, seq); unlock_rsb(r); cond_resched(); total++; @@ -563,7 +564,7 @@ int dlm_recover_masters(struct dlm_ls *ls) return error; } -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc) { struct dlm_rsb *r; int ret_nodeid, new_master; @@ -614,13 +615,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) * an equal number of replies then recovery for the rsb is done */ -static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head, + uint64_t seq) { struct dlm_lkb *lkb; int error = 0; list_for_each_entry(lkb, head, lkb_statequeue) { - error = dlm_send_rcom_lock(r, lkb); + error = dlm_send_rcom_lock(r, lkb, seq); if (error) break; r->res_recover_locks_count++; @@ -629,7 +631,7 @@ static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) return error; } -static int recover_locks(struct dlm_rsb *r) +static int recover_locks(struct dlm_rsb *r, uint64_t seq) { int error = 0; @@ -637,13 +639,13 @@ static int recover_locks(struct dlm_rsb *r) DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); - error = recover_locks_queue(r, &r->res_grantqueue); + error = recover_locks_queue(r, &r->res_grantqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_convertqueue); + error = recover_locks_queue(r, &r->res_convertqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_waitqueue); + error = recover_locks_queue(r, &r->res_waitqueue, seq); if (error) goto out; @@ -656,7 +658,7 @@ static int recover_locks(struct dlm_rsb *r) return error; } -int dlm_recover_locks(struct dlm_ls *ls) +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; int error, count = 0; @@ -677,7 +679,7 @@ int dlm_recover_locks(struct dlm_ls *ls) goto out; } - error = recover_locks(r); + error = recover_locks(r, seq); if (error) { up_read(&ls->ls_root_sem); goto out; diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h index 235e0d25cd48c..dbc51013ecadb 100644 --- a/fs/dlm/recover.h +++ b/fs/dlm/recover.h @@ -15,13 +15,13 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); uint32_t dlm_recover_status(struct dlm_ls *ls); void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); -int dlm_recover_members_wait(struct dlm_ls *ls); -int dlm_recover_directory_wait(struct dlm_ls *ls); -int dlm_recover_locks_wait(struct dlm_ls *ls); -int dlm_recover_done_wait(struct dlm_ls *ls); -int dlm_recover_masters(struct dlm_ls *ls); -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_locks(struct dlm_ls *ls); +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq); void dlm_recovered_lock(struct 
dlm_rsb *r); int dlm_create_root_list(struct dlm_ls *ls); void dlm_release_root_list(struct dlm_ls *ls); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 19da816cfb09d..4d17491dea2fe 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_NODES); - error = dlm_recover_members_wait(ls); + error = dlm_recover_members_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_members_wait error %d", error); goto fail; @@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * nodes their master rsb names that hash to us. */ - error = dlm_recover_directory(ls); + error = dlm_recover_directory(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory error %d", error); goto fail; @@ -111,7 +111,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DIR); - error = dlm_recover_directory_wait(ls); + error = dlm_recover_directory_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory_wait error %d", error); goto fail; @@ -145,7 +145,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * departed nodes. */ - error = dlm_recover_masters(ls); + error = dlm_recover_masters(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_masters error %d", error); goto fail; @@ -155,7 +155,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * Send our locks on remastered rsb's to the new masters. */ - error = dlm_recover_locks(ls); + error = dlm_recover_locks(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks error %d", error); goto fail; @@ -163,7 +163,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -187,7 +187,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) */ dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -206,7 +206,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DONE); - error = dlm_recover_done_wait(ls); + error = dlm_recover_done_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_done_wait error %d", error); goto fail; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 8be2893ad15bb..892d6ca21e742 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -30,7 +30,8 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. 
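+ * The message is copied into an rq_entry together with the sending + * nodeid so dlm_process_requestqueue() can replay it once the + * lockspace is enabled again.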
*/ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms) { struct rq_entry *e; int length = le16_to_cpu(ms->m_header.h_length) - diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index 4e403469a8458..42bfe23ceabea 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -11,7 +11,8 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index d1dbe47c7975f..c20704aa21b3d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -386,6 +386,7 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there + * @parent: parent directory inode, if wanted * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1f72f977c6dba..79b20d6ae39ec 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata clusters at + * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; @@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } - return EXT4_NUM_B2C(sbi, num); + return num; } + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5504f72bbbbe7..6fe3c941b5651 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb) struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); @@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb) return -ENOMEM; for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + cond_resched(); - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) { + if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1, 0); + meta_blks, 0); if (ret) goto err; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e20ac0654b3f2..453d4da5de520 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -33,6 +33,8 @@ 
int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 481491e892dfe..9418359b1d9d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -176,9 +176,6 @@ enum criteria { EXT4_MB_NUM_CRS }; -/* criteria below which we use fast block scanning and avoid unnecessary IO */ -#define CR_FAST CR_GOAL_LEN_SLOW - /* * Flags used in mballoc's allocation_context flags field. * @@ -1241,6 +1238,7 @@ struct ext4_inode_info { #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1258,10 +1256,8 @@ struct ext4_inode_info { #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1708,10 +1704,13 @@ struct ext4_sb_info { const char *s_last_error_func; time64_t s_last_error_time; /* - * If we are in a context where we cannot update error information in - * the on-disk superblock, we queue this work to do it. + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. 
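+ * The work struct is therefore named s_sb_upd_work rather than + * s_error_work.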
*/ - struct work_struct s_error_work; + struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -1804,7 +1803,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; @@ -2228,9 +2226,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_forced_shutdown(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* @@ -2708,7 +2706,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2864,7 +2861,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2907,7 +2903,6 @@ extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); @@ -2930,6 +2925,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, @@ -2983,7 +2982,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -3090,6 +3088,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, @@ -3531,8 +3531,6 @@ extern loff_t 
ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b38d59581411c..d1a2e66244017 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -67,11 +67,12 @@ static int ext4_journal_check_start(struct super_block *sb) might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb44..6f7de14c0fa86 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -878,23 +878,29 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) @@ -1491,8 +1497,12 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + /* Free preallocated extent if it didn't get used. */ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; @@ -2047,19 +2057,25 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (allocated) __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. 
*/ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2dc3f8301225a..6830ea3a6c59c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -476,6 +476,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +497,12 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because the overwrite + * constraint is already enforced above. The inode lock is already held + * exclusive if the write is non-overwrite or extending, so drain all + * outstanding dio and set the force wait dio flag.
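A condensed model of the exclusivity rule described above may help. This is an illustrative sketch, not the ext4 function itself; the predicate name and its inputs are hypothetical distillations of the checks that ext4_dio_write_checks() performs:

	/* Sketch: when must an ext4 DIO write take the exclusive inode lock? */
	static bool dio_write_needs_exclusive(bool extend, bool overwrite)
	{
		/*
		 * A pure overwrite may run under the shared lock even when
		 * unaligned; extending or non-overwrite I/O must be exclusive
		 * because partial-block zeroing can race otherwise.
		 */
		return extend || !overwrite;
	}

When the exclusive path is taken for blocking I/O, the caller additionally drains outstanding DIO and sets the force-wait flag, as the surrounding hunks show.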
*/ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; @@ -709,7 +704,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX @@ -806,10 +801,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* @@ -885,7 +879,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1f..b40d3b29f7e5c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,9 +131,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa17..deabe29da7fbc 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 48abef5f23e7f..b65058d972f95 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 
ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); @@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 0038610373745..012d9259ff532 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89737d5a16148..4ce35f1c8b0a8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. @@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * the fs shutdown state instead of sb->s_flags' SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */
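ext4_forced_shutdown() now takes the super_block directly and also subsumes what EXT4_MF_FS_ABORTED used to signal, so every entry point can use one uniform guard. A minimal sketch of that pattern follows; the function name here is hypothetical, while the helper and the -EIO convention come from the hunks above:

	/* Sketch of the fail-fast guard now used at ext4 entry points. */
	static int ext4_example_entry_point(struct inode *inode)
	{
		if (unlikely(ext4_forced_shutdown(inode->i_sb)))
			return -EIO;	/* fs is shut down; do no further work */

		/* ... normal processing ... */
		return 0;
	}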
- if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } @@ -2759,7 +2757,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2796,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2855,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -2937,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which is all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize.
+ */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2963,30 +3020,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). 
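The ordering in ext4_da_do_write_end() above is subtle enough to be worth restating as a numbered outline. This fragment is a condensed, illustrative reading of the function, not a replacement for it; error handling and the inline-data case are elided:

	/* 1: fold the copy into the page cache; marks the inode I_DIRTY_PAGES */
	copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);

	/* 2: publish the new i_size while the page lock still blocks
	 *    writeout from zeroing past EOF */
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	unlock_page(page);
	put_page(page);

	/* 3: only after i_size is visible, widen the page cache window */
	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);

	/* 4: if i_disksize moved, journal the inode in a short handle */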
- */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* @@ -4940,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; @@ -5131,11 +5168,10 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5291,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5674,7 +5710,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5700,7 +5736,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b0349f4518637..0bfe2ce589e22 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -802,7 +802,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e8..c91db9f57524c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -874,7 +874,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; + struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) @@ -883,7 +883,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; @@ -892,28 +891,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } - grp = NULL; 
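The reshaped mballoc scan loops in this area all follow the same refactor: drop the grp sentinel that was set inside the loop and tested afterwards, and instead return (or fall through to the criteria-escalation path) directly. A toy, compilable illustration of the resulting shape, with hypothetical types standing in for the mballoc structures:

	#include <stdbool.h>
	#include <stddef.h>

	struct toy_group { int id; bool good; };

	/* After the refactor: the first match returns immediately and the
	 * fall-through handles "nothing found" by escalating the criteria. */
	static int pick_group(const struct toy_group *g, size_t n, int *next_cr)
	{
		for (size_t i = 0; i < n; i++)
			if (g[i].good)
				return g[i].id;	/* no sentinel, no post-loop test */

		(*next_cr)++;			/* not found: try the next criteria */
		return -1;
	}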
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; + *group = iter->bb_group; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; } - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - } + /* Increment cr and search again if no group is found */ + *new_cr = CR_GOAL_LEN_FAST; } /* @@ -966,16 +959,25 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) *new_cr = CR_BEST_AVAIL_LEN; - } + else + *new_cr = CR_GOAL_LEN_SLOW; } /* @@ -1051,18 +1053,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; - } + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1080,8 +1080,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) * Return next linear group for allocation. 
If linear traversal should not be * performed, this function just returns the same group */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static ext4_group_t +next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, + ext4_group_t ngroups) { if (!should_optimize_scan(ac)) goto inc_and_return; @@ -1255,7 +1256,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; + unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; @@ -2450,7 +2451,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2553,7 +2554,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; @@ -2634,7 +2635,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criteria except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2658,7 +2664,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2787,8 +2793,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2800,10 +2806,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests up to the maximum buddy size we have constructed.
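The is_power_of_2() conversion just below replaces a hand-rolled mask test. For readers unfamiliar with the trick, the two forms accept exactly the same inputs; this is a self-contained user-space demonstration (the kernel's version lives in linux/log2.h, and fls is emulated here with a compiler builtin):

	#include <assert.h>

	static int fls32(unsigned int x)	/* highest set bit, 1-based */
	{
		return x ? 32 - __builtin_clz(x) : 0;
	}

	/* Old form: clear the top set bit, check that nothing remains. */
	static int old_test(unsigned int len)
	{
		return len && (len & ~(1u << (fls32(len) - 1))) == 0;
	}

	/* New form: the classic n & (n - 1) power-of-two test. */
	static int new_test(unsigned int len)
	{
		return len && (len & (len - 1)) == 0;
	}

	int main(void)
	{
		for (unsigned int len = 1; len < 1u << 20; len++)
			assert(old_test(len) == new_test(len));
		return 0;
	}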
*/ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } @@ -2848,11 +2851,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups + * prefetching at inexpensive CR, otherwise mballoc + * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr >= CR_FAST || + (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -3501,11 +3504,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3879,12 +3881,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; + LIST_HEAD(freed_data_list); struct list_head *cut_pos = NULL; bool wake; - INIT_LIST_HEAD(&freed_data_list); - spin_lock(&sbi->s_md_lock); list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { if (entry->efd_tid != commit_tid) @@ -4084,7 +4084,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; + int i, err = 0; int already; unsigned int clen, clen_changed, thisgrp_len; @@ -4222,12 +4222,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4237,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4259,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = 
*start; new_end = *end; @@ -4284,7 +4285,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4365,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4404,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; @@ -4432,7 +4431,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4766,7 +4765,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4860,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -4984,7 +4980,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } - return; } /* @@ -5180,8 +5175,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -5200,29 +5198,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * still cover original start * 3. Else, keep the best ex at start of original request. 
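The extent_logical_end() and pa_logical_end() helpers used above (defined in mballoc.h later in this series) exist because the end computation can overflow the 32-bit ext4_lblk_t. The self-contained program below reproduces the failure mode; the typedef is a stand-in for the kernel type and the values are invented:

	#include <assert.h>
	#include <stdint.h>

	typedef uint32_t lblk_t;	/* stand-in for ext4_lblk_t */

	int main(void)
	{
		lblk_t start = 0xffffff00u;	/* logical start near 2^32 */
		uint32_t len = 0x200;		/* length in blocks */

		lblk_t wrapped = start + len;	/* 32-bit addition wraps */
		assert(wrapped < start);

		/* widen first, as the new helpers do with (loff_t) */
		int64_t end = (int64_t)start + len;
		assert(end == 0x100000100LL);	/* the true, unwrapped end */
		return 0;
	}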
*/ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) - goto adjust_bex; + ex.fe_len = ac->ac_b_ex.fe_len; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + goto adjust_bex; + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5419,7 +5411,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5448,7 +5440,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5529,7 +5520,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; @@ -5546,8 +5537,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active), needed); - INIT_LIST_HEAD(&list); - if (needed == 0) needed = UINT_MAX; @@ -5671,7 +5660,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5705,7 +5694,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" @@ -5738,12 +5727,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5769,7 +5756,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -5865,13 +5852,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - 
struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, @@ -5984,12 +5969,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* @@ -6102,7 +6084,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || @@ -6643,7 +6625,6 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; } /** @@ -6746,7 +6727,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** @@ -6936,8 +6916,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -7154,8 +7133,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c22741..d7aeb5da7d867 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. 
*/ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6ec..bd946d0c71b70 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 933ad03f4f585..41a6411c600b1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1445,7 +1445,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; @@ -1496,7 +1496,7 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, @@ -2393,7 +2393,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif @@ -2799,6 +2799,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -3142,7 +3143,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in @@ -3302,7 +3303,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); @@ -3370,7 +3371,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, @@ -3437,6 +3438,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -4021,6 +4023,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); @@ -4187,7 +4190,7 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + if 
(unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec6712..dfdd7e5cf0389 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 73547d2334fd7..38217422f9388 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -434,6 +434,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than an hour has passed since the last superblock update, and + * 2. More than 16MB have been written since the last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || + !journal || (journal->j_flags & JBD2_UNMOUNT)) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Compute the number of kilobytes written since s_kbytes_written + * was last recorded in the on-disk superblock and compare it with + * the 16MB threshold. This bounds how often the superblock is + * committed (i.e. not more often than once per 16MB written when + * updates are at least an hour apart). + */
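To make the arithmetic concrete, here is a small user-space rendition of the threshold check; all values are invented and the part_stat_read() plumbing is replaced by plain integers:

	#include <assert.h>
	#include <stdint.h>

	#define REFRESH_INTERVAL_KB 16384	/* 16 MB, as in the patch */

	int main(void)
	{
		uint64_t sb_kbytes_written = 1000;	/* value recorded on disk */
		uint64_t in_memory_kb = 1000;		/* sbi->s_kbytes_written */
		uint64_t sectors_now = 41984;		/* lifetime 512-byte sectors */
		uint64_t sectors_at_mount = 8192;

		/* 512-byte sectors -> kilobytes is a shift by one */
		uint64_t lifetime_kb = in_memory_kb +
				       ((sectors_now - sectors_at_mount) >> 1);
		assert(lifetime_kb == 1000 + 16896);

		/* fires only when BOTH gates pass: an hour has elapsed
		 * (checked earlier) and more than 16MB is unaccounted for */
		uint64_t diff_kb = lifetime_kb - sb_kbytes_written;
		assert(diff_kb > REFRESH_INTERVAL_KB);
		return 0;
	}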
+ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); +} + /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -460,6 +511,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { @@ -658,7 +710,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -672,7 +724,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * defer superblock flushing to a workqueue. */ if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -699,10 +751,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_flags |= SB_RDONLY; } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -716,6 +768,7 @@ static void flush_stashed_error_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -723,6 +776,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -736,7 +793,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: @@ -759,7 +819,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -784,7 +844,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -819,7 +879,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -899,7 +959,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if
(unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -993,7 +1053,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -1019,7 +1079,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1097,26 +1157,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &fs_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; @@ -1251,10 +1291,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flushing sbi->s_sb_upd_work. * Since a user may read /proc/fs/ext4/xx/mb_groups during umount, a * failed metadata read there can queue error work, and - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work calling start_this_handle may then trigger * BUG_ON.
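Taken together, the s_sb_upd_work conversions imply a strict ordering at teardown, which the comment above spells out for ext4_put_super(). A condensed sketch of the rule follows; only the three called functions are real APIs, the wrapper is hypothetical, and the real teardown interleaves quota and orphan cleanup that is elided here:

	/* Sketch: why sysfs goes first and the journal goes last. */
	static void teardown_order_sketch(struct super_block *sb)
	{
		struct ext4_sb_info *sbi = EXT4_SB(sb);

		ext4_unregister_sysfs(sb);	/* 1: stop readers that could
						 *    queue new error work */
		flush_work(&sbi->s_sb_upd_work);/* 2: drain pending superblock
						 *    updates */
		jbd2_journal_destroy(sbi->s_journal);	/* 3: safe only now */
		sbi->s_journal = NULL;
	}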
*/ ext4_unregister_sysfs(sb); @@ -1266,7 +1306,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); @@ -1875,6 +1915,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1943,8 +1984,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -2095,12 +2134,6 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; @@ -2164,9 +2197,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2820,8 +2850,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -4210,7 +4238,7 @@ int ext4_calculate_overhead(struct super_block *sb) else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -4948,8 +4976,8 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before destroying the journal. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; @@ -5272,7 +5300,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) @@ -5615,16 +5643,16 @@ failed_mount9: __maybe_unused sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before journal destroy. 
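The journal-open helpers in the following hunks switch from returning NULL to propagating ERR_PTR() values, so ext4_load_journal() can surface the precise errno instead of a blanket -EINVAL. The standard idiom, condensed (the sketch function is hypothetical; ERR_PTR/IS_ERR/PTR_ERR/ERR_CAST are the real helpers from linux/err.h):

	static int load_journal_sketch(struct super_block *sb, unsigned int inum)
	{
		journal_t *journal = ext4_open_inode_journal(sb, inum);

		if (IS_ERR(journal))
			return PTR_ERR(journal);  /* exact errno, e.g. -EFSCORRUPTED */
		/* ... use the journal ... */
		return 0;
	}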
*/ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); @@ -5751,22 +5779,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5792,24 +5820,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5817,43 +5842,47 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; + int errno; /* see get_tree_bdev why this is needed and safe */ up_write(&sb->s_umount); - bdev = ext4_blkdev_get(j_dev, sb); + bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, + &fs_holder_ops); down_write(&sb->s_umount); - if (bdev == NULL) - return NULL; + if (IS_ERR(bdev)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); + return ERR_CAST(bdev); + } blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); - if (!(bh = 
__bread(bdev, sb_block, blocksize))) { + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5861,57 +5890,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + errno = -EFSCORRUPTED; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev; - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { +out_bh: + brelse(bh); +out_bdev: + blkdev_put(bdev, sb); + return ERR_PTR(errno); +} + +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct block_device *journal_bdev; + int errno = 0; + + journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(journal_bdev)) + return ERR_CAST(journal_bdev); + + journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + j_len, sb->s_blocksize); + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev = journal_bdev; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(bdev, sb); - return NULL; + blkdev_put(journal_bdev, sb); + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5943,13 +5989,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); @@ -6066,7 +6112,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the 
future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6264,13 +6310,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -6280,7 +6320,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); @@ -6329,12 +6369,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -6368,7 +6403,7 @@ static int ext4_freeze(struct super_block *sb) */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6484,7 +6519,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -6498,10 +6533,10 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } @@ -6662,7 +6697,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); sb->s_flags = old_sb_flags; @@ -7071,6 +7106,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. 
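
One pattern worth calling out across the journal-open rework above: helpers that used to return bare NULL on any failure now return ERR_PTR() values, so callers can propagate a meaningful errno (-EFSCORRUPTED for a corrupt journal inode, -EINVAL for a bad device) instead of collapsing everything to -EINVAL. A minimal sketch of the convention; get_journal_inode() and journal_from_inode() are hypothetical stand-ins:

    #include <linux/err.h>
    #include <linux/fs.h>

    static journal_t *open_fs_journal(struct super_block *sb)
    {
            struct inode *inode = get_journal_inode(sb);

            if (IS_ERR(inode))
                    return ERR_CAST(inode);  /* same errno, new pointer type */
            if (!inode->i_nlink) {
                    iput(inode);
                    return ERR_PTR(-EFSCORRUPTED);
            }
            return journal_from_inode(inode);
    }

    /* caller side: forward the real error code */
    journal = open_fs_journal(sb);
    if (IS_ERR(journal))
            return PTR_ERR(journal);
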
+ */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 281e1bfbbe3ec..92ba28cebac63 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 34cd57c6b68df..f2700477a3001 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1436,17 +1436,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; } - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9ec91017a7f3c..118699fff2f90 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -40,18 +40,6 @@ static inline void __buffer_unlink(struct journal_head *jh) } } -/* - * Check a checkpoint buffer could be release or not. - * - * Requires j_list_lock - */ -static inline bool __cp_buffer_busy(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); -} - /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * @@ -349,6 +337,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; + /* * journal_shrink_one_cp_list * @@ -360,7 +350,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - bool destroy, bool *released) + enum shrink_type type, + bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -376,12 +367,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (destroy) { + if (type == SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); - if (ret < 0) - continue; + if (ret < 0) { + if (type == SHRINK_BUSY_SKIP) + continue; + break; + } } nr_freed++; @@ -445,7 +439,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - false, &released); + SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -485,19 +479,21 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; + enum shrink_type type; bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) return; + type = destroy ? 
SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; journal_shrink_one_cp_list(transaction->t_checkpoint_list, - destroy, &released); + type, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -631,6 +627,8 @@ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + if (jh->b_transaction) + return -EBUSY; if (!trylock_buffer(bh)) return -EBUSY; if (buffer_dirty(bh)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1b5a45ab62b0d..768fa05bcbede 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -115,14 +115,6 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3_feature(j)) - return 1; - - return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; -} - static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { __u32 csum; @@ -1333,6 +1325,189 @@ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, return count; } +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock(journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Check the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_check_superblock(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + int num_fc_blks; + int err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + return err; + } + + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + return err; + } + + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + return err; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_total_len) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + return err; + } + + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); + return err; + } + + num_fc_blks = jbd2_has_feature_fast_commit(journal) ? 
+ jbd2_journal_get_num_fc_blks(sb) : 0; + if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || + be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { + printk(KERN_ERR "JBD2: journal file too short %u,%d\n", + be32_to_cpu(sb->s_maxlen), num_fc_blks); + return err; + } + + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + return err; + } + + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + return err; + } + + /* Load the checksum driver */ + if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + return err; + } + + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + return err; + } + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + return err; + } + } + + return 0; +} + +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. 
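
journal_revoke_records_per_block(), moved here unchanged, is plain arithmetic: usable space is the block size minus the revoke header (and minus the checksum tail when v2/v3 checksums are on), divided by the record size, 8 bytes with the 64-bit feature and 4 otherwise. A worked example, assuming the usual 16-byte revoke header and 4-byte block tail:

    /* illustrative recomputation, not the jbd2 helper itself */
    int space = 4096 - 16;   /* j_blocksize minus revoke header */
    space -= 4;              /* block tail, when csum v2/v3 is on */
    int records = space / 8; /* 64-bit block numbers */
    /* records == 509 per 4k descriptor block */
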
+ */ +static int journal_load_superblock(journal_t *journal) +{ + int err; + struct buffer_head *bh; + journal_superblock_t *sb; + + bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, + journal->j_blocksize); + if (bh) + err = bh_read(bh, 0); + if (!bh || err < 0) { + pr_err("%s: Cannot read journal superblock\n", __func__); + brelse(bh); + return -EIO; + } + + journal->j_sb_buffer = bh; + sb = (journal_superblock_t *)bh->b_data; + journal->j_superblock = sb; + err = journal_check_superblock(journal); + if (err) { + journal_fail_superblock(journal); + return err; + } + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); + + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + + if (jbd2_has_feature_fast_commit(journal)) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + journal->j_last = journal->j_fc_last - + jbd2_journal_get_num_fc_blks(sb); + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } + + return 0; +} + + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1349,12 +1524,21 @@ static journal_t *journal_init_common(struct block_device *bdev, static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; - struct buffer_head *bh; int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - return NULL; + return ERR_PTR(-ENOMEM); + + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_total_len = len; + + err = journal_load_superblock(journal); + if (err) + goto err_cleanup; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_done_commit); @@ -1367,12 +1551,15 @@ static journal_t *journal_init_common(struct block_device *bdev, mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_history_lock); rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1382,18 +1569,11 @@ static journal_t *journal_init_common(struct block_device *bdev, if (err) goto err_cleanup; - spin_lock_init(&journal->j_history_lock); - - lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_total_len = len; - /* We need enough buffers to write out full descriptor block. 
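
The fast-commit carve-out in journal_load_superblock() above is easier to see with numbers. journal_check_superblock() has already rejected superblocks where s_maxlen minus the fast-commit blocks would leave fewer than JBD2_MIN_JOURNAL_BLOCKS, so the regular log and the fast-commit area partition the journal cleanly (example values):

    /* with s_maxlen == 32768 and 256 fast-commit blocks */
    journal->j_fc_last  = 32768;        /* end of the whole journal   */
    journal->j_last     = 32768 - 256;  /* 32512, end of regular log  */
    journal->j_fc_first = 32512 + 1;    /* fc area: 32513..32768      */

Because j_last now already excludes the fast-commit area, the recovery wrap() macro later in the series can drop its fast-commit special case and wrap on j_last alone.
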
*/ + /* + * journal descriptor can store up to n blocks, we need enough + * buffers to write out full descriptor block. + */ + err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1402,37 +1582,30 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - pr_err("%s: Cannot get buffer for journal superblock\n", - __func__); + err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, + GFP_KERNEL); + if (err) goto err_cleanup; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; journal->j_shrink_transaction = NULL; journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; journal->j_shrinker.count_objects = jbd2_journal_shrink_count; journal->j_shrinker.seeks = DEFAULT_SEEKS; journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (err) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { - percpu_counter_destroy(&journal->j_checkpoint_jh_count); - goto err_cleanup; - } return journal; err_cleanup: - brelse(journal->j_sb_buffer); + percpu_counter_destroy(&journal->j_checkpoint_jh_count); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); + journal_fail_superblock(journal); kfree(journal); - return NULL; + return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1465,8 +1638,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); @@ -1492,11 +1665,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) blocknr = 0; err = bmap(inode, &blocknr); - if (err || !blocknr) { - pr_err("%s: Cannot locate journal superblock\n", - __func__); - return NULL; + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", @@ -1506,8 +1677,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), @@ -1518,18 +1689,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return journal; } -/* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock(journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - /* * Given a journal_t structure, initialise the various fields for * startup of a new journaling session. 
We use this both when creating @@ -1886,163 +2045,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); -static int journal_revoke_records_per_block(journal_t *journal) -{ - int record_size; - int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); - - if (jbd2_has_feature_64bit(journal)) - record_size = 8; - else - record_size = 4; - - if (jbd2_journal_has_csum_v2or3(journal)) - space -= sizeof(struct jbd2_journal_block_tail); - return space / record_size; -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (buffer_verified(bh)) - return 0; - - err = bh_read(bh, 0); - if (err < 0) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); - goto out; - } - - if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && - be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { - printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { - printk(KERN_WARNING "JBD2: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_total_len) { - printk(KERN_WARNING - "JBD2: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - if (jbd2_has_feature_csum2(journal) && - jbd2_has_feature_csum3(journal)) { - /* Can't have checksum v2 and v3 at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " - "at the same time!\n"); - goto out; - } - - if (jbd2_journal_has_csum_v2or3_feature(journal) && - jbd2_has_feature_checksum(journal)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " - "at the same time!\n"); - goto out; - } - - if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; - } - - /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3_feature(journal)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - goto out; - } - /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } - } - set_buffer_verified(bh); - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. 
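
The path being deleted here validated the superblock lazily, on first use, guarded by buffer_verified(); with validation done once in journal_init_common(), the buffer is simply read and pinned for the journal's lifetime. The shape of that one-shot pattern, as a sketch:

    #include <linux/buffer_head.h>

    struct buffer_head *bh;

    bh = getblk_unmovable(bdev, blocknr, blocksize);
    if (!bh || bh_read(bh, 0) < 0) {
            brelse(bh);              /* brelse(NULL) is a no-op */
            return -EIO;
    }
    journal->j_sb_buffer = bh;       /* held until journal destroy */
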
- */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - int num_fc_blocks; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_errno = be32_to_cpu(sb->s_errno); - journal->j_last = be32_to_cpu(sb->s_maxlen); - - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); - - if (jbd2_has_feature_fast_commit(journal)) { - journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); - if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) - journal->j_last = journal->j_fc_last - num_fc_blocks; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } - - return 0; -} - - /** * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. @@ -2054,28 +2056,7 @@ static int load_superblock(journal_t *journal) int jbd2_journal_load(journal_t *journal) { int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - /* - * If this is a V2 superblock, then we have to check the - * features flags on it. - */ - if (jbd2_format_support_feature(journal)) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk(KERN_WARNING - "JBD2: Unrecognised features on journal\n"); - return -EINVAL; - } - } + journal_superblock_t *sb = journal->j_superblock; /* * Create a slab for this blocksize @@ -2086,8 +2067,11 @@ int jbd2_journal_load(journal_t *journal) /* Let the recovery code check whether it needs to recover any * data from the journal. */ - if (jbd2_journal_recover(journal)) - goto recovery_error; + err = jbd2_journal_recover(journal); + if (err) { + pr_warn("JBD2: journal recovery failed\n"); + return err; + } if (journal->j_failed_commit) { printk(KERN_ERR "JBD2: journal transaction %u on %s " @@ -2104,15 +2088,14 @@ int jbd2_journal_load(journal_t *journal) /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. 
*/ - if (journal_reset(journal)) - goto recovery_error; + err = journal_reset(journal); + if (err) { + pr_warn("JBD2: journal reset failed\n"); + return err; + } journal->j_flags |= JBD2_LOADED; return 0; - -recovery_error: - printk(KERN_WARNING "JBD2: recovery failed\n"); - return -EIO; } /** @@ -2224,8 +2207,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - if (journal_get_superblock(journal)) - return 0; if (!jbd2_format_support_feature(journal)) return 0; @@ -2515,16 +2496,12 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags) int jbd2_journal_wipe(journal_t *journal, int write) { - int err = 0; + int err; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); - err = load_superblock(journal); - if (err) - return err; - if (!journal->j_tail) - goto no_recovery; + return 0; printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); @@ -2537,7 +2514,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) mutex_unlock(&journal->j_checkpoint_mutex); } - no_recovery: return err; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0184931d47f7d..c269a7d29a465 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -230,12 +230,8 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) /* Make sure we wrap around the log correctly! */ #define wrap(journal, var) \ do { \ - unsigned long _wrap_last = \ - jbd2_has_feature_fast_commit(journal) ? \ - (journal)->j_fc_last : (journal)->j_last; \ - \ - if (var >= _wrap_last) \ - var -= (_wrap_last - (journal)->j_first); \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ } while (0) static int fc_do_one_pass(journal_t *journal, @@ -524,9 +520,7 @@ static int do_one_pass(journal_t *journal, break; jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", - next_commit_ID, next_log_block, - jbd2_has_feature_fast_commit(journal) ? - journal->j_fc_last : journal->j_last); + next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index a14a0f18a4c40..88afd108c2dd2 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -269,6 +269,7 @@ int dbUnmount(struct inode *ipbmap, int mounterror) /* free the memory for the in-memory bmap. */ kfree(bmp); + JFS_SBI(ipbmap->i_sb)->bmap = NULL; return (0); } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index ae99a7e232eeb..63d21822d309b 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -166,7 +166,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) /* * COMMIT_SyncList flags an anonymous tlock on page that is on * sync list. - * We need to commit the inode to get the page written disk. + * We need to commit the inode to get the page written to the disk. */ if (test_and_clear_cflag(COMMIT_Synclist,ip)) jfs_commit_inode(ip, 0); @@ -311,6 +311,11 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * blocks in the map. in that case, we'll start off with the * maximum free. */ + + /* give up if no space left */ + if (bmp->db_maxfreebud == -1) + return -ENOSPC; + max = (s64) 1 << bmp->db_maxfreebud; if (*nblocks >= max && *nblocks > nbperpage) nb = nblks = (max > nbperpage) ? 
max : nbperpage; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a40383aa6c84b..923a58422c461 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -193,6 +193,7 @@ int diUnmount(struct inode *ipimap, int mounterror) * free in-memory control structure */ kfree(imap); + JFS_IP(ipimap)->i_imap = NULL; return (0); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 029d470656005..57d7a4300210d 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -883,7 +883,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct component_name dname; u32 ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = d_inode(dentry); + struct inode *ip; s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; diff --git a/fs/libfs.c b/fs/libfs.c index da78eb64831ec..a4eb127578862 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1648,16 +1648,6 @@ bool is_empty_dir_inode(struct inode *inode) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Determine if the name of a dentry should be casefolded. - * - * Return: if names will need casefolding - */ -static bool needs_casefold(const struct inode *dir) -{ - return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; -} - /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against @@ -1678,7 +1668,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, char strbuf[DNAME_INLINE_LEN]; int ret; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently @@ -1720,7 +1710,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct unicode_map *um = sb->s_encoding; int ret = 0; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1d9488cf05348..87a0f207df0b9 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -276,6 +276,9 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, { struct nsm_handle *new; + if (!hostname) + return NULL; + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); if (unlikely(new == NULL)) return NULL; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 22d3ff3818f5f..6579948070a48 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -45,7 +45,6 @@ #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) -#define ALLOWED_SIGS (sigmask(SIGKILL)) static struct svc_program nlmsvc_program; @@ -57,6 +56,12 @@ static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; +static void nlmsvc_request_retry(struct timer_list *tl) +{ + svc_wake_up(nlmsvc_serv); +} +DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); + unsigned int lockd_net_id; /* @@ -111,26 +116,12 @@ static void set_grace_period(struct net *net) schedule_delayed_work(&ln->grace_period_end, grace_period); } -static void restart_grace(void) -{ - if (nlmsvc_ops) { - struct net *net = &init_net; - struct lockd_net *ln = net_generic(net, lockd_net_id); - - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); - nlmsvc_invalidate_all(); - set_grace_period(net); - } -} - /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { - int err = 0; struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -138,9 +129,6 @@ 
lockd(void *vrqstp) /* try_to_freeze() is called from svc_recv() */ set_freezable(); - /* Allow SIGKILL to tell lockd to drop all of its locks */ - allow_signal(SIGKILL); - dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* @@ -148,33 +136,12 @@ lockd(void *vrqstp) * NFS mount or NFS daemon has gone away. */ while (!kthread_should_stop()) { - long timeout = MAX_SCHEDULE_TIMEOUT; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; - if (signalled()) { - flush_signals(current); - restart_grace(); - continue; - } - - timeout = nlmsvc_retry_blocked(); - - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) - continue; - dprintk("lockd: request from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); - - svc_process(rqstp); + nlmsvc_retry_blocked(); + svc_recv(rqstp); } - flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -407,6 +374,7 @@ static void lockd_put(void) #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); + timer_delete_sync(&nlmsvc_retry); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } @@ -538,7 +506,7 @@ static inline int is_callback(u32 proc) } -static int lockd_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c43ccdf28ed91..43aeba9de55cb 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -131,12 +131,14 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) static inline void nlmsvc_remove_block(struct nlm_block *block) { + spin_lock(&nlm_blocked_lock); if (!list_empty(&block->b_list)) { - spin_lock(&nlm_blocked_lock); list_del_init(&block->b_list); spin_unlock(&nlm_blocked_lock); nlmsvc_release_block(block); + return; } + spin_unlock(&nlm_blocked_lock); } /* @@ -152,6 +154,7 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) file, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, lock->fl.fl_type); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", @@ -161,9 +164,11 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } } + spin_unlock(&nlm_blocked_lock); return NULL; } @@ -185,16 +190,19 @@ nlmsvc_find_block(struct nlm_cookie *cookie) { struct nlm_block *block; + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) goto found; } + spin_unlock(&nlm_blocked_lock); return NULL; found: dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } @@ -317,6 +325,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *host, restart: mutex_lock(&file->f_mutex); + spin_lock(&nlm_blocked_lock); list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { if (!match(block->b_host, host)) continue; @@ -325,11 +334,13 @@ void nlmsvc_traverse_blocks(struct nlm_host 
*host, if (list_empty(&block->b_list)) continue; kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); nlmsvc_unlink_block(block); nlmsvc_release_block(block); goto restart; } + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); } @@ -1008,7 +1019,7 @@ retry_deferred_block(struct nlm_block *block) * picks up locks that can be granted, or grant notifications that must * be retransmitted. */ -unsigned long +void nlmsvc_retry_blocked(void) { unsigned long timeout = MAX_SCHEDULE_TIMEOUT; @@ -1038,5 +1049,6 @@ nlmsvc_retry_blocked(void) } spin_unlock(&nlm_blocked_lock); - return timeout; + if (timeout < MAX_SCHEDULE_TIMEOUT) + mod_timer(&nlmsvc_retry, jiffies + timeout); } diff --git a/fs/locks.c b/fs/locks.c index a45efc16945d5..76ad05f8070ad 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1744,13 +1744,6 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri if (is_deleg && !inode_trylock(inode)) return -EAGAIN; - if (is_deleg && arg == F_WRLCK) { - /* Write delegations are not currently supported: */ - inode_unlock(inode); - WARN_ON_ONCE(1); - return -EINVAL; - } - percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b6fc169be1b16..7df2503cef6c3 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,6 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable use of the NFS v4.2 READ_PLUS operation. 
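
With lockd's signal-driven loop gone, retrying blocked locks moves to a timer: nlmsvc_retry_blocked() computes how far away the next retry is and arms nlmsvc_retry, whose callback only wakes the service thread. The pattern in isolation; my_serv stands in for the svc_serv pointer:

    #include <linux/timer.h>
    #include <linux/sched.h>         /* MAX_SCHEDULE_TIMEOUT */
    #include <linux/sunrpc/svc.h>

    static void my_retry_fn(struct timer_list *t)
    {
            svc_wake_up(my_serv);    /* nudge the RPC service thread */
    }
    static DEFINE_TIMER(my_retry_timer, my_retry_fn);

    /* arm when the next deferred item is 'timeout' jiffies away */
    if (timeout < MAX_SCHEDULE_TIMEOUT)
            mod_timer(&my_retry_timer, jiffies + timeout);

    /* tear down after the last service thread exits */
    timer_delete_sync(&my_retry_timer);

The timer_delete_sync() added to lockd_put() is what keeps a late-firing timer from calling svc_wake_up() on a service that is being torn down.
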
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 70f5563a8e81c..65cbb5607a5fc 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -404,7 +404,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->concat.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -433,7 +433,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->stripe.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 456af7d230cf1..466ebf1d41b2b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -74,23 +74,12 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - svc_process(rqstp); - } + while (!kthread_freezable_should_stop(NULL)) + svc_recv(rqstp); svc_exit_thread(rqstp); return 0; @@ -112,11 +101,7 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -387,7 +372,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound */ -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp) { rqstp->rq_auth_stat = rpc_autherr_badcred; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e4c5f193ed5e8..44eca51b28085 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -517,6 +517,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, + .connect_timeout = cl_init->connect_timeout, + .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8f3112e71a6a6..e6a51fd94fea8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1089,6 +1089,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; + /* + * nfs_readdir_handle_cache_misses return force clear at + * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for + * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 + * entries need be emitted here. 
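
In the blocklayout hunks at the top of this stretch, the child-device arrays were allocated with a hardcoded GFP_KERNEL even though every caller passes an explicit gfp_mask; honoring the caller's mask matters when layout parsing runs in a context that must not recurse into the filesystem. The general shape, with hypothetical types:

    #include <linux/slab.h>

    struct child { u64 id; };
    struct parent { struct child *children; };

    static int parse_children(struct parent *d, unsigned int count,
                              gfp_t gfp_mask)
    {
            /* obey the caller's reclaim context, e.g. GFP_NOFS */
            d->children = kcalloc(count, sizeof(*d->children), gfp_mask);
            if (!d->children)
                    return -ENOMEM;
            return 0;
    }
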
+ */ + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { + desc->eob = true; + break; + } + ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { @@ -1107,10 +1118,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { - desc->eob = true; - break; - } } if (array->folio_is_eof) desc->eof = !desc->eob; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aaffaaa336cc5..47d892a1d363d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -472,13 +472,31 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, return result; } +static void nfs_direct_add_page_head(struct list_head *list, + struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) + return; + if (!list_empty(&head->wb_list)) { + nfs_unlock_request(head); + return; + } + list_add(&head->wb_list, list); + kref_get(&head->wb_kref); + kref_get(&head->wb_kref); +} + static void nfs_direct_join_group(struct list_head *list, struct inode *inode) { struct nfs_page *req, *subreq; list_for_each_entry(req, list, wb_list) { - if (req->wb_head != req) + if (req->wb_head != req) { + nfs_direct_add_page_head(&req->wb_list, req); continue; + } subreq = req->wb_this_page; if (subreq == req) continue; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 6603b5cee029c..714975e5c0dbd 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -7,14 +7,16 @@ * Resolves DNS hostnames into valid ip addresses */ -#ifdef CONFIG_NFS_USE_KERNEL_DNS - #include #include #include -#include + #include "dns_resolve.h" +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include + ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, struct sockaddr_storage *ss, size_t salen) { @@ -35,7 +37,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #else -#include #include #include #include @@ -43,15 +44,12 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include #include #include -#include -#include #include #include #include #include #include "nfs4_fs.h" -#include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 79b1b3fcd3fcf..3f9768810427d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -200,7 +200,7 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +nfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); int status; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 913c09806c7f5..9c9cf764f6000 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -82,6 +82,8 @@ struct nfs_client_initdata { const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; /* @@ -493,6 +495,7 @@ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct 
nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 05c3b4b2b3dd8..c190938142960 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index eff3802c5e035..674c012868b1a 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -86,6 +86,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; + unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10; struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -98,6 +99,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .timeparms = &ds_timeout, .cred = mds_srv->cred, .xprtsec = mds_clp->cl_xprtsec, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3b0b650c9c5ab..60f032be805ae 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1991,7 +1991,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0fe5aacbcfdf1..b59876b01a1e3 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ * more? Need to consider not to pre-alloc too much for a compound. 
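
The data-server connect timeout introduced in the nfs3client hunk above is worth unpacking: as with the timeo= mount option, ds_timeo is in tenths of a second, so the window covers the initial transmission plus every retransmit. With example values:

    /* timeo=600 (60s per try), retrans=2: three tries in all */
    unsigned long connect_timeout = 600 * (2 + 1) * HZ / 10;
    /* == 180 * HZ: up to 180 seconds to establish the DS
       connection before the attempt is abandoned */
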
*/ #define PNFS_LAYOUTSTATS_MAXDEV (4) +#define READ_PLUS_SCRATCH_SIZE (16) /* nfs4.2proc.c */ #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 49f78e23b34c0..063e00aff87ed 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -471,8 +471,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, continue; } break; - } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { - args.sync = true; + } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && + args.sync != res.synchronous) { + args.sync = res.synchronous; dst_exception.retry = 1; continue; } else if ((err == -ESTALE || diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 95234208dc9ee..9e3ae53e22058 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -54,10 +54,16 @@ (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ 1 /* data_info4.di_length */) +#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \ + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - NFS42_READ_PLUS_DATA_SEGMENT_SIZE) + NFS42_READ_PLUS_HOLE_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -617,8 +623,8 @@ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, encode_putfh(xdr, args->fh, &hdr); encode_read_plus(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF); encode_nops(&hdr); } @@ -1056,13 +1062,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - return status; + return 0; segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); if (!segs) return -ENOMEM; - status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) @@ -1428,7 +1433,7 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d9114a754db73..27fb25567ce75 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -232,6 +232,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + if (test_bit(NFS_CS_DS, &cl_init->init_flags)) + __set_bit(NFS_CS_DS, &clp->cl_flags); /* * Set up the connection to the server before we add add to the * global list. @@ -1007,6 +1009,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_DS, &cl_init.init_flags); /* * Set an authflavor equual to the MDS value. 
Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d57aaf0cc577f..794343790ea8b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5438,18 +5438,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, return false; } -static inline void nfs4_read_plus_scratch_free(struct nfs_pgio_header *hdr) -{ - if (hdr->res.scratch) { - kfree(hdr->res.scratch); - hdr->res.scratch = NULL; - } -} - static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_read_plus_scratch_free(hdr); - if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5469,8 +5459,7 @@ static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, /* Note: We don't use READ_PLUS with pNFS yet */ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - hdr->res.scratch = kmalloc(32, GFP_KERNEL); - return hdr->res.scratch != NULL; + return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE); } return false; } @@ -8798,6 +8787,8 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, #ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif + if (test_bit(NFS_CS_DS, &clp->cl_flags)) + calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -8875,6 +8866,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre /* Save the EXCHANGE_ID verifier session trunk tests */ memcpy(clp->cl_confirm.data, argp->verifier.data, sizeof(clp->cl_confirm.data)); + if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS) + set_bit(NFS_CS_DS, &clp->cl_flags); out: trace_nfs4_exchange_id(clp, status); rpc_put_task(task); diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index ddbbf4fcda867..178001c90156f 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -154,7 +154,7 @@ nfs4_get_device_info(struct nfs_server *server, set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: - for (i = 0; i < max_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); out_free_pdev: diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a0112ad4937aa..afd23910f3bff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -852,6 +852,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; + unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -870,6 +871,8 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; if (da->da_transport != clp->cl_proto) @@ -943,7 +946,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, * Test this address for session trunking and * add as an alias */ - xprtdata.cred = nfs4_get_clid_cred(clp), + xprtdata.cred = nfs4_get_clid_cred(clp); rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f71eeee67e201..7dc21a48e3e7b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -47,6 +47,8 @@ static struct 
nfs_pgio_header *nfs_readhdr_alloc(void) static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { + if (rhdr->res.scratch != NULL) + kfree(rhdr->res.scratch); kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -108,6 +110,14 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size) +{ + WARN_ON(hdr->res.scratch != NULL); + hdr->res.scratch = kmalloc(size, GFP_KERNEL); + return hdr->res.scratch != NULL; +} +EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch); + static void nfs_readpage_release(struct nfs_page *req, int error) { struct folio *folio = nfs_page_to_folio(req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2284f749d8924..0d6473cb00cb3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1339,15 +1339,13 @@ int nfs_get_tree_common(struct fs_context *fc) void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - dev_t dev = s->s_dev; nfs_sysfs_move_sb_to_server(server); - generic_shutdown_super(s); + kill_anon_super(s); nfs_fscache_release_super_cookie(s); nfs_free_server(server); - free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 8e9c1a0f8d380..1ed2f691ebb90 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -83,6 +83,15 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, int len = sizeof(__be32), ret, i; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + p = xdr_reserve_space(xdr, len + sizeof(__be32)); if (!p) return nfserr_resource; diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4c9b87850ab12..929248c6ca84c 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,7 +19,7 @@ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage * is much larger than a sockaddr_in6. */ -struct svc_cacherep { +struct nfsd_cacherep { struct { /* Keep often-read xid, csum in the same cache line: */ __be32 k_xid; @@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_cache_lookup(struct svc_rqst *rqstp, + struct nfsd_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index e81d2a5cf381e..bb205328e043d 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, int addr_len; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. 
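
Pulling the READ_PLUS pieces together (nfs42.h, nfs4proc.c, read.c above): the XDR scratch buffer now belongs to the pgio header, allocated through nfs_read_alloc_scratch() before the RPC is sent and freed exactly once with the header, rather than in nfs4_read_done() where some completion paths could skip it. A sketch of the ownership rule, using names from the patch:

    /* when opting in to READ_PLUS for this header */
    if (!nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE))
            return false;    /* ENOMEM: fall back to plain READ */

    /* decode side: xdr_set_scratch_buffer() lets xdr_inline_decode()
     * copy items that straddle page boundaries into hdr->res.scratch */

    /* release pairs with header freeing, not with read completion;
     * the NULL check in the hunk above is belt-and-braces, since
     * kfree(NULL) is a no-op */
    kfree(rhdr->res.scratch);
    kmem_cache_free(nfs_rdata_cachep, rhdr);
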
+ */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + /* len + padding for two strings */ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; ver_len = 20; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fc8d5b7db9f81..268ef57751c48 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -307,7 +307,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 518203821790c..96e786b5e5444 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -441,7 +441,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { - int empty; + unsigned char valid; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -457,7 +457,6 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); - state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't know which, so we allocate @@ -500,7 +499,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * and effective cases: when there are no inheritable ACEs, * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT)) return NULL; /* @@ -622,11 +621,12 @@ static void process_one_v4_ace(struct posix_acl_state *state, struct nfs4_ace *ace) { u32 mask = ace->access_mask; + short type = ace2type(ace); int i; - state->empty = 0; + state->valid |= type; - switch (ace2type(ace)) { + switch (type) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->owner, mask); @@ -726,6 +726,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) process_one_v4_ace(&effective_acl_state, ace); } + + /* + * At this point, the default ACL may have zeroed-out entries for owner, + * group and other. That usually results in a non-sensical resulting ACL + * that denies all access except to any ACE that was explicitly added. + * + * The setfacl command solves a similar problem with this logic: + * + * "If a Default ACL entry is created, and the Default ACL contains + * no owner, owning group, or others entry, a copy of the ACL + * owner, owning group, or others entry is added to the Default ACL." + * + * Copy any missing ACEs from the effective set, if any ACEs were + * explicitly set. 
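+ *
+ * Worked example (illustrative only): an NFSv4 ACL whose only
+ * inheritable ACE names a single user leaves default_acl_state.valid
+ * equal to ACL_USER, so without the copy below the resulting default
+ * ACL would deny the owner, owning group, and others outright. With
+ * the copy, those three entries mirror the effective ACL instead,
+ * matching the setfacl behavior quoted above.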
+ */ + if (default_acl_state.valid) { + if (!(default_acl_state.valid & ACL_USER_OBJ)) + default_acl_state.owner = effective_acl_state.owner; + if (!(default_acl_state.valid & ACL_GROUP_OBJ)) + default_acl_state.group = effective_acl_state.group; + if (!(default_acl_state.valid & ACL_OTHER)) + default_acl_state.other = effective_acl_state.other; + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ae670807449b..5ca748309c262 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -297,12 +297,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, } if (d_really_is_positive(child)) { - status = nfs_ok; - /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(fhp); + status = fh_fill_both_attrs(fhp); + if (status != nfs_ok) + goto out; switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: @@ -345,7 +345,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; status = nfsd4_vfs_create(fhp, child, open); if (status != nfs_ok) goto out; @@ -380,6 +382,38 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, return status; } +/** + * set_change_info - set up the change_info4 for a reply + * @cinfo: pointer to nfsd4_change_info to be populated + * @fhp: pointer to svc_fh to use as source + * + * Many operations in NFSv4 require change_info4 in the reply. This function + * populates that from the info that we (should!) have already collected. In + * the event that pre-op or post-op attrs are missing, synthesize values as + * described in the comment below. + */ +static void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr); + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + + /* + * If fetching the pre-change attributes failed, then we should + * have already failed the whole operation. We could have still + * failed to fetch post-change attributes however. + * + * If the pre_saved field isn't set for some reason, throw a + * warning and zero out the before field, since we don't know what + * it should be. If we didn't get post-op attrs, synthesize an + * after value that at least differs from the before value. + */ + if (WARN_ON_ONCE(!fhp->fh_pre_saved)) + cinfo->before_change = 0; + if (!fhp->fh_post_saved) + cinfo->after_change = cinfo->before_change + 1; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -424,11 +458,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru } else { status = nfsd_lookup(rqstp, current_fh, open->op_fname, open->op_fnamelen, *resfh); - if (!status) + if (status == nfs_ok) /* NFSv4 protocol requires change attributes even though * no change happened.
*/ - fh_fill_both_attrs(current_fh); + status = fh_fill_both_attrs(current_fh); } if (status) goto out; @@ -1313,12 +1347,11 @@ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, /* found a match */ if (ni->nsui_busy) { /* wait - and try again */ - prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE); spin_unlock(&nn->nfsd_ssc_lock); /* allow 20secs for mount/unmount for now - revisit */ - if (signal_pending(current) || + if (kthread_should_stop() || (schedule_timeout(20*HZ) == 0)) { finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf305daa7516..8534693eb6a49 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -649,6 +649,18 @@ find_readable_file(struct nfs4_file *f) return ret; } +static struct nfsd_file * +find_rw_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); + spin_unlock(&f->fi_lock); + + return ret; +} + struct nfsd_file * find_any_file(struct nfs4_file *f) { @@ -1144,7 +1156,7 @@ static void block_delegations(struct knfsd_fh *fh) static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, - struct nfs4_clnt_odstate *odstate) + struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; long n; @@ -1170,7 +1182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); - dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -5449,8 +5461,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; - struct nfsd_file *nf; + struct nfsd_file *nf = NULL; struct file_lock *fl; + u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks @@ -5460,15 +5473,35 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - nf = find_readable_file(fp); - if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* + * Try for a write delegation first. RFC 8881 section 10.4 says: + * + * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, + * on its own, all opens." + * + * Furthermore, the client can use a write delegation for most READ + * operations as well, so we require an O_RDWR file here. + * + * Offer a write delegation in the case of a BOTH open, and ensure + * we get the O_RDWR descriptor. + */ + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { + nf = find_rw_file(fp); + dl_type = NFS4_OPEN_DELEGATE_WRITE; + } + + /* + * If the file is being opened O_RDONLY or we couldn't get an O_RDWR + * file for some reason, then try for a read delegation instead.
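+ *
+ * Summary of the resulting policy (a sketch derived from the code
+ * below, not normative):
+ *
+ *	share_access BOTH:	O_RDWR file -> OPEN_DELEGATE_WRITE,
+ *				falling back to a readable file and
+ *				OPEN_DELEGATE_READ when no O_RDWR
+ *				file is available
+ *	share_access READ:	readable file -> OPEN_DELEGATE_READ
+ *	share_access WRITE:	no delegation is offered (-EAGAIN)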
+ */ + if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { + nf = find_readable_file(fp); + dl_type = NFS4_OPEN_DELEGATE_READ; + } + + if (!nf) + return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -5491,11 +5524,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(status); status = -ENOMEM; - dp = alloc_init_deleg(clp, fp, odstate); + dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(dp, dl_type); if (!fl) goto out_clnt_odstate; @@ -5568,10 +5601,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) } /* - * Attempt to hand out a delegation. + * The Linux NFS server does not offer write delegations to NFSv4.0 + * clients in order to avoid conflicts between write delegations and + * GETATTRs requesting CHANGE or SIZE attributes. + * + * With NFSv4.1 and later minorversions, the SEQUENCE operation that + * begins each COMPOUND contains a client ID. Delegation recall can + * be avoided when the server recognizes the client sending a + * GETATTR also holds write delegation it conflicts with. + * + * However, the NFSv4.0 protocol does not enable a server to + * determine that a GETATTR originated from the client holding the + * conflicting delegation versus coming from some other client. Per + * RFC 7530 Section 16.7.5, the server must recall or send a + * CB_GETATTR even when the GETATTR originates from the client that + * holds the conflicting delegation. * - * Note we don't support write delegations, and won't until the vfs has - * proper support for them. + * An NFSv4.0 client can trigger a pathological situation if it + * always sends a DELEGRETURN preceded by a conflicting GETATTR in + * the same COMPOUND. COMPOUND execution will always stop at the + * GETATTR and the DELEGRETURN will never get executed. The server + * eventually revokes the delegation, which can result in loss of + * open or lock state. 
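+ *
+ * A sketch of that pathological exchange (illustrative, not
+ * wire-accurate):
+ *
+ *	COMPOUND { PUTFH; GETATTR(change, size); DELEGRETURN }
+ *
+ * The GETATTR triggers a recall of the very delegation that the
+ * DELEGRETURN in the same COMPOUND would return, so execution stalls
+ * at the GETATTR and the DELEGRETURN never runs.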
*/ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, @@ -5590,8 +5641,6 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = 1; - if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) - goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; @@ -5606,6 +5655,9 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && + !clp->cl_minorversion) + goto out_no_deleg; break; default: goto out_no_deleg; @@ -5616,8 +5668,13 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; + trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + } else { + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: @@ -8341,3 +8398,68 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, { get_stateid(cstate, &u->write.wr_stateid); } + +/** + * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict + * @rqstp: RPC transaction context + * @inode: file to be checked for a conflict + * + * This function is called when there is a conflict between a write + * delegation and a change/size GETATTR from another client. The server + * must either use CB_GETATTR to get the current values of the + * attributes from the client that holds the delegation or recall the + * delegation before replying to the GETATTR. See RFC 8881 section + * 18.7.4. + * + * The current implementation does not support CB_GETATTR yet; that + * mechanism, which would avoid recalling the delegation, can be added + * in follow-up work. + * + * Returns 0 if there is no conflict; otherwise an nfs_stat + * code is returned.
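+ *
+ * The call site added later in this series lives in
+ * nfsd4_encode_fattr(), roughly:
+ *
+ *	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ *		status = nfsd4_deleg_getattr_conflict(rqstp,
+ *						      d_inode(dentry));
+ *		if (status)
+ *			goto out;
+ *	}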
+ */ +__be32 +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +{ + __be32 status; + struct file_lock_context *ctx; + struct file_lock *fl; + struct nfs4_delegation *dp; + + ctx = locks_inode_context(inode); + if (!ctx) + return 0; + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_flags == FL_LAYOUT) + continue; + if (fl->fl_lmops != &nfsd_lease_mng_ops) { + /* + * non-nfs lease, if it's a lease with F_RDLCK then + * we are done; there isn't any write delegation + * on this inode + */ + if (fl->fl_type == F_RDLCK) + break; + goto break_lease; + } + if (fl->fl_type == F_WRLCK) { + dp = fl->fl_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + spin_unlock(&ctx->flc_lock); + return 0; + } +break_lease: + spin_unlock(&ctx->flc_lock); + nfsd_stats_wdeleg_getattr_inc(); + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; + } + break; + } + spin_unlock(&ctx->flc_lock); + return 0; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b30dca7de8cc0..2e40c74d2f727 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2984,6 +2984,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, if (status) goto out; } + if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + if (status) + goto out; + } err = vfs_getattr(&path, &stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, @@ -3973,17 +3978,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 32); + + p = xdr_reserve_space(xdr, XDR_UNIT * 8); if (!p) return nfserr_resource; *p++ = cpu_to_be32(open->op_recall); /* + * Always flush on close + * * TODO: space_limit's in delegations */ *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = cpu_to_be32(~(u32)0); - *p++ = cpu_to_be32(~(u32)0); + *p++ = xdr_zero; + *p++ = xdr_zero; /* * TODO: ACE's in delegations @@ -4678,20 +4686,17 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(gdev->gd_layout_type); - /* If maxcount is 0 then just update notifications */ - if (gdev->gd_maxcount != 0) { - ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { - /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. - */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) - goto toosmall; - return nfserr; - } + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. 
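+ *
+ * Note that the gd_maxcount == 0 case no longer short-circuits here:
+ * per paragraph 5 of RFC 8881 S18.40.3, the block and flexfiles
+ * encoders added earlier in this series emit the empty da_addr_body
+ * themselves.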
+ */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + return nfserr; } if (gdev->gd_notify_types) { diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index a8eda1c85829e..80621a7095107 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static struct svc_cacherep * -nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, - struct nfsd_net *nn) +static struct nfsd_cacherep * +nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); if (rp) { @@ -110,36 +110,64 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, return rp; } -static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, - struct nfsd_net *nn) +static void nfsd_cacherep_free(struct nfsd_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); + if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); + kmem_cache_free(drc_slab, rp); +} + +static unsigned long +nfsd_cacherep_dispose(struct list_head *dispose) +{ + struct nfsd_cacherep *rp; + unsigned long freed = 0; + + while (!list_empty(dispose)) { + rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru); + list_del(&rp->c_lru); + nfsd_cacherep_free(rp); + freed++; } + return freed; +} + +static void +nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + struct nfsd_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } - kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, + struct nfsd_net *nn) +{ + nfsd_cacherep_unlink_locked(nn, b, rp); + nfsd_cacherep_free(rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, struct nfsd_net *nn) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(b, rp, nn); + nfsd_cacherep_unlink_locked(nn, b, rp); spin_unlock(&b->cache_lock); + nfsd_cacherep_free(rp); } int nfsd_drc_slab_create(void) { drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct svc_cacherep), 0, 0, NULL); + sizeof(struct nfsd_cacherep), 0, 0, NULL); return drc_slab ? 0: -ENOMEM; } @@ -208,7 +236,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) void nfsd_reply_cache_shutdown(struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned int i; unregister_shrinker(&nn->nfsd_reply_cache_shrinker); @@ -216,7 +244,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; while (!list_empty(head)) { - rp = list_first_entry(head, struct svc_cacherep, c_lru); + rp = list_first_entry(head, struct nfsd_cacherep, c_lru); nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], rp, nn); } @@ -233,7 +261,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) * not already scheduled. 
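+ *
+ * Because entries are only ever (re-)inserted at the tail, each
+ * bucket's LRU list stays ordered oldest-first, which is what lets
+ * nfsd_prune_bucket_locked() stop scanning at the first entry that
+ * has not yet expired.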
*/ static void -lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); @@ -247,12 +275,21 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) return &nn->drc_hashtbl[hash]; } -static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, - unsigned int max) +/* + * Remove no more than @max expired entries from bucket @b and move + * them to @dispose. If @max is zero, do not limit the number of + * removed entries. + */ +static void +nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + unsigned int max, struct list_head *dispose) { - struct svc_cacherep *rp, *tmp; - long freed = 0; + unsigned long expiry = jiffies - RC_EXPIRE; + struct nfsd_cacherep *rp, *tmp; + unsigned int freed = 0; + + lockdep_assert_held(&b->cache_lock); + /* The bucket LRU is ordered oldest-first. */ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { /* * Don't free entries attached to calls that are still @@ -260,60 +297,77 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, */ if (rp->c_state == RC_INPROG) continue; + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && - time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) + time_before(expiry, rp->c_timestamp)) break; - nfsd_reply_cache_free_locked(b, rp, nn); - if (max && freed++ > max) + + nfsd_cacherep_unlink_locked(nn, b, rp); + list_add(&rp->c_lru, dispose); + + if (max && ++freed > max) break; } - return freed; } -static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn) +/** + * nfsd_reply_cache_count - count_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Returns the total number of entries in the duplicate reply cache. To + * keep things simple and quick, this is not the number of expired entries + * in the cache (ie, the number that would be removed by a call to + * nfsd_reply_cache_scan). + */ +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - return prune_bucket(b, nn, 3); + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + + return atomic_read(&nn->num_drc_entries); } -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. +/** + * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Free expired entries on each bucket's LRU list until we've released + * nr_to_scan objects. Nothing will be released if the cache + * has not exceeded its max_drc_entries limit. + * + * Returns the number of entries released by this call.
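+ *
+ * Both methods hang off the per-net shrinker registered by
+ * nfsd_reply_cache_init(), roughly (a sketch; the field lives in
+ * struct nfsd_net):
+ *
+ *	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
+ *	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
+ *	register_shrinker(&nn->nfsd_reply_cache_shrinker, ...);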
*/ -static long -prune_cache_entries(struct nfsd_net *nn) +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + unsigned long freed = 0; + LIST_HEAD(dispose); unsigned int i; - long freed = 0; for (i = 0; i < nn->drc_hashsize; i++) { struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; if (list_empty(&b->lru_head)) continue; + spin_lock(&b->cache_lock); - freed += prune_bucket(b, nn, 0); + nfsd_prune_bucket_locked(nn, b, 0, &dispose); spin_unlock(&b->cache_lock); - } - return freed; -} -static unsigned long -nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + freed += nfsd_cacherep_dispose(&dispose); + if (freed > sc->nr_to_scan) + break; + } - return atomic_read(&nn->num_drc_entries); + trace_nfsd_drc_gc(nn, freed); + return freed; } -static unsigned long -nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); - - return prune_cache_entries(nn); -} /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes */ @@ -348,8 +402,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static int -nfsd_cache_key_cmp(const struct svc_cacherep *key, - const struct svc_cacherep *rp, struct nfsd_net *nn) +nfsd_cache_key_cmp(const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp, struct nfsd_net *nn) { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { @@ -365,11 +419,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, * Must be called with cache_lock held. Returns the found entry or * inserts an empty key on failure. */ -static struct svc_cacherep * -nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, +static struct nfsd_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, struct nfsd_net *nn) { - struct svc_cacherep *rp, *ret = key; + struct nfsd_cacherep *rp, *ret = key; struct rb_node **p = &b->rb_head.rb_node, *parent = NULL; unsigned int entries = 0; @@ -378,7 +432,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, while (*p != NULL) { ++entries; parent = *p; - rp = rb_entry(parent, struct svc_cacherep, c_node); + rp = rb_entry(parent, struct nfsd_cacherep, c_node); cmp = nfsd_cache_key_cmp(key, rp, nn); if (cmp < 0) @@ -411,6 +465,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none * is found, we try to grab the oldest expired entry off the LRU list. 
If @@ -423,16 +478,17 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp) +int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; - struct svc_cacherep *rp, *found; + struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; + unsigned long freed; + LIST_HEAD(dispose); int rtn = RC_DOIT; - rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsd_stats_rc_nocache_inc(); goto out; @@ -445,7 +501,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) * preallocate an entry. */ nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - rp = nfsd_reply_cache_alloc(rqstp, csum, nn); + rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -454,20 +510,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found = nfsd_cache_insert(b, rp, nn); if (found != rp) goto found_entry; - - nfsd_stats_rc_misses_inc(); - rqstp->rq_cacherep = rp; + *cacherep = rp; rp->c_state = RC_INPROG; + nfsd_prune_bucket_locked(nn, b, 3, &dispose); + spin_unlock(&b->cache_lock); + freed = nfsd_cacherep_dispose(&dispose); + trace_nfsd_drc_gc(nn, freed); + + nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); - - nfsd_prune_bucket(b, nn); - -out_unlock: - spin_unlock(&b->cache_lock); -out: - return rtn; + goto out; found_entry: /* We found a matching entry which is either in progress or done. */ @@ -505,12 +559,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) out_trace: trace_nfsd_drc_found(nn, rqstp, rtn); - goto out_unlock; +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; } /** * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply + * @rp: IN: DRC entry for this request * @cachetype: which cache to update * @statp: pointer to Reply's NFS status code, or NULL * @@ -528,10 +586,10 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. 
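+ *
+ * Sketch of the new lookup/update pairing, as wired up in
+ * nfsd_dispatch() later in this series:
+ *
+ *	struct nfsd_cacherep *rp = NULL;
+ *
+ *	switch (nfsd_cache_lookup(rqstp, &rp)) {
+ *	case RC_DOIT:
+ *		(process the request and encode the reply)
+ *		nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype,
+ *				  statp + 1);
+ *	}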
*/ -void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; struct nfsd_drc_bucket *b; int len; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3709830f90a63..7ed02fb88a362 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1627,6 +1627,7 @@ static void __exit exit_nfsd(void) } MODULE_AUTHOR("Olaf Kirch "); +MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d88498f8b275c..11c14faa6c67b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -96,7 +96,12 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_put(struct net *net); +static inline void nfsd_put(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_put(nn->nfsd_serv); +} bool i_am_nfsd(void); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c291389a1d71d..355bf0db3235b 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -614,7 +614,7 @@ fh_update(struct svc_fh *fhp) * @fhp: file handle to be updated * */ -void fh_fill_pre_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode; @@ -622,12 +622,12 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; + return nfs_ok; inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) - return; + return err; if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -636,6 +636,7 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; + return nfs_ok; } /** @@ -643,26 +644,27 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) * @fhp: file handle to be updated * */ -void fh_fill_post_attrs(struct svc_fh *fhp) +__be32 fh_fill_post_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; if (fhp->fh_no_wcc) - return; + return nfs_ok; if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); if (err) - return; + return err; fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); + return nfs_ok; } /** @@ -672,16 +674,20 @@ void fh_fill_post_attrs(struct svc_fh *fhp) * This is used when the directory wasn't changed, but wcc attributes * are needed anyway. 
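+ *
+ * Like fh_fill_pre_attrs() and fh_fill_post_attrs(), this helper now
+ * returns an nfsstat, and callers are expected to fail the operation
+ * when it reports an error, e.g. (sketch):
+ *
+ *	status = fh_fill_both_attrs(fhp);
+ *	if (status != nfs_ok)
+ *		goto out;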
*/ -void fh_fill_both_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp) { - fh_fill_post_attrs(fhp); - if (!fhp->fh_post_saved) - return; + __be32 err; + + err = fh_fill_post_attrs(fhp); + if (err) + return err; + fhp->fh_pre_change = fhp->fh_post_change; fhp->fh_pre_mtime = fhp->fh_post_attr.mtime; fhp->fh_pre_ctime = fhp->fh_post_attr.ctime; fhp->fh_pre_size = fhp->fh_post_attr.size; fhp->fh_pre_saved = true; + return nfs_ok; } /* diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4e0ecf0ae2cf2..40426f899e760 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -294,7 +294,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) } u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); -extern void fh_fill_pre_attrs(struct svc_fh *fhp); -extern void fh_fill_post_attrs(struct svc_fh *fhp); -extern void fh_fill_both_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); +__be32 fh_fill_post_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); #endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2154fa63c5f2c..1582af33e204a 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -542,9 +542,14 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +static void nfsd_last_thread(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { @@ -554,6 +559,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) #endif } + svc_xprt_destroy_all(serv, net); + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -644,7 +651,8 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_put(net); + nfsd_last_thread(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); } @@ -674,9 +682,6 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - /* NOT nfsd_put() as notifiers (see below) haven't - * been set up yet. - */ svc_put(serv); return error; } @@ -719,29 +724,6 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -/* This is the callback for kref_put() below. - * There is no code here as the first thing to be done is - * call svc_shutdown_net(), but we cannot get the 'net' from - * the kref. So do all the work when kref_put returns true. 
- */ -static void nfsd_noop(struct kref *ref) -{ -} - -void nfsd_put(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { - svc_xprt_destroy_all(nn->nfsd_serv, net); - nfsd_last_thread(nn->nfsd_serv, net); - svc_destroy(&nn->nfsd_serv->sv_refcnt); - spin_lock(&nfsd_notifier_lock); - nn->nfsd_serv = NULL; - spin_unlock(&nfsd_notifier_lock); - } -} - int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; @@ -792,7 +774,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - nfsd_put(net); + svc_put(nn->nfsd_serv); return err; } @@ -807,6 +789,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) int error; bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -826,22 +809,25 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out; nfsd_up_before = nn->nfsd_net_up; + serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = svc_set_num_threads(serv, NULL, nrservs); if (error) goto out_shutdown; - error = nn->nfsd_serv->sv_nrthreads; + error = serv->sv_nrthreads; + if (error == 0) + nfsd_last_thread(net); out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); out_put: /* Threads now hold service active */ if (xchg(&nn->keep_active, 0)) - nfsd_put(net); - nfsd_put(net); + svc_put(serv); + svc_put(serv); out: mutex_unlock(&nfsd_mutex); return error; @@ -953,7 +939,6 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int err; /* At this point, the thread shares current->fs * with the init process. We need to create files with the @@ -965,15 +950,6 @@ nfsd(void *vrqstp) current->fs->umask = 0; - /* - * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that will bring down the thread - */ - allow_signal(SIGKILL); - allow_signal(SIGHUP); - allow_signal(SIGINT); - allow_signal(SIGQUIT); - atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -981,54 +957,19 @@ nfsd(void *vrqstp) /* * The main request loop */ - for (;;) { + while (!kthread_should_stop()) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) - ; - if (err == -EINTR) - break; - validate_process_creds(); - svc_process(rqstp); + svc_recv(rqstp); validate_process_creds(); } - /* Clear signals before calling svc_exit_thread() */ - flush_signals(current); - atomic_dec(&nfsdstats.th_cnt); out: - /* Take an extra ref so that the svc_put in svc_exit_thread() - * doesn't call svc_destroy() - */ - svc_get(nn->nfsd_serv); - /* Release the thread */ svc_exit_thread(rqstp); - - /* We need to drop a ref, but may not drop the last reference - * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that - * could deadlock with nfsd_shutdown_threads() waiting for us. 
- * So three options are: - * - drop a non-final reference, - * - get the mutex without waiting - * - sleep briefly andd try the above again - */ - while (!svc_put_not_last(nn->nfsd_serv)) { - if (mutex_trylock(&nfsd_mutex)) { - nfsd_put(net); - mutex_unlock(&nfsd_mutex); - break; - } - msleep(20); - } - return 0; } @@ -1046,6 +987,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; + struct nfsd_cacherep *rp; /* * Give the xdr decoder a chance to change this if it wants @@ -1056,7 +998,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; - switch (nfsd_cache_lookup(rqstp)) { + rp = NULL; + switch (nfsd_cache_lookup(rqstp, &rp)) { case RC_DOIT: break; case RC_REPLY: @@ -1072,7 +1015,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); out_cached_reply: return 1; @@ -1082,13 +1025,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp) return 1; out_update_drop: - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d49d3060ed4f7..cbddcf484dbac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); return clp->cl_state == NFSD4_EXPIRABLE; } + +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, + struct inode *inode); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 777e24e5da33b..63797635e1c32 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -65,6 +65,8 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, " %lld", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); } + seq_printf(seq, "\nwdeleg_getattr %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 9b43dc3d99913..cf5524e7ca062 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -22,6 +22,7 @@ enum { NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, #define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ #endif NFSD_STATS_COUNTERS_NUM }; @@ -93,4 +94,10 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); } +#ifdef CONFIG_NFSD_V4 +static inline void nfsd_stats_wdeleg_getattr_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); +} +#endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 2af74983f1461..8039043488718 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -607,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_write); 
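+/*
+ * deleg_write fires when nfs4_open_delegation() hands out an
+ * OPEN_DELEGATE_WRITE stateid; see the trace_nfsd_deleg_write()
+ * call added in fs/nfsd/nfs4state.c.
+ */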
DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); @@ -1240,8 +1241,8 @@ TRACE_EVENT(nfsd_drc_found, TRACE_EVENT(nfsd_drc_mismatch, TP_PROTO( const struct nfsd_net *nn, - const struct svc_cacherep *key, - const struct svc_cacherep *rp + const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp ), TP_ARGS(nn, key, rp), TP_STRUCT__entry( @@ -1261,6 +1262,28 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); +TRACE_EVENT_CONDITION(nfsd_drc_gc, + TP_PROTO( + const struct nfsd_net *nn, + unsigned long freed + ), + TP_ARGS(nn, freed), + TP_CONDITION(freed > 0), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, freed) + __field(int, total) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->freed = freed; + __entry->total = atomic_read(&nn->num_drc_entries); + ), + TP_printk("boot_time=%16llx total=%d freed=%lu", + __entry->boot_time, __entry->total, __entry->freed + ) +); + TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9b7acba382fe1..48260cf68fde8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1540,7 +1540,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dput(dchild); if (err) goto out_unlock; - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: @@ -1635,13 +1637,16 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_unlock(dentry->d_inode); goto out_drop_write; } - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(dentry->d_inode); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1703,7 +1708,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - fh_fill_pre_attrs(ffhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_dput; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); @@ -1789,8 +1796,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, } trap = lock_rename(tdentry, fdentry); - fh_fill_pre_attrs(ffhp); - fh_fill_pre_attrs(tfhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + err = fh_fill_pre_attrs(tfhp); + if (err != nfs_ok) + goto out_unlock; odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1857,6 +1868,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, fh_fill_post_attrs(ffhp); fh_fill_post_attrs(tfhp); } +out_unlock: unlock_rename(tdentry, fdentry); fh_drop_write(ffhp); @@ -1916,12 +1928,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, goto out_unlock; } rinode = d_inode(rdentry); - ihold(rinode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { int retries; @@ -2341,16 +2355,18 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - 
fh_fill_pre_attrs(fhp); - + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); - + err = nfsd_xattr_errno(ret); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - return nfsd_xattr_errno(ret); + return err; } __be32 @@ -2368,15 +2384,17 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, if (ret) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - - ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, - len, flags, NULL); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); + err = nfsd_xattr_errno(ret); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - - return nfsd_xattr_errno(ret); + return err; } #endif diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 510978e602da6..9d918a79dc166 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -774,17 +774,6 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op); #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); - - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; -} - - bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e8e7d47265aa9..ce215565d061e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -908,9 +908,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -1684,9 +1684,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 05d4414d0c33f..9b76ee66aeb2f 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -738,18 +738,11 @@ static int user_plock(struct ocfs2_cluster_connection *conn, * * Internally, fs/dlm will pass these to a misc device, which * a userspace daemon will read and write to. - * - * For now, cancel requests (which happen internally only), - * are turned into unlocks. Most of this function taken from - * gfs2_lock. 
*/ - if (cmd == F_CANCELLK) { - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index cc6384f796759..4a4b2b03ff33d 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,12 +214,10 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; - conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; - memcpy(conn->mechToken, value, vlen); - conn->mechToken[vlen] = '\0'; return 0; } diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 5e5e120edcc22..229a6527870d0 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -355,6 +355,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (blob_len < (u64)sess_key_off + sess_key_len) return -EINVAL; + if (sess_key_len > CIFS_KEY_SIZE) + return -EINVAL; + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); if (!ctx_arc4) return -ENOMEM; @@ -1029,11 +1032,15 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, { struct scatterlist *sg; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; - int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0; + int i, *nr_entries, total_entries = 0, sg_idx = 0; if (!nvec) return NULL; + nr_entries = kcalloc(nvec, sizeof(int), GFP_KERNEL); + if (!nr_entries) + return NULL; + for (i = 0; i < nvec - 1; i++) { unsigned long kaddr = (unsigned long)iov[i + 1].iov_base; @@ -1051,8 +1058,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, total_entries += 2; sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + if (!sg) { + kfree(nr_entries); return NULL; + } sg_init_table(sg, total_entries); smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len); @@ -1086,6 +1095,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, } } smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE); + kfree(nr_entries); return sg; } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 2a717d158f02e..0d990c2f33cda 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -123,28 +123,22 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) } } -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - int ret = 1; if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) - return 0; + return; - if (!work->multiRsp) - atomic_dec(&conn->req_running); - if (!work->multiRsp) { - spin_lock(&conn->request_lock); - list_del_init(&work->request_entry); - spin_unlock(&conn->request_lock); - if (work->asynchronous) - release_async_work(work); - ret = 0; - } + atomic_dec(&conn->req_running); + spin_lock(&conn->request_lock); + list_del_init(&work->request_entry); + spin_unlock(&conn->request_lock); + if (work->asynchronous) + release_async_work(work); wake_up_all(&conn->req_running_q); - return ret; } void ksmbd_conn_lock(struct ksmbd_conn *conn) @@ -193,41 +187,22 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn 
*conn, u64 sess_id) int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - size_t len = 0; int sent; - struct kvec iov[3]; - int iov_idx = 0; if (!work->response_buf) { pr_err("NULL response header\n"); return -EINVAL; } - if (work->tr_buf) { - iov[iov_idx] = (struct kvec) { work->tr_buf, - sizeof(struct smb2_transform_hdr) + 4 }; - len += iov[iov_idx++].iov_len; - } - - if (work->aux_payload_sz) { - iov[iov_idx] = (struct kvec) { work->response_buf, work->resp_hdr_sz }; - len += iov[iov_idx++].iov_len; - iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz }; - len += iov[iov_idx++].iov_len; - } else { - if (work->tr_buf) - iov[iov_idx].iov_len = work->resp_hdr_sz; - else - iov[iov_idx].iov_len = get_rfc1002_len(work->response_buf) + 4; - iov[iov_idx].iov_base = work->response_buf; - len += iov[iov_idx++].iov_len; - } + if (work->send_no_response) + return 0; ksmbd_conn_lock(conn); - sent = conn->transport->ops->writev(conn->transport, &iov[0], - iov_idx, len, - work->need_invalidate_rkey, - work->remote_key); + sent = conn->transport->ops->writev(conn->transport, work->iov, + work->iov_cnt, + get_rfc1002_len(work->iov[0].iov_base) + 4, + work->need_invalidate_rkey, + work->remote_key); ksmbd_conn_unlock(conn); if (sent < 0) { diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index ad8dfaa48ffb3..ab2583f030ceb 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -158,7 +158,7 @@ int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); int ksmbd_conn_handler_loop(void *p); int ksmbd_conn_transport_init(void); diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 14b9caebf7a4f..51def3ca74c01 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -27,18 +27,35 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void) INIT_LIST_HEAD(&work->async_request_entry); INIT_LIST_HEAD(&work->fp_entry); INIT_LIST_HEAD(&work->interim_entry); + INIT_LIST_HEAD(&work->aux_read_list); + work->iov_alloc_cnt = 4; + work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec), + GFP_KERNEL); + if (!work->iov) { + kmem_cache_free(work_cache, work); + work = NULL; + } } return work; } void ksmbd_free_work_struct(struct ksmbd_work *work) { + struct aux_read *ar, *tmp; + WARN_ON(work->saved_cred != NULL); kvfree(work->response_buf); - kvfree(work->aux_payload_buf); + + list_for_each_entry_safe(ar, tmp, &work->aux_read_list, entry) { + kvfree(ar->buf); + list_del(&ar->entry); + kfree(ar); + } + kfree(work->tr_buf); kvfree(work->request_buf); + kfree(work->iov); if (work->async_id) ksmbd_release_id(&work->conn->async_ida, work->async_id); kmem_cache_free(work_cache, work); @@ -77,3 +94,77 @@ bool ksmbd_queue_work(struct ksmbd_work *work) { return queue_work(ksmbd_wq, &work->work); } + +static int ksmbd_realloc_iov_pin(struct ksmbd_work *work, void *ib, + unsigned int ib_len) +{ + + if (work->iov_alloc_cnt <= work->iov_cnt) { + struct kvec *new; + + work->iov_alloc_cnt += 4; + new = krealloc(work->iov, + sizeof(struct kvec) * work->iov_alloc_cnt, + GFP_KERNEL | __GFP_ZERO); + if (!new) + return -ENOMEM; + work->iov = new; + } + + work->iov[++work->iov_idx].iov_base = ib; + 
work->iov[work->iov_idx].iov_len = ib_len; + work->iov_cnt++; + + return 0; +} + +static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + /* Plus rfc_length size on first iov */ + if (!work->iov_idx) { + work->iov[work->iov_idx].iov_base = work->response_buf; + *(__be32 *)work->iov[0].iov_base = 0; + work->iov[work->iov_idx].iov_len = 4; + work->iov_cnt++; + } + + ksmbd_realloc_iov_pin(work, ib, len); + inc_rfc1001_len(work->iov[0].iov_base, len); + + if (aux_size) { + struct aux_read *ar; + + ksmbd_realloc_iov_pin(work, aux_buf, aux_size); + inc_rfc1001_len(work->iov[0].iov_base, aux_size); + + ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL); + if (!ar) + return -ENOMEM; + + ar->buf = aux_buf; + list_add(&ar->entry, &work->aux_read_list); + } + + return 0; +} + +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, NULL, 0); +} + +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, aux_buf, aux_size); +} + +int allocate_interim_rsp_buf(struct ksmbd_work *work) +{ + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); + if (!work->response_buf) + return -ENOMEM; + work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; + return 0; +} diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h index f8ae6144c0aea..8ca2c813246e6 100644 --- a/fs/smb/server/ksmbd_work.h +++ b/fs/smb/server/ksmbd_work.h @@ -19,6 +19,11 @@ enum { KSMBD_WORK_CLOSED, }; +struct aux_read { + void *buf; + struct list_head entry; +}; + /* one of these for every pending CIFS request at the connection */ struct ksmbd_work { /* Server corresponding to this mid */ @@ -31,13 +36,19 @@ struct ksmbd_work { /* Response buffer */ void *response_buf; - /* Read data buffer */ - void *aux_payload_buf; + struct list_head aux_read_list; + + struct kvec *iov; + int iov_alloc_cnt; + int iov_cnt; + int iov_idx; /* Next cmd hdr in compound req buf*/ int next_smb2_rcv_hdr_off; /* Next cmd hdr in compound rsp buf*/ int next_smb2_rsp_hdr_off; + /* Current cmd hdr in compound rsp buf*/ + int curr_smb2_rsp_hdr_off; /* * Current Local FID assigned compound response if SMB2 CREATE @@ -53,16 +64,11 @@ struct ksmbd_work { unsigned int credits_granted; /* response smb header size */ - unsigned int resp_hdr_sz; unsigned int response_sz; - /* Read data count */ - unsigned int aux_payload_sz; void *tr_buf; unsigned char state; - /* Multiple responses for one request e.g. SMB ECHO */ - bool multiRsp:1; /* No response for cancelled request */ bool send_no_response:1; /* Request is encrypted */ @@ -95,6 +101,15 @@ static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work) return work->response_buf + work->next_smb2_rsp_hdr_off + 4; } +/** + * ksmbd_resp_buf_curr - Get current buffer on compound response. + * @work: smb work containing response buffer + */ +static inline void *ksmbd_resp_buf_curr(struct ksmbd_work *work) +{ + return work->response_buf + work->curr_smb2_rsp_hdr_off + 4; +} + /** * ksmbd_req_buf_next - Get next buffer on compound request. 
* @work: smb work containing response buffer @@ -113,5 +128,8 @@ int ksmbd_work_pool_init(void); int ksmbd_workqueue_init(void); void ksmbd_workqueue_destroy(void); bool ksmbd_queue_work(struct ksmbd_work *work); - +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size); +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len); +int allocate_interim_rsp_buf(struct ksmbd_work *work); #endif /* __KSMBD_WORK_H__ */ diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 3fd3382939421..5f591751b9236 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -34,29 +34,22 @@ struct ksmbd_share_config { #define KSMBD_SHARE_INVALID_UID ((__u16)-1) #define KSMBD_SHARE_INVALID_GID ((__u16)-1) -static inline int share_config_create_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_create_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_create_mode) { - if (!posix_mode) - return share->create_mask; - else - return posix_mode & share->create_mask; - } - return share->force_create_mode & share->create_mask; + umode_t mode = (posix_mode ?: (umode_t)-1) & share->create_mask; + + return mode | share->force_create_mode; } -static inline int share_config_directory_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_directory_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_directory_mode) { - if (!posix_mode) - return share->directory_mask; - else - return posix_mode & share->directory_mask; - } + umode_t mode = (posix_mode ?: (umode_t)-1) & share->directory_mask; - return share->force_directory_mode & share->directory_mask; + return mode | share->force_directory_mode; } static inline int test_share_config_flag(struct ksmbd_share_config *share, diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 844b303baf293..9bc0103720f57 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -616,15 +616,6 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level) return 0; } -static inline int allocate_oplock_break_buf(struct ksmbd_work *work) -{ - work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); - if (!work->response_buf) - return -ENOMEM; - work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - return 0; -} - /** * __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn * to client @@ -639,7 +630,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) { struct smb2_oplock_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); - struct ksmbd_conn *conn = work->conn; struct oplock_break_info *br_info = work->request_buf; struct smb2_hdr *rsp_hdr; struct ksmbd_file *fp; @@ -648,7 +638,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) if (!fp) goto out; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { pr_err("smb2_allocate_rsp_buf failed! 
"); ksmbd_fd_put(work, fp); goto out; @@ -656,8 +646,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -684,13 +672,15 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp->PersistentFid = fp->persistent_id; rsp->VolatileFid = fp->volatile_id; - inc_rfc1001_len(work->response_buf, 24); + ksmbd_fd_put(work, fp); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_oplock_break))) + goto out; ksmbd_debug(OPLOCK, "sending oplock break v_id %llu p_id = %llu lock level = %d\n", rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel); - ksmbd_fd_put(work, fp); ksmbd_conn_write(work); out: @@ -751,18 +741,15 @@ static void __smb2_lease_break_noti(struct work_struct *wk) struct smb2_lease_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); struct lease_break_info *br_info = work->request_buf; - struct ksmbd_conn *conn = work->conn; struct smb2_hdr *rsp_hdr; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! "); goto out; } rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -791,7 +778,9 @@ static void __smb2_lease_break_noti(struct work_struct *wk) rsp->AccessMaskHint = 0; rsp->ShareMaskHint = 0; - inc_rfc1001_len(work->response_buf, 44); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_lease_break))) + goto out; ksmbd_conn_write(work); @@ -1492,7 +1481,7 @@ struct create_context *smb2_find_context_vals(void *open_req, const char *tag, i name_len < 4 || name_off + name_len > cc_len || (value_off & 0x7) != 0 || - (value_off && (value_off < name_off + name_len)) || + (value_len && value_off < name_off + (name_len < 8 ? 
8 : name_len)) || ((u64)value_off + value_len > cc_len)) return ERR_PTR(-EINVAL); diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 9df121bdf3492..801cd0929209c 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -163,6 +163,7 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, { u16 command = 0; int rc; + bool is_chained = false; if (conn->ops->allocate_rsp_buf(work)) return; @@ -229,14 +230,13 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } } + is_chained = is_chained_smb2_message(work); + if (work->sess && (work->sess->sign || smb3_11_final_sess_setup_resp(work) || conn->ops->is_sign_req(work, command))) conn->ops->set_sign_rsp(work); - } while (is_chained_smb2_message(work)); - - if (work->send_no_response) - return; + } while (is_chained == true); send: smb3_preauth_hash_rsp(work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index a947c18915c2e..749660110878f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -145,12 +145,18 @@ void smb2_set_err_rsp(struct ksmbd_work *work) err_rsp = smb2_get_msg(work->response_buf); if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) { + int err; + err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE; err_rsp->ErrorContextCount = 0; err_rsp->Reserved = 0; err_rsp->ByteCount = 0; err_rsp->ErrorData[0] = 0; - inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2); + err = ksmbd_iov_pin_rsp(work, (void *)err_rsp, + __SMB2_HEADER_STRUCTURE_SIZE + + SMB2_ERROR_STRUCTURE_SIZE2); + if (err) + work->send_no_response = 1; } } @@ -245,9 +251,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) struct smb2_hdr *rsp_hdr; struct smb2_negotiate_rsp *rsp; struct ksmbd_conn *conn = work->conn; - - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); + int err; rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); @@ -286,12 +290,13 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; + err = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + AUTH_GSS_LENGTH); + if (err) + return err; conn->use_spnego = true; ksmbd_conn_set_need_negotiate(conn); @@ -390,11 +395,12 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) next_hdr_offset = le32_to_cpu(req->NextCommand); new_len = ALIGN(len, 8); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_hdr) + new_len - len); + work->iov[work->iov_idx].iov_len += (new_len - len); + inc_rfc1001_len(work->response_buf, new_len - len); rsp->NextCommand = cpu_to_le32(new_len); work->next_smb2_rcv_hdr_off += next_hdr_offset; + work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; work->next_smb2_rsp_hdr_off += new_len; ksmbd_debug(SMB, "Compound req new_len = %d rcv off = %d rsp off = %d\n", @@ -470,10 +476,10 @@ bool is_chained_smb2_message(struct ksmbd_work *work) len = len - get_rfc1002_len(work->response_buf); if (len) { ksmbd_debug(SMB, "padding len %u\n", len); + work->iov[work->iov_idx].iov_len += len; inc_rfc1001_len(work->response_buf, len); - if (work->aux_payload_sz) - work->aux_payload_sz += len; } + 
work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; } return false; } @@ -488,11 +494,8 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) { struct smb2_hdr *rsp_hdr = smb2_get_msg(work->response_buf); struct smb2_hdr *rcv_hdr = smb2_get_msg(work->request_buf); - struct ksmbd_conn *conn = work->conn; memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->Command = rcv_hdr->Command; @@ -657,7 +660,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) struct ksmbd_conn *conn = work->conn; int id; - rsp_hdr = smb2_get_msg(work->response_buf); + rsp_hdr = ksmbd_resp_buf_next(work); rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND; id = ksmbd_acquire_async_msg_id(&conn->async_ida); @@ -706,15 +709,24 @@ void release_async_work(struct ksmbd_work *work) void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) { struct smb2_hdr *rsp_hdr; + struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); - rsp_hdr = smb2_get_msg(work->response_buf); - smb2_set_err_rsp(work); + if (allocate_interim_rsp_buf(in_work)) { + pr_err("smb_allocate_rsp_buf failed!\n"); + ksmbd_free_work_struct(in_work); + return; + } + + in_work->conn = work->conn; + memcpy(smb2_get_msg(in_work->response_buf), ksmbd_resp_buf_next(work), + __SMB2_HEADER_STRUCTURE_SIZE); + + rsp_hdr = smb2_get_msg(in_work->response_buf); + smb2_set_err_rsp(in_work); rsp_hdr->Status = status; - work->multiRsp = 1; - ksmbd_conn_write(work); - rsp_hdr->Status = 0; - work->multiRsp = 0; + ksmbd_conn_write(in_work); + ksmbd_free_work_struct(in_work); } static __le32 smb2_get_reparse_tag_special_file(umode_t mode) @@ -821,9 +833,8 @@ static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) pneg_ctxt->Name[15] = 0x7C; } -static void assemble_neg_contexts(struct ksmbd_conn *conn, - struct smb2_negotiate_rsp *rsp, - void *smb2_buf_len) +static unsigned int assemble_neg_contexts(struct ksmbd_conn *conn, + struct smb2_negotiate_rsp *rsp) { char * const pneg_ctxt = (char *)rsp + le32_to_cpu(rsp->NegotiateContextOffset); @@ -834,7 +845,6 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, conn->preauth_info->Preauth_HashId); - inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING); ctxt_size = sizeof(struct smb2_preauth_neg_context); if (conn->cipher_type) { @@ -874,7 +884,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, } rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); - inc_rfc1001_len(smb2_buf_len, ctxt_size); + return ctxt_size + AUTH_GSS_PADDING; } static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, @@ -1090,7 +1100,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) struct smb2_negotiate_req *req = smb2_get_msg(work->request_buf); struct smb2_negotiate_rsp *rsp = smb2_get_msg(work->response_buf); int rc = 0; - unsigned int smb2_buf_len, smb2_neg_size; + unsigned int smb2_buf_len, smb2_neg_size, neg_ctxt_len = 0; __le32 status; ksmbd_debug(SMB, "Received negotiate request\n"); @@ -1183,7 +1193,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) conn->preauth_info->Preauth_HashValue); rsp->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); - assemble_neg_contexts(conn, rsp, work->response_buf); + neg_ctxt_len = assemble_neg_contexts(conn, rsp); break; case 
SMB302_PROT_ID: init_smb3_02_server(conn); @@ -1233,8 +1243,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); + rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; @@ -1252,9 +1261,15 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + + if (!rc) + rc = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + + AUTH_GSS_LENGTH + neg_ctxt_len); if (rc < 0) smb2_set_err_rsp(work); - return rc; } @@ -1454,7 +1469,6 @@ static int ntlm_authenticate(struct ksmbd_work *work, memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); - inc_rfc1001_len(work->response_buf, spnego_blob_len - 1); } user = session_user(conn, req); @@ -1600,7 +1614,6 @@ static int krb5_authenticate(struct ksmbd_work *work, return -EINVAL; } rsp->SecurityBufferLength = cpu_to_le16(out_len); - inc_rfc1001_len(work->response_buf, out_len - 1); if ((conn->sign || server_conf.enforced_signing) || (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) @@ -1672,7 +1685,6 @@ int smb2_sess_setup(struct ksmbd_work *work) rsp->SessionFlags = 0; rsp->SecurityBufferOffset = cpu_to_le16(72); rsp->SecurityBufferLength = 0; - inc_rfc1001_len(work->response_buf, 9); ksmbd_conn_lock(conn); if (!req->hdr.SessionId) { @@ -1808,13 +1820,6 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; rsp->hdr.Status = STATUS_MORE_PROCESSING_REQUIRED; - /* - * Note: here total size -1 is done as an - * adjustment for 0 size blob - */ - inc_rfc1001_len(work->response_buf, - le16_to_cpu(rsp->SecurityBufferLength) - 1); - } else if (negblob->MessageType == NtLmAuthenticate) { rc = ntlm_authenticate(work, req, rsp); if (rc) @@ -1899,6 +1904,18 @@ int smb2_sess_setup(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); } } + smb2_set_err_rsp(work); + } else { + unsigned int iov_len; + + if (rsp->SecurityBufferLength) + iov_len = offsetof(struct smb2_sess_setup_rsp, Buffer) + + le16_to_cpu(rsp->SecurityBufferLength); + else + iov_len = sizeof(struct smb2_sess_setup_rsp); + rc = ksmbd_iov_pin_rsp(work, rsp, iov_len); + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; } ksmbd_conn_unlock(conn); @@ -1977,13 +1994,16 @@ int smb2_tree_connect(struct ksmbd_work *work) status.tree_conn->posix_extensions = true; rsp->StructureSize = cpu_to_le16(16); - inc_rfc1001_len(work->response_buf, 16); out_err1: rsp->Capabilities = 0; rsp->Reserved = 0; /* default manual caching */ rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING; + rc = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_connect_rsp)); + if (rc) + status.ret = KSMBD_TREE_CONN_STATUS_NOMEM; + if (!IS_ERR(treename)) kfree(treename); if (!IS_ERR(name)) @@ -2096,20 +2116,27 @@ int smb2_tree_disconnect(struct ksmbd_work *work) struct smb2_tree_disconnect_req *req; struct ksmbd_session *sess = work->sess; struct ksmbd_tree_connect *tcon = work->tcon; + int err; WORK_BUFFERS(work, req, rsp); - rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - ksmbd_debug(SMB, "request\n"); + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, + 
sizeof(struct smb2_tree_disconnect_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } + if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return 0; + return -ENOENT; } ksmbd_close_tree_conn_fds(work); @@ -2131,15 +2158,21 @@ int smb2_session_logoff(struct ksmbd_work *work) struct smb2_logoff_rsp *rsp; struct ksmbd_session *sess; u64 sess_id; + int err; WORK_BUFFERS(work, req, rsp); + ksmbd_debug(SMB, "request\n"); + sess_id = le64_to_cpu(req->hdr.SessionId); rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - - ksmbd_debug(SMB, "request\n"); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); ksmbd_close_session_fds(work); @@ -2154,7 +2187,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return 0; + return -ENOENT; } ksmbd_destroy_file_table(&sess->file_table); @@ -2215,7 +2248,10 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_create_rsp, Buffer)); + if (err) + goto out; + kfree(name); return 0; @@ -2597,6 +2633,7 @@ int smb2_open(struct ksmbd_work *work) u64 time; umode_t posix_mode = 0; __le32 daccess, maximal_access = 0; + int iov_len = 0; WORK_BUFFERS(work, req, rsp); @@ -3248,7 +3285,7 @@ int smb2_open(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + iov_len = offsetof(struct smb2_create_rsp, Buffer); /* If lease is request send lease context response */ if (opinfo && opinfo->is_lease) { @@ -3263,8 +3300,7 @@ int smb2_open(struct ksmbd_work *work) create_lease_buf(rsp->Buffer, opinfo->o_lease); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_lease_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_lease_size); + iov_len += conn->vals->create_lease_size; next_ptr = &lease_ccontext->Next; next_off = conn->vals->create_lease_size; } @@ -3284,8 +3320,7 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(maximal_access)); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_mxac_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_mxac_size); + iov_len += conn->vals->create_mxac_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &mxac_ccontext->Next; @@ -3303,8 +3338,7 @@ int smb2_open(struct ksmbd_work *work) stat.ino, tcon->id); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_disk_id_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_disk_id_size); + iov_len += conn->vals->create_disk_id_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &disk_id_ccontext->Next; @@ -3318,8 +3352,7 @@ int smb2_open(struct ksmbd_work *work) fp); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_posix_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_posix_size); + iov_len += conn->vals->create_posix_size; 
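
[Editor's aside] For the create contexts assembled above, the patch replaces the per-context inc_rfc1001_len() calls with a running iov_len total that is pinned once on the exit path. A condensed sketch of the accounting; the condition names are illustrative stand-ins, not variables from the patch:

    iov_len = offsetof(struct smb2_create_rsp, Buffer);
    if (lease_requested)                /* illustrative condition */
            iov_len += conn->vals->create_lease_size;
    if (maximal_access_requested)       /* illustrative condition */
            iov_len += conn->vals->create_mxac_size;
    if (disk_id_wanted)                 /* illustrative condition */
            iov_len += conn->vals->create_disk_id_size;
    if (posix_context_wanted)           /* illustrative condition */
            iov_len += conn->vals->create_posix_size;

    /* at err_out1: one pin covers the fixed reply plus every context */
    if (!rc)
            rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
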
if (next_ptr) *next_ptr = cpu_to_le32(next_off); } @@ -3337,7 +3370,8 @@ int smb2_open(struct ksmbd_work *work) } ksmbd_revert_fsids(work); err_out1: - + if (!rc) + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -4063,7 +4097,10 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferOffset = cpu_to_le16(0); rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; - inc_rfc1001_len(work->response_buf, 9); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_query_directory_rsp)); + if (rc) + goto err_out; } else { no_buf_len: ((struct file_directory_info *) @@ -4075,7 +4112,11 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); rsp->OutputBufferLength = cpu_to_le32(d_info.data_count); - inc_rfc1001_len(work->response_buf, 8 + d_info.data_count); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_directory_rsp, Buffer) + + d_info.data_count); + if (rc) + goto err_out; } kfree(srch_ptr); @@ -4116,27 +4157,18 @@ int smb2_query_dir(struct ksmbd_work *work) * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length * @rsp_org: base response buffer pointer in case of chained response - * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error */ static int buffer_check_err(int reqOutputBufferLength, struct smb2_query_info_rsp *rsp, - void *rsp_org, int infoclass_size) + void *rsp_org) { if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) { - if (reqOutputBufferLength < infoclass_size) { - pr_err("Invalid Buffer Size Requested\n"); - rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); - return -EINVAL; - } - - ksmbd_debug(SMB, "Buffer Overflow\n"); - rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr) + - reqOutputBufferLength); - rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength); + pr_err("Invalid Buffer Size Requested\n"); + rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; + *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); + return -EINVAL; } return 0; } @@ -4155,7 +4187,6 @@ static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp, sinfo->Directory = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_standard_info)); } static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, @@ -4169,7 +4200,6 @@ static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63)); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int smb2_get_info_file_pipe(struct ksmbd_session *sess, @@ -4195,14 +4225,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, case FILE_STANDARD_INFORMATION: get_standard_info_pipe(rsp, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_STANDARD_INFORMATION_SIZE); + rsp, rsp_org); break; case FILE_INTERNAL_INFORMATION: get_internal_info_pipe(rsp, id, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_INTERNAL_INFORMATION_SIZE); + rsp, rsp_org); break; 
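
[Editor's aside] Both pipe info classes above now funnel through the slimmed-down buffer_check_err(). With the infoclass_size argument gone, the check reduces to a single rule: a response larger than the client's offered OutputBufferLength fails outright instead of being truncated under STATUS_BUFFER_OVERFLOW. A minimal model of the new check, where req_output_len abbreviates the request's OutputBufferLength:

    /* e.g. client offered 16 bytes but the response carries 24: fail */
    if (req_output_len < le32_to_cpu(rsp->OutputBufferLength)) {
            rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH;
            return -EINVAL;
    }
    return 0;
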
default: ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n", @@ -4308,7 +4336,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) name_len -= XATTR_USER_PREFIX_LEN; - ptr = (char *)(&eainfo->name + name_len + 1); + ptr = eainfo->name + name_len + 1; buf_free_len -= (offsetof(struct smb2_ea_info, name) + name_len + 1); /* bailout if xattr can't fit in buf_free_len */ @@ -4370,7 +4398,6 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, if (rsp_data_cnt == 0) rsp->hdr.Status = STATUS_NO_EAS_ON_FILE; rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt); - inc_rfc1001_len(rsp_org, rsp_data_cnt); out: kvfree(xattr_list); return rc; @@ -4385,7 +4412,6 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp, file_info->AccessFlags = fp->daccess; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_access_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info)); } static int get_file_basic_info(struct smb2_query_info_rsp *rsp, @@ -4415,7 +4441,6 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, basic_info->Pad1 = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_basic_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info)); return 0; } @@ -4440,8 +4465,6 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_standard_info)); } static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, @@ -4453,8 +4476,6 @@ static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, file_info->AlignmentRequirement = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alignment_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_alignment_info)); } static int get_file_all_info(struct ksmbd_work *work, @@ -4518,7 +4539,6 @@ static int get_file_all_info(struct ksmbd_work *work, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1); kfree(filename); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); return 0; } @@ -4541,7 +4561,6 @@ static void get_file_alternate_info(struct ksmbd_work *work, file_info->FileNameLength = cpu_to_le32(conv_len); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); } static void get_file_stream_info(struct ksmbd_work *work, @@ -4641,7 +4660,6 @@ static void get_file_stream_info(struct ksmbd_work *work, kvfree(xattr_list); rsp->OutputBufferLength = cpu_to_le32(nbytes); - inc_rfc1001_len(rsp_org, nbytes); } static void get_file_internal_info(struct smb2_query_info_rsp *rsp, @@ -4656,7 +4674,6 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, file_info->IndexNumber = cpu_to_le64(stat.ino); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, @@ -4692,7 +4709,6 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info)); return 0; } @@ 
-4704,7 +4720,6 @@ static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org) file_info->EASize = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ea_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info)); } static void get_file_position_info(struct smb2_query_info_rsp *rsp, @@ -4716,7 +4731,6 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info)); } static void get_file_mode_info(struct smb2_query_info_rsp *rsp, @@ -4728,7 +4742,6 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp, file_info->Mode = fp->coption & FILE_MODE_INFO_MASK; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_mode_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info)); } static void get_file_compression_info(struct smb2_query_info_rsp *rsp, @@ -4750,7 +4763,6 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_comp_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info)); } static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, @@ -4769,11 +4781,10 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, file_info->ReparseTag = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_attr_tag_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info)); return 0; } -static int find_file_posix_info(struct smb2_query_info_rsp *rsp, +static void find_file_posix_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { struct smb311_posix_qinfo *file_info; @@ -4811,8 +4822,6 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); rsp->OutputBufferLength = cpu_to_le32(out_buf_len); - inc_rfc1001_len(rsp_org, out_buf_len); - return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4822,7 +4831,6 @@ static int smb2_get_info_file(struct ksmbd_work *work, struct ksmbd_file *fp; int fileinfoclass = 0; int rc = 0; - int file_infoclass_size; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; if (test_share_config_flag(work->tcon->share_conf, @@ -4855,85 +4863,69 @@ static int smb2_get_info_file(struct ksmbd_work *work, switch (fileinfoclass) { case FILE_ACCESS_INFORMATION: get_file_access_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE; break; case FILE_BASIC_INFORMATION: rc = get_file_basic_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_BASIC_INFORMATION_SIZE; break; case FILE_STANDARD_INFORMATION: get_file_standard_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE; break; case FILE_ALIGNMENT_INFORMATION: get_file_alignment_info(rsp, work->response_buf); - file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE; break; case FILE_ALL_INFORMATION: rc = get_file_all_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALL_INFORMATION_SIZE; break; case FILE_ALTERNATE_NAME_INFORMATION: get_file_alternate_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE; break; case FILE_STREAM_INFORMATION: get_file_stream_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_STREAM_INFORMATION_SIZE; break; 
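
[Editor's aside] Each case in this switch loses the same two lines for the same reason: the helpers already record their payload size in rsp->OutputBufferLength, and smb2_query_info() later pins header plus payload in one step, so neither the per-class size constant nor the inc_rfc1001_len() call is needed. Both halves of the pattern, taken from the patch itself:

    /* helper: record the payload size only */
    rsp->OutputBufferLength =
            cpu_to_le32(sizeof(struct smb2_file_internal_info));

    /* caller (smb2_query_info): one pin for header + payload */
    rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
                           offsetof(struct smb2_query_info_rsp, Buffer) +
                           le32_to_cpu(rsp->OutputBufferLength));
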
case FILE_INTERNAL_INFORMATION: get_file_internal_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE; break; case FILE_NETWORK_OPEN_INFORMATION: rc = get_file_network_open_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE; break; case FILE_EA_INFORMATION: get_file_ea_info(rsp, work->response_buf); - file_infoclass_size = FILE_EA_INFORMATION_SIZE; break; case FILE_FULL_EA_INFORMATION: rc = smb2_get_ea(work, fp, req, rsp, work->response_buf); - file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE; break; case FILE_POSITION_INFORMATION: get_file_position_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_POSITION_INFORMATION_SIZE; break; case FILE_MODE_INFORMATION: get_file_mode_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_MODE_INFORMATION_SIZE; break; case FILE_COMPRESSION_INFORMATION: get_file_compression_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE; break; case FILE_ATTRIBUTE_TAG_INFORMATION: rc = get_file_attribute_tag_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE; break; case SMB_FIND_FILE_POSIX_INFO: if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - file_infoclass_size = find_file_posix_info(rsp, fp, - work->response_buf); + find_file_posix_info(rsp, fp, work->response_buf); } break; default: @@ -4943,8 +4935,7 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (!rc) rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - file_infoclass_size); + rsp, work->response_buf); ksmbd_fd_put(work, fp); return rc; } @@ -4960,7 +4951,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, struct kstatfs stfs; struct path path; int rc = 0, len; - int fs_infoclass_size = 0; if (!share->path) return -EIO; @@ -4990,8 +4980,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DeviceType = cpu_to_le32(stfs.f_type); info->DeviceCharacteristics = cpu_to_le32(0x00000020); rsp->OutputBufferLength = cpu_to_le32(8); - inc_rfc1001_len(work->response_buf, 8); - fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE; break; } case FS_ATTRIBUTE_INFORMATION: @@ -5020,8 +5008,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->FileSystemNameLen = cpu_to_le32(len); sz = sizeof(struct filesystem_attribute_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE; break; } case FS_VOLUME_INFORMATION: @@ -5048,8 +5034,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->Reserved = 0; sz = sizeof(struct filesystem_vol_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE; break; } case FS_SIZE_INFORMATION: @@ -5062,8 +5046,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); rsp->OutputBufferLength = cpu_to_le32(24); - inc_rfc1001_len(work->response_buf, 24); - fs_infoclass_size = FS_SIZE_INFORMATION_SIZE; break; } case FS_FULL_SIZE_INFORMATION: @@ -5079,8 +5061,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); 
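
[Editor's aside] The literal byte counts these filesystem info classes report come from the fixed MS-FSCC layouts; for instance, the 32 bytes set just below for FS_FULL_SIZE_INFORMATION decompose as:

    /* FileFsFullSizeInformation, 32 bytes total (per MS-FSCC):
     *   TotalAllocationUnits             8
     *   CallerAvailableAllocationUnits   8
     *   ActualAvailableAllocationUnits   8
     *   SectorsPerAllocationUnit         4
     *   BytesPerSector                   4
     */
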
rsp->OutputBufferLength = cpu_to_le32(32); - inc_rfc1001_len(work->response_buf, 32); - fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE; break; } case FS_OBJECT_ID_INFORMATION: @@ -5100,8 +5080,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->extended_info.rel_date = 0; memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0")); rsp->OutputBufferLength = cpu_to_le32(64); - inc_rfc1001_len(work->response_buf, 64); - fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE; break; } case FS_SECTOR_SIZE_INFORMATION: @@ -5123,8 +5101,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->ByteOffsetForSectorAlignment = 0; info->ByteOffsetForPartitionAlignment = 0; rsp->OutputBufferLength = cpu_to_le32(28); - inc_rfc1001_len(work->response_buf, 28); - fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE; break; } case FS_CONTROL_INFORMATION: @@ -5145,8 +5121,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID); info->Padding = 0; rsp->OutputBufferLength = cpu_to_le32(48); - inc_rfc1001_len(work->response_buf, 48); - fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE; break; } case FS_POSIX_INFORMATION: @@ -5166,8 +5140,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->TotalFileNodes = cpu_to_le64(stfs.f_files); info->FreeFileNodes = cpu_to_le64(stfs.f_ffree); rsp->OutputBufferLength = cpu_to_le32(56); - inc_rfc1001_len(work->response_buf, 56); - fs_infoclass_size = FS_POSIX_INFORMATION_SIZE; } break; } @@ -5176,8 +5148,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, return -EOPNOTSUPP; } rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - fs_infoclass_size); + rsp, work->response_buf); path_put(&path); return rc; } @@ -5211,7 +5182,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, secdesclen = sizeof(struct smb_ntsd); rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5256,7 +5226,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, return rc; rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5295,6 +5264,14 @@ int smb2_query_info(struct ksmbd_work *work) rc = -EOPNOTSUPP; } + if (!rc) { + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(72); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_info_rsp, Buffer) + + le32_to_cpu(rsp->OutputBufferLength)); + } + if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; @@ -5302,6 +5279,8 @@ int smb2_query_info(struct ksmbd_work *work) rsp->hdr.Status = STATUS_FILE_CLOSED; else if (rc == -EIO) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + else if (rc == -ENOMEM) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; smb2_set_err_rsp(work); @@ -5310,9 +5289,6 @@ int smb2_query_info(struct ksmbd_work *work) rc); return rc; } - rsp->StructureSize = cpu_to_le16(9); - rsp->OutputBufferOffset = cpu_to_le16(72); - inc_rfc1001_len(work->response_buf, 8); return 0; } @@ -5343,8 +5319,9 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work) rsp->AllocationSize = 0; rsp->EndOfFile = 0; rsp->Attributes = 0; - inc_rfc1001_len(work->response_buf, 60); - return 0; + + return ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); } /** @@ -5449,15 +5426,17 @@ 
int smb2_close(struct ksmbd_work *work) err = ksmbd_close_fd(work, volatile_id); out: + if (!err) + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); + if (err) { if (rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_FILE_CLOSED; smb2_set_err_rsp(work); - } else { - inc_rfc1001_len(work->response_buf, 60); } - return 0; + return err; } /** @@ -5475,8 +5454,7 @@ int smb2_echo(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_echo_rsp)); } static int smb2_rename(struct ksmbd_work *work, @@ -6068,7 +6046,10 @@ int smb2_set_info(struct ksmbd_work *work) goto err_out; rsp->StructureSize = cpu_to_le16(2); - inc_rfc1001_len(work->response_buf, 2); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_set_info_rsp)); + if (rc) + goto err_out; ksmbd_fd_put(work, fp); return 0; @@ -6115,28 +6096,36 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) id = req->VolatileFileId; - inc_rfc1001_len(work->response_buf, 16); rpc_resp = ksmbd_rpc_read(work->sess, id); if (rpc_resp) { + void *aux_payload_buf; + if (rpc_resp->flags != KSMBD_RPC_OK) { err = -EINVAL; goto out; } - work->aux_payload_buf = + aux_payload_buf = kvmalloc(rpc_resp->payload_sz, GFP_KERNEL); - if (!work->aux_payload_buf) { + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - memcpy(work->aux_payload_buf, rpc_resp->payload, - rpc_resp->payload_sz); + memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz); nbytes = rpc_resp->payload_sz; - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; kvfree(rpc_resp); + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; + } else { + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer)); + if (err) + goto out; } rsp->StructureSize = cpu_to_le16(17); @@ -6145,7 +6134,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, nbytes); return 0; out: @@ -6219,13 +6207,8 @@ int smb2_read(struct ksmbd_work *work) int err = 0; bool is_rdma_channel = false; unsigned int max_read_size = conn->vals->max_read_size; - - WORK_BUFFERS(work, req, rsp); - if (work->next_smb2_rcv_hdr_off) { - work->send_no_response = 1; - err = -EOPNOTSUPP; - goto out; - } + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + void *aux_payload_buf; if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -6233,6 +6216,25 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (work->next_smb2_rcv_hdr_off) { + req = ksmbd_req_buf_next(work); + rsp = ksmbd_resp_buf_next(work); + if (!has_file_id(req->VolatileFileId)) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } else { + req = smb2_get_msg(work->request_buf); + rsp = smb2_get_msg(work->response_buf); + } + + if (!has_file_id(id)) { + id = req->VolatileFileId; + pid = req->PersistentFileId; + } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; @@ -6255,7 +6257,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); + 
fp = ksmbd_lookup_fd_slow(work, id, pid); if (!fp) { err = -ENOENT; goto out; @@ -6281,21 +6283,20 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", fp->filp, offset, length); - work->aux_payload_buf = kvzalloc(length, GFP_KERNEL); - if (!work->aux_payload_buf) { + aux_payload_buf = kvzalloc(length, GFP_KERNEL); + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - nbytes = ksmbd_vfs_read(work, fp, length, &offset); + nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf); if (nbytes < 0) { err = nbytes; goto out; } if ((nbytes == 0 && length != 0) || nbytes < mincount) { - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; + kvfree(aux_payload_buf); rsp->hdr.Status = STATUS_END_OF_FILE; smb2_set_err_rsp(work); ksmbd_fd_put(work, fp); @@ -6308,10 +6309,9 @@ int smb2_read(struct ksmbd_work *work) if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, - work->aux_payload_buf, + aux_payload_buf, nbytes); - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; + kvfree(aux_payload_buf); nbytes = 0; if (remain_bytes < 0) { @@ -6326,10 +6326,11 @@ int smb2_read(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = cpu_to_le32(remain_bytes); rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, 16); - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; - inc_rfc1001_len(work->response_buf, nbytes); + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6412,8 +6413,8 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(length); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); - return 0; + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_write_rsp, Buffer)); out: if (err) { rsp->hdr.Status = STATUS_INVALID_HANDLE; @@ -6569,7 +6570,9 @@ int smb2_write(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_write_rsp, Buffer)); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6616,15 +6619,11 @@ int smb2_flush(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_flush_rsp)); out: - if (err) { - rsp->hdr.Status = STATUS_INVALID_HANDLE; - smb2_set_err_rsp(work); - } - + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); return err; } @@ -7078,8 +7077,6 @@ int smb2_lock(struct ksmbd_work *work) goto out; } - init_smb2_rsp_hdr(work); - smb2_set_err_rsp(work); rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; kfree(smb_lock); @@ -7114,7 +7111,10 @@ int smb2_lock(struct ksmbd_work *work) ksmbd_debug(SMB, "successful in taking lock\n"); rsp->hdr.Status = STATUS_SUCCESS; rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lock_rsp)); + if (err) + goto out; + ksmbd_fd_put(work, fp); return 0; @@ -7910,9 +7910,9 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->Reserved = cpu_to_le16(0); rsp->Flags = cpu_to_le32(0); rsp->Reserved2 = cpu_to_le32(0); - inc_rfc1001_len(work->response_buf, 48 + 
nbytes); - - return 0; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_ioctl_rsp) + nbytes); + if (!ret) + return ret; out: if (ret == -EACCES) @@ -8047,8 +8047,9 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->Reserved2 = 0; rsp->VolatileFid = volatile_id; rsp->PersistentFid = persistent_id; - inc_rfc1001_len(work->response_buf, 24); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break)); + if (!ret) + return; err_out: opinfo->op_state = OPLOCK_STATE_NONE; @@ -8198,8 +8199,9 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) memcpy(rsp->LeaseKey, req->LeaseKey, 16); rsp->LeaseState = lease_state; rsp->LeaseDuration = 0; - inc_rfc1001_len(work->response_buf, 36); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack)); + if (!ret) + return; err_out: opinfo->op_state = OPLOCK_STATE_NONE; @@ -8337,43 +8339,19 @@ int smb2_check_sign_req(struct ksmbd_work *work) void smb2_set_sign_rsp(struct ksmbd_work *work) { struct smb2_hdr *hdr; - struct smb2_hdr *req_hdr; char signature[SMB2_HMACSHA256_SIZE]; - struct kvec iov[2]; - size_t len; + struct kvec *iov; int n_vec = 1; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } - - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - + hdr = ksmbd_resp_buf_curr(work); hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec, @@ -8449,29 +8427,14 @@ int smb3_check_sign_req(struct ksmbd_work *work) void smb3_set_sign_rsp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_hdr *req_hdr, *hdr; + struct smb2_hdr *hdr; struct channel *chann; char signature[SMB2_CMACAES_SIZE]; - struct kvec iov[2]; + struct kvec *iov; int n_vec = 1; - size_t len; char *signing_key; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } + hdr = ksmbd_resp_buf_curr(work); if (conn->binding == false && le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { @@ -8487,21 +8450,18 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) if (!signing_key) return; - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + 
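
[Editor's aside] The vector selection that follows leans on an invariant of the new pinning scheme (an assumption inferred from the READ path above, not stated in the patch): only a READ response is pinned as two kvecs, so only it is signed across two vectors.

    /* assumed pinned-vector layout for a signed READ response:
     *   iov[0]            4-byte RFC1002 length
     *   (earlier compound PDUs, if any)
     *   iov[iov_idx - 1]  SMB2 READ header + fixed reply
     *   iov[iov_idx]      file data (the former aux_payload_buf)
     * every other command's PDU lives wholly in iov[iov_idx].
     */
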
+ if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } - if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature)) + if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, + signature)) memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE); } @@ -8568,45 +8528,22 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type) int smb3_encrypt_resp(struct ksmbd_work *work) { - char *buf = work->response_buf; - struct kvec iov[3]; + struct kvec *iov = work->iov; int rc = -ENOMEM; - int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0); + void *tr_buf; - if (ARRAY_SIZE(iov) < rq_nvec) - return -ENOMEM; - - work->tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); - if (!work->tr_buf) + tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); + if (!tr_buf) return rc; /* fill transform header */ - fill_transform_hdr(work->tr_buf, buf, work->conn->cipher_type); + fill_transform_hdr(tr_buf, work->response_buf, work->conn->cipher_type); - iov[0].iov_base = work->tr_buf; + iov[0].iov_base = tr_buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; - buf_size += iov[0].iov_len - 4; - - iov[1].iov_base = buf + 4; - iov[1].iov_len = get_rfc1002_len(buf); - if (work->aux_payload_sz) { - iov[1].iov_len = work->resp_hdr_sz - 4; - - iov[2].iov_base = work->aux_payload_buf; - iov[2].iov_len = work->aux_payload_sz; - buf_size += iov[2].iov_len; - } - buf_size += iov[1].iov_len; - work->resp_hdr_sz = iov[1].iov_len; + work->tr_buf = tr_buf; - rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); - if (rc) - return rc; - - memmove(buf, iov[1].iov_base, iov[1].iov_len); - *(__be32 *)work->tr_buf = cpu_to_be32(buf_size); - - return rc; + return ksmbd_crypt_message(work, iov, work->iov_idx + 1, 1); } bool smb3_is_transform_hdr(void *buf) diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 2767c08a534a3..d12cfd3b09278 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -361,7 +361,7 @@ struct smb2_ea_info { __u8 Flags; __u8 EaNameLength; __le16 EaValueLength; - char name[1]; + char name[]; /* optionally followed by value */ } __packed; /* level 15 Query */ diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index c2b75d8988528..e6ba1e9b8589a 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -319,12 +319,6 @@ static int init_smb1_rsp_hdr(struct ksmbd_work *work) struct smb_hdr *rsp_hdr = (struct smb_hdr *)work->response_buf; struct smb_hdr *rcv_hdr = (struct smb_hdr *)work->request_buf; - /* - * Remove 4 byte direct TCP header. - */ - *(__be32 *)work->response_buf = - cpu_to_be32(sizeof(struct smb_hdr) - 4); - rsp_hdr->Command = SMB_COM_NEGOTIATE; *(__le32 *)rsp_hdr->Protocol = SMB1_PROTO_NUMBER; rsp_hdr->Flags = SMBFLG_RESPONSE; @@ -560,10 +554,11 @@ static int smb_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Unsupported SMB1 protocol\n"); - /* Add 2 byte bcc and 2 byte DialectIndex. 
*/ - inc_rfc1001_len(work->response_buf, 4); - neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; + if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp, + sizeof(struct smb_negotiate_rsp) - 4)) + return -ENOMEM; + neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; neg_rsp->hdr.WordCount = 1; neg_rsp->DialectIndex = cpu_to_le16(work->conn->dialect); neg_rsp->ByteCount = 0; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c06efc020bd95..3b269e1f523a1 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1241,14 +1241,12 @@ static int smb_direct_writev(struct ksmbd_transport *t, //FIXME: skip RFC1002 header.. buflen -= 4; - iov[0].iov_base += 4; - iov[0].iov_len -= 4; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 0; + start = i = 1; buflen = 0; while (true) { buflen += iov[i].iov_len; @@ -1366,24 +1364,35 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, LIST_HEAD(msg_list); char *desc_buf; int credits_needed; - unsigned int desc_buf_len; - size_t total_length = 0; + unsigned int desc_buf_len, desc_num = 0; if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; + if (buf_len > t->max_rdma_rw_size) + return -EINVAL; + /* calculate needed credits */ credits_needed = 0; desc_buf = buf; for (i = 0; i < desc_len / sizeof(*desc); i++) { + if (!buf_len) + break; + desc_buf_len = le32_to_cpu(desc[i].length); + if (!desc_buf_len) + return -EINVAL; + + if (desc_buf_len > buf_len) { + desc_buf_len = buf_len; + desc[i].length = cpu_to_le32(desc_buf_len); + buf_len = 0; + } credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); desc_buf += desc_buf_len; - total_length += desc_buf_len; - if (desc_buf_len == 0 || total_length > buf_len || - total_length > t->max_rdma_rw_size) - return -EINVAL; + buf_len -= desc_buf_len; + desc_num++; } ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", @@ -1395,7 +1404,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* build rdma_rw_ctx for each descriptor */ desc_buf = buf; - for (i = 0; i < desc_len / sizeof(*desc); i++) { + for (i = 0; i < desc_num; i++) { msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); if (!msg) { diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index d48756a339a54..b5a5e50fc9ca3 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -367,15 +367,15 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, * @fid: file id of open file * @count: read byte count * @pos: file pos + * @rbuf: read data buffer * * Return: number of read bytes on success, otherwise error */ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, - loff_t *pos) + loff_t *pos, char *rbuf) { struct file *filp = fp->filp; ssize_t nbytes = 0; - char *rbuf = work->aux_payload_buf; struct inode *inode = file_inode(filp); if (S_ISDIR(inode->i_mode)) diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 72f9fb4b48d13..00968081856e3 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -76,8 +76,8 @@ void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); -int ksmbd_vfs_read(struct ksmbd_work *work, struct 
ksmbd_file *fp, - size_t count, loff_t *pos); +int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, + loff_t *pos, char *rbuf); int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf, size_t count, loff_t *pos, bool sync, ssize_t *written); diff --git a/fs/super.c b/fs/super.c index bd8dcfc822c39..2d762ce67f6e6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1373,6 +1373,50 @@ int get_tree_keyed(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_keyed); +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_dev = *(dev_t *)data; + return 0; +} + +static int super_s_dev_set(struct super_block *s, struct fs_context *fc) +{ + return set_bdev_super(s, fc->sget_key); +} + +static int super_s_dev_test(struct super_block *s, struct fs_context *fc) +{ + return !(s->s_iflags & SB_I_RETIRED) && + s->s_dev == *(dev_t *)fc->sget_key; +} + +/** + * sget_dev - Find or create a superblock by device number + * @fc: Filesystem context. + * @dev: device number + * + * Find or create a superblock using the provided device number that + * will be stored in fc->sget_key. + * + * If an extant superblock is matched, then that will be returned with + * an elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will + * be set). The superblock will be published and it will be returned in + * a partially constructed state with SB_BORN and SB_ACTIVE as yet + * unset. + * + * Return: an existing or newly created superblock on success, an error + * pointer on failure. + */ +struct super_block *sget_dev(struct fs_context *fc, dev_t dev) +{ + fc->sget_key = &dev; + return sget_fc(fc, super_s_dev_test, super_s_dev_set); +} +EXPORT_SYMBOL(sget_dev); + #ifdef CONFIG_BLOCK /* * Lock a super block that the callers holds a reference to. 
@@ -1431,23 +1475,6 @@ const struct blk_holder_ops fs_holder_ops = { }; EXPORT_SYMBOL_GPL(fs_holder_ops); -static int set_bdev_super(struct super_block *s, void *data) -{ - s->s_dev = *(dev_t *)data; - return 0; -} - -static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) -{ - return set_bdev_super(s, fc->sget_key); -} - -static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) -{ - return !(s->s_iflags & SB_I_RETIRED) && - s->s_dev == *(dev_t *)fc->sget_key; -} - int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { @@ -1525,8 +1552,7 @@ int get_tree_bdev(struct fs_context *fc, } fc->sb_flags |= SB_NOSEC; - fc->sget_key = &dev; - s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); + s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); diff --git a/include/linux/dlm_plock.h b/include/linux/dlm_plock.h index e6d76e8715a6c..15fc856d198cf 100644 --- a/include/linux/dlm_plock.h +++ b/include/linux/dlm_plock.h @@ -11,6 +11,8 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, int cmd, struct file_lock *fl); int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct file_lock *fl); +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl); int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct file_lock *fl); #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index c8ff4156a0a15..4aeb3fa119277 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2397,6 +2397,7 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); +struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. 
Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 44c298aa58d44..52772c826c868 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -630,11 +630,6 @@ struct transaction_s */ struct list_head t_inode_list; - /* - * Protects info related to handles - */ - spinlock_t t_handle_lock; - /* * Longest time some handle had to wait for running transaction */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index f42594a9efe0d..0f016d69c996e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -204,6 +204,8 @@ extern unsigned long nlmsvc_timeout; extern bool nsm_use_hostnames; extern u32 nsm_local_state; +extern struct timer_list nlmsvc_retry; + /* * Lockd client functions */ @@ -280,7 +282,7 @@ __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *); __be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); -unsigned long nlmsvc_retry_blocked(void); +void nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t match); void nlmsvc_grant_reply(struct nlm_cookie *, __be32); diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 518bd28f5ab8c..35766963dd145 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -56,10 +56,14 @@ struct cache_head { struct kref ref; unsigned long flags; }; -#define CACHE_VALID 0 /* Entry contains valid data */ -#define CACHE_NEGATIVE 1 /* Negative entry - there is no match for the key */ -#define CACHE_PENDING 2 /* An upcall has been sent but no reply received yet*/ -#define CACHE_CLEANED 3 /* Entry has been cleaned from cache */ + +/* cache_head.flags */ +enum { + CACHE_VALID, /* Entry contains valid data */ + CACHE_NEGATIVE, /* Negative entry - there is no match for the key */ + CACHE_PENDING, /* An upcall has been sent but no reply received yet*/ + CACHE_CLEANED, /* Entry has been cleaned from cache */ +}; #define CACHE_NEW_EXPIRY 120 /* keep new things pending confirmation for 120 seconds */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 4f41d839face4..af7358277f1c3 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -148,6 +148,8 @@ struct rpc_create_args { const struct cred *cred; unsigned int max_connect; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; struct rpc_add_xprt_test { diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h index d94d4f4105074..3ce1550d1beb3 100644 --- a/include/linux/sunrpc/stats.h +++ b/include/linux/sunrpc/stats.h @@ -43,22 +43,6 @@ struct net; #ifdef CONFIG_PROC_FS int rpc_proc_init(struct net *); void rpc_proc_exit(struct net *); -#else -static inline int rpc_proc_init(struct net *net) -{ - return 0; -} - -static inline void rpc_proc_exit(struct net *net) -{ -} -#endif - -#ifdef MODULE -void rpc_modcount(struct inode *, int); -#endif - -#ifdef CONFIG_PROC_FS struct proc_dir_entry * rpc_proc_register(struct net *,struct rpc_stat *); void rpc_proc_unregister(struct net *,const char *); void rpc_proc_zero(const struct rpc_program *); @@ -69,7 +53,14 @@ void svc_proc_unregister(struct net *, const char *); void svc_seq_show(struct seq_file *, const struct svc_stat *); #else +static inline int rpc_proc_init(struct net *net) +{ + return 0; +} +static inline 
void rpc_proc_exit(struct net *net) +{ +} static inline struct proc_dir_entry *rpc_proc_register(struct net *net, struct rpc_stat *s) { return NULL; } static inline void rpc_proc_unregister(struct net *net, const char *p) {} static inline void rpc_proc_zero(const struct rpc_program *p) {} diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index f8751118c1221..dbf5b21feafe4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -39,16 +39,20 @@ struct svc_pool { struct list_head sp_all_threads; /* all server threads */ /* statistics on pool operation */ + struct percpu_counter sp_messages_arrived; struct percpu_counter sp_sockets_queued; struct percpu_counter sp_threads_woken; - struct percpu_counter sp_threads_timedout; -#define SP_TASK_PENDING (0) /* still work to do even if no - * xprt is queued. */ -#define SP_CONGESTED (1) unsigned long sp_flags; } ____cacheline_aligned_in_smp; +/* bits for sp_flags */ +enum { + SP_TASK_PENDING, /* still work to do even if no xprt is queued */ + SP_CONGESTED, /* all threads are busy, none idle */ +}; + + /* * RPC service. * @@ -120,19 +124,6 @@ static inline void svc_put(struct svc_serv *serv) kref_put(&serv->sv_refcnt, svc_destroy); } -/** - * svc_put_not_last - decrement non-final reference count on SUNRPC serv - * @serv: the svc_serv to have count decremented - * - * Returns: %true is refcount was decremented. - * - * If the refcount is 1, it is not decremented and instead failure is reported. - */ -static inline bool svc_put_not_last(struct svc_serv *serv) -{ - return refcount_dec_not_one(&serv->sv_refcnt.refcount); -} - /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is @@ -232,16 +223,6 @@ struct svc_rqst { u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ int rq_cachetype; /* catering to nfsd */ -#define RQ_SECURE (0) /* secure port */ -#define RQ_LOCAL (1) /* local request */ -#define RQ_USEDEFERRAL (2) /* use deferral */ -#define RQ_DROPME (3) /* drop current reply */ -#define RQ_SPLICE_OK (4) /* turned off in gss privacy - * to prevent encrypting page - * cache pages */ -#define RQ_VICTIM (5) /* about to be shut down */ -#define RQ_BUSY (6) /* request is busy */ -#define RQ_DATA (7) /* request has data */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ @@ -265,7 +246,6 @@ struct svc_rqst { /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ - struct svc_cacherep * rq_cacherep; /* cache info */ struct task_struct *rq_task; /* service thread */ struct net *rq_bc_net; /* pointer to backchannel's * net namespace @@ -273,6 +253,19 @@ struct svc_rqst { void ** rq_lease_breaker; /* The v4 client breaking a lease */ }; +/* bits for rq_flags */ +enum { + RQ_SECURE, /* secure port */ + RQ_LOCAL, /* local request */ + RQ_USEDEFERRAL, /* use deferral */ + RQ_DROPME, /* drop current reply */ + RQ_SPLICE_OK, /* turned off in gss privacy to prevent + * encrypting page cache pages */ + RQ_VICTIM, /* about to be shut down */ + RQ_BUSY, /* request is busy */ + RQ_DATA, /* request has data */ +}; + #define SVC_NET(rqst) (rqst->rq_xprt ? 
rqst->rq_xprt->xpt_net : rqst->rq_bc_net) /* @@ -344,7 +337,7 @@ struct svc_program { char * pg_name; /* service name */ char * pg_class; /* class name: services sharing authentication */ struct svc_stat * pg_stats; /* rpc statistics */ - int (*pg_authenticate)(struct svc_rqst *); + enum svc_auth_status (*pg_authenticate)(struct svc_rqst *rqstp); __be32 (*pg_init_request)(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); @@ -427,6 +420,7 @@ int svc_register(const struct svc_serv *, struct net *, const int, void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); +void svc_pool_wake_idle_thread(struct svc_pool *pool); struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv); char * svc_print_addr(struct svc_rqst *, char *, size_t); const char * svc_proc_name(const struct svc_rqst *rqstp); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index a6b12631db21c..fa55d12dc7651 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -56,23 +56,6 @@ struct svc_xprt { struct list_head xpt_list; struct list_head xpt_ready; unsigned long xpt_flags; -#define XPT_BUSY 0 /* enqueued/receiving */ -#define XPT_CONN 1 /* conn pending */ -#define XPT_CLOSE 2 /* dead or dying */ -#define XPT_DATA 3 /* data pending */ -#define XPT_TEMP 4 /* connected transport */ -#define XPT_DEAD 6 /* transport closed */ -#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ -#define XPT_DEFERRED 8 /* deferred request pending */ -#define XPT_OLD 9 /* used for xprt aging mark+sweep */ -#define XPT_LISTENER 10 /* listening endpoint */ -#define XPT_CACHE_AUTH 11 /* cache auth info */ -#define XPT_LOCAL 12 /* connection from loopback interface */ -#define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ -#define XPT_CONG_CTRL 14 /* has congestion control */ -#define XPT_HANDSHAKE 15 /* xprt requests a handshake */ -#define XPT_TLS_SESSION 16 /* transport-layer security established */ -#define XPT_PEER_AUTH 17 /* peer has been authenticated */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ @@ -97,6 +80,27 @@ struct svc_xprt { struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ }; +/* flag bits for xpt_flags */ +enum { + XPT_BUSY, /* enqueued/receiving */ + XPT_CONN, /* conn pending */ + XPT_CLOSE, /* dead or dying */ + XPT_DATA, /* data pending */ + XPT_TEMP, /* connected transport */ + XPT_DEAD, /* transport closed */ + XPT_CHNGBUF, /* need to change snd/rcv buf sizes */ + XPT_DEFERRED, /* deferred request pending */ + XPT_OLD, /* used for xprt aging mark+sweep */ + XPT_LISTENER, /* listening endpoint */ + XPT_CACHE_AUTH, /* cache auth info */ + XPT_LOCAL, /* connection from loopback interface */ + XPT_KILL_TEMP, /* call xpo_kill_temp_xprt before closing */ + XPT_CONG_CTRL, /* has congestion control */ + XPT_HANDSHAKE, /* xprt requests a handshake */ + XPT_TLS_SESSION, /* transport-layer security established */ + XPT_PEER_AUTH, /* peer has been authenticated */ +}; + static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 6d9cc9080aca7..6f90203edbf8d 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -83,6 +83,19 @@ struct auth_domain { struct rcu_head rcu_head; }; +enum svc_auth_status { + SVC_GARBAGE = 1, + SVC_SYSERR, + SVC_VALID, + SVC_NEGATIVE, 
+ SVC_OK, + SVC_DROP, + SVC_CLOSE, + SVC_DENIED, + SVC_PENDING, + SVC_COMPLETE, +}; + /* * Each authentication flavour registers an auth_ops * structure. @@ -98,6 +111,8 @@ struct auth_domain { * is (probably) already in place. Certainly space is * reserved for it. * DROP - simply drop the request. It may have been deferred + * CLOSE - like SVC_DROP, but request is definitely lost. + * If there is a tcp connection, it should be closed. * GARBAGE - rpc garbage_args error * SYSERR - rpc system_err error * DENIED - authp holds reason for denial. @@ -111,14 +126,10 @@ struct auth_domain { * * release() is given a request after the procedure has been run. * It should sign/encrypt the results if needed - * It should return: - * OK - the resbuf is ready to be sent - * DROP - the reply should be quitely dropped - * DENIED - authp holds a reason for MSG_DENIED - * SYSERR - rpc system_err * * domain_release() * This call releases a domain. + * * set_client() * Givens a pending request (struct svc_rqst), finds and assigns * an appropriate 'auth_domain' as the client. @@ -127,44 +138,28 @@ struct auth_ops { char * name; struct module *owner; int flavour; - int (*accept)(struct svc_rqst *rq); - int (*release)(struct svc_rqst *rq); - void (*domain_release)(struct auth_domain *); - int (*set_client)(struct svc_rqst *rq); -}; -#define SVC_GARBAGE 1 -#define SVC_SYSERR 2 -#define SVC_VALID 3 -#define SVC_NEGATIVE 4 -#define SVC_OK 5 -#define SVC_DROP 6 -#define SVC_CLOSE 7 /* Like SVC_DROP, but request is definitely - * lost so if there is a tcp connection, it - * should be closed - */ -#define SVC_DENIED 8 -#define SVC_PENDING 9 -#define SVC_COMPLETE 10 + enum svc_auth_status (*accept)(struct svc_rqst *rqstp); + int (*release)(struct svc_rqst *rqstp); + void (*domain_release)(struct auth_domain *dom); + enum svc_auth_status (*set_client)(struct svc_rqst *rqstp); +}; struct svc_xprt; -extern int svc_authenticate(struct svc_rqst *rqstp); +extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); -extern int svc_set_client(struct svc_rqst *rqstp); +extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); extern void svc_auth_unregister(rpc_authflavor_t flavor); extern struct auth_domain *unix_domain_find(char *name); extern void auth_domain_put(struct auth_domain *item); -extern int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom); extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new); extern struct auth_domain *auth_domain_find(char *name); -extern struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr); -extern int auth_unix_forget_old(struct auth_domain *dom); extern void svcauth_unix_purge(struct net *net); extern void svcauth_unix_info_release(struct svc_xprt *xpt); -extern int svcauth_unix_set_client(struct svc_rqst *rqstp); +extern enum svc_auth_status svcauth_unix_set_client(struct svc_rqst *rqstp); extern int unix_gid_cache_create(struct net *net); extern void unix_gid_cache_destroy(struct net *net); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index a7116048a4d4b..7c78ec6356b92 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -35,8 +35,8 @@ struct svc_sock { /* Total length of the data (not including fragment headers) * received so far in the fragments making up this rpc: */ u32 sk_datalen; - /* 
Number of queued send requests */ - atomic_t sk_sendqlen; + + struct page_frag_cache sk_frag_cache; struct completion sk_handshake_done; @@ -56,8 +56,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk) /* * Function prototypes. */ -void svc_close_net(struct svc_serv *, struct net *); -int svc_recv(struct svc_rqst *, long); +void svc_recv(struct svc_rqst *rqstp); void svc_send(struct svc_rqst *rqstp); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); @@ -66,8 +65,6 @@ int svc_addsock(struct svc_serv *serv, struct net *net, const struct cred *cred); void svc_init_xprt_sock(void); void svc_cleanup_xprt_sock(void); -struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot); -void svc_sock_destroy(struct svc_xprt *); /* * svc_makesock socket characteristics diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index f89ec4b5ea169..5b4fb3c791bc2 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -139,6 +139,8 @@ void xdr_terminate_string(const struct xdr_buf *, const u32); size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); +unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, + const struct xdr_buf *xdr); static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) { @@ -224,6 +226,7 @@ struct xdr_stream { struct kvec *iov; /* pointer to the current kvec */ struct kvec scratch; /* Scratch buffer */ struct page **page_ptr; /* pointer to the current page */ + void *page_kaddr; /* kmapped address of the current page */ unsigned int nwords; /* Remaining decode buffer length */ struct rpc_rqst *rqst; /* For debugging */ @@ -255,6 +258,7 @@ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len); +extern void xdr_finish_decode(struct xdr_stream *xdr); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); @@ -775,9 +779,7 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr, if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; - if (len > SIZE_MAX / sizeof(*p)) - return -EBADMSG; - p = xdr_inline_decode(xdr, len * sizeof(*p)); + p = xdr_inline_decode(xdr, size_mul(len, sizeof(*p))); if (unlikely(!p)) return -EBADMSG; if (array == NULL) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index b52411bcfe4e7..4ecc89301eb74 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -351,6 +351,8 @@ struct xprt_create { struct rpc_xprt_switch *bc_xps; unsigned int flags; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; struct xprt_class { diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h index 2b09574e12430..c1a146f9fc911 100644 --- a/include/trace/events/dlm.h +++ b/include/trace/events/dlm.h @@ -7,6 +7,7 @@ #include #include +#include #include #include "../../../fs/dlm/dlm_internal.h" @@ -585,6 +586,56 @@ TRACE_EVENT(dlm_recv_message, ); +DECLARE_EVENT_CLASS(dlm_plock_template, + + TP_PROTO(const struct dlm_plock_info *info), + + TP_ARGS(info), + + TP_STRUCT__entry( + __field(uint8_t, optype) + __field(uint8_t, ex) + 
__field(uint8_t, wait) + __field(uint8_t, flags) + __field(uint32_t, pid) + __field(int32_t, nodeid) + __field(int32_t, rv) + __field(uint32_t, fsid) + __field(uint64_t, number) + __field(uint64_t, start) + __field(uint64_t, end) + __field(uint64_t, owner) + ), + + TP_fast_assign( + __entry->optype = info->optype; + __entry->ex = info->ex; + __entry->wait = info->wait; + __entry->flags = info->flags; + __entry->pid = info->pid; + __entry->nodeid = info->nodeid; + __entry->rv = info->rv; + __entry->fsid = info->fsid; + __entry->number = info->number; + __entry->start = info->start; + __entry->end = info->end; + __entry->owner = info->owner; + ), + + TP_printk("fsid=%u number=%llx owner=%llx optype=%d ex=%d wait=%d flags=%x pid=%u nodeid=%d rv=%d start=%llx end=%llx", + __entry->fsid, __entry->number, __entry->owner, + __entry->optype, __entry->ex, __entry->wait, + __entry->flags, __entry->pid, __entry->nodeid, + __entry->rv, __entry->start, __entry->end) + +); + +DEFINE_EVENT(dlm_plock_template, dlm_plock_read, + TP_PROTO(const struct dlm_plock_info *info), TP_ARGS(info)); + +DEFINE_EVENT(dlm_plock_template, dlm_plock_write, + TP_PROTO(const struct dlm_plock_info *info), TP_ARGS(info)); + TRACE_EVENT(dlm_send, TP_PROTO(int nodeid, int ret), diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 43711753616a1..6beb38c1dcb5e 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1706,7 +1706,7 @@ TRACE_DEFINE_ENUM(SVC_DENIED); TRACE_DEFINE_ENUM(SVC_PENDING); TRACE_DEFINE_ENUM(SVC_COMPLETE); -#define svc_show_status(status) \ +#define show_svc_auth_status(status) \ __print_symbolic(status, \ { SVC_GARBAGE, "SVC_GARBAGE" }, \ { SVC_SYSERR, "SVC_SYSERR" }, \ @@ -1743,7 +1743,10 @@ TRACE_DEFINE_ENUM(SVC_COMPLETE); __entry->xid, __get_sockaddr(server), __get_sockaddr(client) TRACE_EVENT_CONDITION(svc_authenticate, - TP_PROTO(const struct svc_rqst *rqst, int auth_res), + TP_PROTO( + const struct svc_rqst *rqst, + enum svc_auth_status auth_res + ), TP_ARGS(rqst, auth_res), @@ -1766,7 +1769,7 @@ TRACE_EVENT_CONDITION(svc_authenticate, TP_printk(SVC_RQST_ENDPOINT_FORMAT " auth_res=%s auth_stat=%s", SVC_RQST_ENDPOINT_VARARGS, - svc_show_status(__entry->svc_status), + show_svc_auth_status(__entry->svc_status), rpc_show_auth_stat(__entry->auth_stat)) ); @@ -1918,25 +1921,42 @@ TRACE_EVENT(svc_stats_latency, __get_str(procedure), __entry->execute) ); +/* + * from include/linux/sunrpc/svc_xprt.h + */ +#define SVC_XPRT_FLAG_LIST \ + svc_xprt_flag(BUSY) \ + svc_xprt_flag(CONN) \ + svc_xprt_flag(CLOSE) \ + svc_xprt_flag(DATA) \ + svc_xprt_flag(TEMP) \ + svc_xprt_flag(DEAD) \ + svc_xprt_flag(CHNGBUF) \ + svc_xprt_flag(DEFERRED) \ + svc_xprt_flag(OLD) \ + svc_xprt_flag(LISTENER) \ + svc_xprt_flag(CACHE_AUTH) \ + svc_xprt_flag(LOCAL) \ + svc_xprt_flag(KILL_TEMP) \ + svc_xprt_flag(CONG_CTRL) \ + svc_xprt_flag(HANDSHAKE) \ + svc_xprt_flag(TLS_SESSION) \ + svc_xprt_flag_end(PEER_AUTH) + +#undef svc_xprt_flag +#undef svc_xprt_flag_end +#define svc_xprt_flag(x) TRACE_DEFINE_ENUM(XPT_##x); +#define svc_xprt_flag_end(x) TRACE_DEFINE_ENUM(XPT_##x); + +SVC_XPRT_FLAG_LIST + +#undef svc_xprt_flag +#undef svc_xprt_flag_end +#define svc_xprt_flag(x) { BIT(XPT_##x), #x }, +#define svc_xprt_flag_end(x) { BIT(XPT_##x), #x } + #define show_svc_xprt_flags(flags) \ - __print_flags(flags, "|", \ - { BIT(XPT_BUSY), "BUSY" }, \ - { BIT(XPT_CONN), "CONN" }, \ - { BIT(XPT_CLOSE), "CLOSE" }, \ - { BIT(XPT_DATA), "DATA" }, \ - { BIT(XPT_TEMP), "TEMP" }, \ - { BIT(XPT_DEAD), "DEAD" }, 
\ - { BIT(XPT_CHNGBUF), "CHNGBUF" }, \ - { BIT(XPT_DEFERRED), "DEFERRED" }, \ - { BIT(XPT_OLD), "OLD" }, \ - { BIT(XPT_LISTENER), "LISTENER" }, \ - { BIT(XPT_CACHE_AUTH), "CACHE_AUTH" }, \ - { BIT(XPT_LOCAL), "LOCAL" }, \ - { BIT(XPT_KILL_TEMP), "KILL_TEMP" }, \ - { BIT(XPT_CONG_CTRL), "CONG_CTRL" }, \ - { BIT(XPT_HANDSHAKE), "HANDSHAKE" }, \ - { BIT(XPT_TLS_SESSION), "TLS_SESSION" }, \ - { BIT(XPT_PEER_AUTH), "PEER_AUTH" }) + __print_flags(flags, "|", SVC_XPRT_FLAG_LIST) TRACE_EVENT(svc_xprt_create_err, TP_PROTO( @@ -1994,25 +2014,25 @@ TRACE_EVENT(svc_xprt_create_err, TRACE_EVENT(svc_xprt_enqueue, TP_PROTO( const struct svc_xprt *xprt, - const struct svc_rqst *rqst + unsigned long flags ), - TP_ARGS(xprt, rqst), + TP_ARGS(xprt, flags), TP_STRUCT__entry( SVC_XPRT_ENDPOINT_FIELDS(xprt) - - __field(int, pid) ), TP_fast_assign( - SVC_XPRT_ENDPOINT_ASSIGNMENTS(xprt); - - __entry->pid = rqst? rqst->rq_task->pid : 0; + __assign_sockaddr(server, &xprt->xpt_local, + xprt->xpt_locallen); + __assign_sockaddr(client, &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = flags; + __entry->netns_ino = xprt->xpt_net->ns.inum; ), - TP_printk(SVC_XPRT_ENDPOINT_FORMAT " pid=%d", - SVC_XPRT_ENDPOINT_VARARGS, __entry->pid) + TP_printk(SVC_XPRT_ENDPOINT_FORMAT, SVC_XPRT_ENDPOINT_VARARGS) ); TRACE_EVENT(svc_xprt_dequeue, diff --git a/include/uapi/linux/dlm_plock.h b/include/uapi/linux/dlm_plock.h index 63b6c1fd91690..eb66afcac40ed 100644 --- a/include/uapi/linux/dlm_plock.h +++ b/include/uapi/linux/dlm_plock.h @@ -22,6 +22,7 @@ enum { DLM_PLOCK_OP_LOCK = 1, DLM_PLOCK_OP_UNLOCK, DLM_PLOCK_OP_GET, + DLM_PLOCK_OP_CANCEL, }; #define DLM_PLOCK_FL_CLOSE 1 diff --git a/net/sunrpc/.kunitconfig b/net/sunrpc/.kunitconfig index a55a00fa649ba..eb02b906c2959 100644 --- a/net/sunrpc/.kunitconfig +++ b/net/sunrpc/.kunitconfig @@ -23,7 +23,6 @@ CONFIG_NFS_FS=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y CONFIG_RPCSEC_GSS_KRB5=y -CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2=y diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 4afc5fd71d44f..2d8b67dac7b5b 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -34,38 +34,6 @@ config RPCSEC_GSS_KRB5 If unsure, say Y. -config RPCSEC_GSS_KRB5_SIMPLIFIED - bool - depends on RPCSEC_GSS_KRB5 - -config RPCSEC_GSS_KRB5_CRYPTOSYSTEM - bool - depends on RPCSEC_GSS_KRB5 - -config RPCSEC_GSS_KRB5_ENCTYPES_DES - bool "Enable Kerberos enctypes based on DES (deprecated)" - depends on RPCSEC_GSS_KRB5 - depends on CRYPTO_CBC && CRYPTO_CTS && CRYPTO_ECB - depends on CRYPTO_HMAC && CRYPTO_MD5 && CRYPTO_SHA1 - depends on CRYPTO_DES - default n - select RPCSEC_GSS_KRB5_SIMPLIFIED - help - Choose Y to enable the use of deprecated Kerberos 5 - encryption types that utilize Data Encryption Standard - (DES) based ciphers. These include des-cbc-md5, - des-cbc-crc, and des-cbc-md4, which were deprecated by - RFC 6649, and des3-cbc-sha1, which was deprecated by RFC - 8429. - - These encryption types are known to be insecure, therefore - the default setting of this option is N. Support for these - encryption types is available only for compatibility with - legacy NFS client and server implementations. - - Removal of support is planned for a subsequent kernel - release. 
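
The SVC_XPRT_FLAG_LIST conversion shown earlier replaces a hand-maintained __print_flags() table with a single X-macro list that expands once into TRACE_DEFINE_ENUM() declarations and once into bit/name pairs, so the two can no longer drift apart when a flag is added. A standalone sketch of the same pattern outside the tracing framework, with purely illustrative names:

	#include <stdio.h>

	/* One authoritative list; entries are emitted via flag()/flag_end(). */
	#define FLAG_LIST  \
		flag(BUSY) \
		flag(CONN) \
		flag_end(CLOSE)

	/* First expansion: sequential bit numbers, like the xpt_flags enum. */
	#define flag(x)     FL_##x,
	#define flag_end(x) FL_##x
	enum { FLAG_LIST };
	#undef flag
	#undef flag_end

	/* Second expansion: printable bit/name pairs, like
	 * show_svc_xprt_flags() above. */
	#define flag(x)     { 1UL << FL_##x, #x },
	#define flag_end(x) { 1UL << FL_##x, #x }
	static const struct { unsigned long mask; const char *name; } flag_names[] = {
		FLAG_LIST
	};
	#undef flag
	#undef flag_end

	int main(void)
	{
		unsigned long flags = (1UL << FL_BUSY) | (1UL << FL_CLOSE);
		size_t i;

		for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++)
			if (flags & flag_names[i].mask)
				printf("%s|", flag_names[i].name);
		putchar('\n');	/* prints "BUSY|CLOSE|" */
		return 0;
	}
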
- config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1 bool "Enable Kerberos enctypes based on AES and SHA-1" depends on RPCSEC_GSS_KRB5 @@ -73,7 +41,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1 depends on CRYPTO_HMAC && CRYPTO_SHA1 depends on CRYPTO_AES default y - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Advanced Encryption Standard (AES) ciphers and @@ -86,7 +53,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA depends on CRYPTO_CBC && CRYPTO_CTS && CRYPTO_CAMELLIA depends on CRYPTO_CMAC default n - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Camellia ciphers (RFC 3713) and CMAC digests @@ -100,7 +66,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2 depends on CRYPTO_HMAC && CRYPTO_SHA256 && CRYPTO_SHA512 depends on CRYPTO_AES default n - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Advanced Encryption Standard (AES) ciphers and diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index 012ae17206894..ad1736d93b763 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -12,6 +12,6 @@ auth_rpcgss-y := auth_gss.o gss_generic_token.o \ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o obj-$(CONFIG_RPCSEC_GSS_KRB5_KUNIT_TEST) += gss_krb5_test.o diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h index b673e2626acb2..3afd4065bf3d0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_internal.h +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -33,7 +33,6 @@ struct gss_krb5_enctype { const u32 Ke_length; /* encryption subkey length, in octets */ const u32 Ki_length; /* integrity subkey length, in octets */ - int (*import_ctx)(struct krb5_ctx *ctx, gfp_t gfp_mask); int (*derive_key)(const struct gss_krb5_enctype *gk5e, const struct xdr_netobj *in, struct xdr_netobj *out, @@ -85,24 +84,15 @@ struct krb5_ctx { * GSS Kerberos 5 mechanism Per-Message calls. 
*/ -u32 gss_krb5_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token); u32 gss_krb5_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token); -u32 gss_krb5_verify_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, - struct xdr_netobj *read_token); u32 gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, struct xdr_netobj *read_token); -u32 gss_krb5_wrap_v1(struct krb5_ctx *kctx, int offset, - struct xdr_buf *buf, struct page **pages); u32 gss_krb5_wrap_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf, struct page **pages); -u32 gss_krb5_unwrap_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align); u32 gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, struct xdr_buf *buf, unsigned int *slack, unsigned int *align); @@ -113,12 +103,6 @@ u32 gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, /* Key Derivation Functions */ -int krb5_derive_key_v1(const struct gss_krb5_enctype *gk5e, - const struct xdr_netobj *inkey, - struct xdr_netobj *outkey, - const struct xdr_netobj *label, - gfp_t gfp_mask); - int krb5_derive_key_v2(const struct gss_krb5_enctype *gk5e, const struct xdr_netobj *inkey, struct xdr_netobj *outkey, @@ -169,13 +153,6 @@ static inline int krb5_derive_key(struct krb5_ctx *kctx, return gk5e->derive_key(gk5e, inkey, outkey, &label, gfp_mask); } -s32 krb5_make_seq_num(struct krb5_ctx *kctx, struct crypto_sync_skcipher *key, - int direction, u32 seqnum, unsigned char *cksum, - unsigned char *buf); - -s32 krb5_get_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, - unsigned char *buf, int *direction, u32 *seqnum); - void krb5_make_confounder(u8 *p, int conflen); u32 make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 5347fe1cc93f8..06d8ee0db000f 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -222,90 +222,6 @@ static int krb5_DK(const struct gss_krb5_enctype *gk5e, return ret; } -#define smask(step) ((1<>step)&smask(step))) -#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) - -static void mit_des_fixup_key_parity(u8 key[8]) -{ - int i; - for (i = 0; i < 8; i++) { - key[i] &= 0xfe; - key[i] |= 1^parity_char(key[i]); - } -} - -static int krb5_random_to_key_v1(const struct gss_krb5_enctype *gk5e, - struct xdr_netobj *randombits, - struct xdr_netobj *key) -{ - int i, ret = -EINVAL; - - if (key->len != 24) { - dprintk("%s: key->len is %d\n", __func__, key->len); - goto err_out; - } - if (randombits->len != 21) { - dprintk("%s: randombits->len is %d\n", - __func__, randombits->len); - goto err_out; - } - - /* take the seven bytes, move them around into the top 7 bits of the - 8 key bytes, then compute the parity bits. Do this three times. 
*/ - - for (i = 0; i < 3; i++) { - memcpy(key->data + i*8, randombits->data + i*7, 7); - key->data[i*8+7] = (((key->data[i*8]&1)<<1) | - ((key->data[i*8+1]&1)<<2) | - ((key->data[i*8+2]&1)<<3) | - ((key->data[i*8+3]&1)<<4) | - ((key->data[i*8+4]&1)<<5) | - ((key->data[i*8+5]&1)<<6) | - ((key->data[i*8+6]&1)<<7)); - - mit_des_fixup_key_parity(key->data + i*8); - } - ret = 0; -err_out: - return ret; -} - -/** - * krb5_derive_key_v1 - Derive a subkey for an RFC 3961 enctype - * @gk5e: Kerberos 5 enctype profile - * @inkey: base protocol key - * @outkey: OUT: derived key - * @label: subkey usage label - * @gfp_mask: memory allocation control flags - * - * Caller sets @outkey->len to the desired length of the derived key. - * - * On success, returns 0 and fills in @outkey. A negative errno value - * is returned on failure. - */ -int krb5_derive_key_v1(const struct gss_krb5_enctype *gk5e, - const struct xdr_netobj *inkey, - struct xdr_netobj *outkey, - const struct xdr_netobj *label, - gfp_t gfp_mask) -{ - struct xdr_netobj inblock; - int ret; - - inblock.len = gk5e->keybytes; - inblock.data = kmalloc(inblock.len, gfp_mask); - if (!inblock.data) - return -ENOMEM; - - ret = krb5_DK(gk5e, inkey, inblock.data, label, gfp_mask); - if (!ret) - ret = krb5_random_to_key_v1(gk5e, &inblock, outkey); - - kfree_sensitive(inblock.data); - return ret; -} - /* * This is the identity function, with some sanity checking. */ diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 20e21d08badb3..e31cfdf7eadcb 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -30,61 +30,7 @@ static struct gss_api_mech gss_kerberos_mech; -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -static int gss_krb5_import_ctx_des(struct krb5_ctx *ctx, gfp_t gfp_mask); -static int gss_krb5_import_ctx_v1(struct krb5_ctx *ctx, gfp_t gfp_mask); -#endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_CRYPTOSYSTEM) -static int gss_krb5_import_ctx_v2(struct krb5_ctx *ctx, gfp_t gfp_mask); -#endif - static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { -#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES) - /* - * DES (All DES enctypes are mapped to the same gss functionality) - */ - { - .etype = ENCTYPE_DES_CBC_RAW, - .ctype = CKSUMTYPE_RSA_MD5, - .name = "des-cbc-crc", - .encrypt_name = "cbc(des)", - .cksum_name = "md5", - .import_ctx = gss_krb5_import_ctx_des, - .get_mic = gss_krb5_get_mic_v1, - .verify_mic = gss_krb5_verify_mic_v1, - .wrap = gss_krb5_wrap_v1, - .unwrap = gss_krb5_unwrap_v1, - .signalg = SGN_ALG_DES_MAC_MD5, - .sealalg = SEAL_ALG_DES, - .keybytes = 7, - .keylength = 8, - .cksumlength = 8, - .keyed_cksum = 0, - }, - /* - * 3DES - */ - { - .etype = ENCTYPE_DES3_CBC_RAW, - .ctype = CKSUMTYPE_HMAC_SHA1_DES3, - .name = "des3-hmac-sha1", - .encrypt_name = "cbc(des3_ede)", - .cksum_name = "hmac(sha1)", - .import_ctx = gss_krb5_import_ctx_v1, - .derive_key = krb5_derive_key_v1, - .get_mic = gss_krb5_get_mic_v1, - .verify_mic = gss_krb5_verify_mic_v1, - .wrap = gss_krb5_wrap_v1, - .unwrap = gss_krb5_unwrap_v1, - .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, - .sealalg = SEAL_ALG_DES3KD, - .keybytes = 21, - .keylength = 24, - .cksumlength = 20, - .keyed_cksum = 1, - }, -#endif - #if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1) /* * AES-128 with SHA-1 (RFC 3962) @@ -96,7 +42,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .encrypt_name = "cts(cbc(aes))", .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .import_ctx = 
gss_krb5_import_ctx_v2, .derive_key = krb5_derive_key_v2, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -126,7 +71,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .encrypt_name = "cts(cbc(aes))", .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_derive_key_v2, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -166,7 +110,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(128), .Ki_length = BITS2OCTETS(128), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_feedback_cmac, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -193,7 +136,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(256), .Ki_length = BITS2OCTETS(256), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_feedback_cmac, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -223,7 +165,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(128), .Ki_length = BITS2OCTETS(128), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_hmac_sha2, .encrypt = krb5_etm_encrypt, .decrypt = krb5_etm_decrypt, @@ -250,7 +191,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(256), .Ki_length = BITS2OCTETS(192), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_hmac_sha2, .encrypt = krb5_etm_encrypt, .decrypt = krb5_etm_decrypt, @@ -283,12 +223,6 @@ static void gss_krb5_prepare_enctype_priority_list(void) #if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1) ENCTYPE_AES256_CTS_HMAC_SHA1_96, ENCTYPE_AES128_CTS_HMAC_SHA1_96, -#endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES) - ENCTYPE_DES3_CBC_SHA1, - ENCTYPE_DES_CBC_MD5, - ENCTYPE_DES_CBC_CRC, - ENCTYPE_DES_CBC_MD4, #endif }; size_t total, i; @@ -329,185 +263,6 @@ const struct gss_krb5_enctype *gss_krb5_lookup_enctype(u32 etype) } EXPORT_SYMBOL_IF_KUNIT(gss_krb5_lookup_enctype); -static struct crypto_sync_skcipher * -gss_krb5_alloc_cipher_v1(struct krb5_ctx *ctx, struct xdr_netobj *key) -{ - struct crypto_sync_skcipher *tfm; - - tfm = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(tfm)) - return NULL; - if (crypto_sync_skcipher_setkey(tfm, key->data, key->len)) { - crypto_free_sync_skcipher(tfm); - return NULL; - } - return tfm; -} - -static inline const void * -get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) -{ - struct crypto_sync_skcipher *tfm; - struct xdr_netobj key; - int alg; - - p = simple_get_bytes(p, end, &alg, sizeof(alg)); - if (IS_ERR(p)) - goto out_err; - switch (alg) { - case ENCTYPE_DES_CBC_CRC: - case ENCTYPE_DES_CBC_MD4: - case ENCTYPE_DES_CBC_MD5: - /* Map all these key types to ENCTYPE_DES_CBC_RAW */ - alg = ENCTYPE_DES_CBC_RAW; - break; - } - if (!gss_krb5_lookup_enctype(alg)) { - pr_warn("gss_krb5: unsupported enctype: %d\n", alg); - goto out_err_inval; - } - - p = simple_get_netobj(p, end, &key); - if (IS_ERR(p)) - goto out_err; - tfm = gss_krb5_alloc_cipher_v1(ctx, &key); - kfree(key.data); - if (!tfm) { - pr_warn("gss_krb5: failed to initialize cipher '%s'\n", - ctx->gk5e->encrypt_name); - goto out_err_inval; - } - *res = tfm; - - return p; - -out_err_inval: - p = ERR_PTR(-EINVAL); -out_err: - return p; -} - -static int -gss_import_v1_context(const void *p, const void *end, struct 
krb5_ctx *ctx) -{ - u32 seq_send; - int tmp; - u32 time32; - - p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); - if (IS_ERR(p)) - goto out_err; - - /* Old format supports only DES! Any other enctype uses new format */ - ctx->enctype = ENCTYPE_DES_CBC_RAW; - - ctx->gk5e = gss_krb5_lookup_enctype(ctx->enctype); - if (ctx->gk5e == NULL) { - p = ERR_PTR(-EINVAL); - goto out_err; - } - - /* The downcall format was designed before we completely understood - * the uses of the context fields; so it includes some stuff we - * just give some minimal sanity-checking, and some we ignore - * completely (like the next twenty bytes): */ - if (unlikely(p + 20 > end || p + 20 < p)) { - p = ERR_PTR(-EFAULT); - goto out_err; - } - p += 20; - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SGN_ALG_DES_MAC_MD5) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SEAL_ALG_DES) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &time32, sizeof(time32)); - if (IS_ERR(p)) - goto out_err; - /* unsigned 32-bit time overflows in year 2106 */ - ctx->endtime = (time64_t)time32; - p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); - if (IS_ERR(p)) - goto out_err; - atomic_set(&ctx->seq_send, seq_send); - p = simple_get_netobj(p, end, &ctx->mech_used); - if (IS_ERR(p)) - goto out_err; - p = get_key(p, end, ctx, &ctx->enc); - if (IS_ERR(p)) - goto out_err_free_mech; - p = get_key(p, end, ctx, &ctx->seq); - if (IS_ERR(p)) - goto out_err_free_key1; - if (p != end) { - p = ERR_PTR(-EFAULT); - goto out_err_free_key2; - } - - return 0; - -out_err_free_key2: - crypto_free_sync_skcipher(ctx->seq); -out_err_free_key1: - crypto_free_sync_skcipher(ctx->enc); -out_err_free_mech: - kfree(ctx->mech_used.data); -out_err: - return PTR_ERR(p); -} - -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -static int -gss_krb5_import_ctx_des(struct krb5_ctx *ctx, gfp_t gfp_mask) -{ - return -EINVAL; -} - -static int -gss_krb5_import_ctx_v1(struct krb5_ctx *ctx, gfp_t gfp_mask) -{ - struct xdr_netobj keyin, keyout; - - keyin.data = ctx->Ksess; - keyin.len = ctx->gk5e->keylength; - - ctx->seq = gss_krb5_alloc_cipher_v1(ctx, &keyin); - if (ctx->seq == NULL) - goto out_err; - ctx->enc = gss_krb5_alloc_cipher_v1(ctx, &keyin); - if (ctx->enc == NULL) - goto out_free_seq; - - /* derive cksum */ - keyout.data = ctx->cksum; - keyout.len = ctx->gk5e->keylength; - if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_SIGN, - KEY_USAGE_SEED_CHECKSUM, gfp_mask)) - goto out_free_enc; - - return 0; - -out_free_enc: - crypto_free_sync_skcipher(ctx->enc); -out_free_seq: - crypto_free_sync_skcipher(ctx->seq); -out_err: - return -EINVAL; -} -#endif - -#if defined(CONFIG_RPCSEC_GSS_KRB5_CRYPTOSYSTEM) - static struct crypto_sync_skcipher * gss_krb5_alloc_cipher_v2(const char *cname, const struct xdr_netobj *key) { @@ -636,8 +391,6 @@ gss_krb5_import_ctx_v2(struct krb5_ctx *ctx, gfp_t gfp_mask) goto out; } -#endif - static int gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, gfp_t gfp_mask) @@ -671,9 +424,6 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); if (IS_ERR(p)) goto out_err; - /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ - if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) - ctx->enctype = ENCTYPE_DES3_CBC_RAW; ctx->gk5e = 
gss_krb5_lookup_enctype(ctx->enctype); if (ctx->gk5e == NULL) { dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", @@ -700,7 +450,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, } ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; - return ctx->gk5e->import_ctx(ctx, gfp_mask); + return gss_krb5_import_ctx_v2(ctx, gfp_mask); out_err: return PTR_ERR(p); @@ -718,10 +468,7 @@ gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id, if (ctx == NULL) return -ENOMEM; - if (len == 85) - ret = gss_import_v1_context(p, end, ctx); - else - ret = gss_import_v2_context(p, end, ctx, gfp_mask); + ret = gss_import_v2_context(p, end, ctx, gfp_mask); memzero_explicit(&ctx->Ksess, sizeof(ctx->Ksess)); if (ret) { kfree(ctx); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 146aa755f07df..ce540df9bce46 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,75 +71,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) - -static void * -setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) -{ - u16 *ptr; - void *krb5_hdr; - int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; - - token->len = g_token_size(&ctx->mech_used, body_size); - - ptr = (u16 *)token->data; - g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); - - /* ptr now at start of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr; - *ptr++ = KG_TOK_MIC_MSG; - /* - * signalg is stored as if it were converted from LE to host endian, even - * though it's an opaque pair of bytes according to the RFC. - */ - *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg); - *ptr++ = SEAL_ALG_NONE; - *ptr = 0xffff; - - return krb5_hdr; -} - -u32 -gss_krb5_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - void *ptr; - time64_t now; - u32 seq_send; - u8 *cksumkey; - - dprintk("RPC: %s\n", __func__); - BUG_ON(ctx == NULL); - - now = ktime_get_real_seconds(); - - ptr = setup_token(ctx, token); - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, - KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&ctx->seq_send); - - if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) - return GSS_S_FAILURE; - - return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -#endif - static void * setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) { diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c deleted file mode 100644 index 1babc3474e102..0000000000000 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * linux/net/sunrpc/gss_krb5_seqnum.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. 
- * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. - * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include - -#include "gss_krb5_internal.h" - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -s32 -krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_sync_skcipher *key, - int direction, - u32 seqnum, - unsigned char *cksum, unsigned char *buf) -{ - unsigned char *plain; - s32 code; - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - plain[0] = (unsigned char) (seqnum & 0xff); - plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); - plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); - plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); - - plain[4] = direction; - plain[5] = direction; - plain[6] = direction; - plain[7] = direction; - - code = krb5_encrypt(key, cksum, plain, buf, 8); - kfree(plain); - return code; -} - -s32 -krb5_get_seq_num(struct krb5_ctx *kctx, - unsigned char *cksum, - unsigned char *buf, - int *direction, u32 *seqnum) -{ - s32 code; - unsigned char *plain; - struct crypto_sync_skcipher *key = kctx->seq; - - dprintk("RPC: krb5_get_seq_num:\n"); - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) - goto out; - - if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || - (plain[4] != plain[7])) { - code = (s32)KG_BAD_SEQ; - goto out; - } - - *direction = plain[4]; - - *seqnum = ((plain[0]) | - (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); - -out: - kfree(plain); - return code; -} diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index 95ca783795c5e..85625e3f3814e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -320,208 +320,12 @@ static void rfc3961_nfold_case(struct kunit *test) "result mismatch"); } -/* - * RFC 3961 Appendix A.3. DES3 DR and DK - * - * These tests show the derived-random and derived-key values for the - * des3-hmac-sha1-kd encryption scheme, using the DR and DK functions - * defined in section 6.3.1. The input keys were randomly generated; - * the usage values are from this specification. - * - * This test material is copyright (C) The Internet Society (2005). 
- */ - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_155, - 0x00, 0x00, 0x00, 0x01, 0x55 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_1aa, - 0x00, 0x00, 0x00, 0x01, 0xaa -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_kerberos, - 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test1_base_key, - 0xdc, 0xe0, 0x6b, 0x1f, 0x64, 0xc8, 0x57, 0xa1, - 0x1c, 0x3d, 0xb5, 0x7c, 0x51, 0x89, 0x9b, 0x2c, - 0xc1, 0x79, 0x10, 0x08, 0xce, 0x97, 0x3b, 0x92 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test1_derived_key, - 0x92, 0x51, 0x79, 0xd0, 0x45, 0x91, 0xa7, 0x9b, - 0x5d, 0x31, 0x92, 0xc4, 0xa7, 0xe9, 0xc2, 0x89, - 0xb0, 0x49, 0xc7, 0x1f, 0x6e, 0xe6, 0x04, 0xcd -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test2_base_key, - 0x5e, 0x13, 0xd3, 0x1c, 0x70, 0xef, 0x76, 0x57, - 0x46, 0x57, 0x85, 0x31, 0xcb, 0x51, 0xc1, 0x5b, - 0xf1, 0x1c, 0xa8, 0x2c, 0x97, 0xce, 0xe9, 0xf2 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test2_derived_key, - 0x9e, 0x58, 0xe5, 0xa1, 0x46, 0xd9, 0x94, 0x2a, - 0x10, 0x1c, 0x46, 0x98, 0x45, 0xd6, 0x7a, 0x20, - 0xe3, 0xc4, 0x25, 0x9e, 0xd9, 0x13, 0xf2, 0x07 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test3_base_key, - 0x98, 0xe6, 0xfd, 0x8a, 0x04, 0xa4, 0xb6, 0x85, - 0x9b, 0x75, 0xa1, 0x76, 0x54, 0x0b, 0x97, 0x52, - 0xba, 0xd3, 0xec, 0xd6, 0x10, 0xa2, 0x52, 0xbc -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test3_derived_key, - 0x13, 0xfe, 0xf8, 0x0d, 0x76, 0x3e, 0x94, 0xec, - 0x6d, 0x13, 0xfd, 0x2c, 0xa1, 0xd0, 0x85, 0x07, - 0x02, 0x49, 0xda, 0xd3, 0x98, 0x08, 0xea, 0xbf -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test4_base_key, - 0x62, 0x2a, 0xec, 0x25, 0xa2, 0xfe, 0x2c, 0xad, - 0x70, 0x94, 0x68, 0x0b, 0x7c, 0x64, 0x94, 0x02, - 0x80, 0x08, 0x4c, 0x1a, 0x7c, 0xec, 0x92, 0xb5 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test4_derived_key, - 0xf8, 0xdf, 0xbf, 0x04, 0xb0, 0x97, 0xe6, 0xd9, - 0xdc, 0x07, 0x02, 0x68, 0x6b, 0xcb, 0x34, 0x89, - 0xd9, 0x1f, 0xd9, 0xa4, 0x51, 0x6b, 0x70, 0x3e -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test5_base_key, - 0xd3, 0xf8, 0x29, 0x8c, 0xcb, 0x16, 0x64, 0x38, - 0xdc, 0xb9, 0xb9, 0x3e, 0xe5, 0xa7, 0x62, 0x92, - 0x86, 0xa4, 0x91, 0xf8, 0x38, 0xf8, 0x02, 0xfb -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test5_derived_key, - 0x23, 0x70, 0xda, 0x57, 0x5d, 0x2a, 0x3d, 0xa8, - 0x64, 0xce, 0xbf, 0xdc, 0x52, 0x04, 0xd5, 0x6d, - 0xf7, 0x79, 0xa7, 0xdf, 0x43, 0xd9, 0xda, 0x43 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test6_base_key, - 0xc1, 0x08, 0x16, 0x49, 0xad, 0xa7, 0x43, 0x62, - 0xe6, 0xa1, 0x45, 0x9d, 0x01, 0xdf, 0xd3, 0x0d, - 0x67, 0xc2, 0x23, 0x4c, 0x94, 0x07, 0x04, 0xda -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test6_derived_key, - 0x34, 0x80, 0x57, 0xec, 0x98, 0xfd, 0xc4, 0x80, - 0x16, 0x16, 0x1c, 0x2a, 0x4c, 0x7a, 0x94, 0x3e, - 0x92, 0xae, 0x49, 0x2c, 0x98, 0x91, 0x75, 0xf7 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test7_base_key, - 0x5d, 0x15, 0x4a, 0xf2, 0x38, 0xf4, 0x67, 0x13, - 0x15, 0x57, 0x19, 0xd5, 0x5e, 0x2f, 0x1f, 0x79, - 0x0d, 0xd6, 0x61, 0xf2, 0x79, 0xa7, 0x91, 0x7c -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test7_derived_key, - 0xa8, 0x80, 0x8a, 0xc2, 0x67, 0xda, 0xda, 0x3d, - 0xcb, 0xe9, 0xa7, 0xc8, 0x46, 0x26, 0xfb, 0xc7, - 0x61, 0xc2, 0x94, 0xb0, 0x13, 0x15, 0xe5, 0xc1 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test8_base_key, - 0x79, 0x85, 0x62, 0xe0, 0x49, 0x85, 0x2f, 0x57, - 0xdc, 0x8c, 0x34, 0x3b, 0xa1, 0x7f, 0x2c, 0xa1, - 0xd9, 0x73, 0x94, 0xef, 0xc8, 0xad, 0xc4, 0x43 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test8_derived_key, - 0xc8, 0x13, 0xf8, 0x8a, 0x3b, 0xe3, 0xb3, 0x34, - 0xf7, 0x54, 0x25, 0xce, 0x91, 0x75, 0xfb, 0xe3, - 0xc8, 0x49, 0x3b, 0x89, 0xc8, 0x70, 0x3b, 0x49 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test9_base_key, 
- 0x26, 0xdc, 0xe3, 0x34, 0xb5, 0x45, 0x29, 0x2f, - 0x2f, 0xea, 0xb9, 0xa8, 0x70, 0x1a, 0x89, 0xa4, - 0xb9, 0x9e, 0xb9, 0x94, 0x2c, 0xec, 0xd0, 0x16 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test9_derived_key, - 0xf4, 0x8f, 0xfd, 0x6e, 0x83, 0xf8, 0x3e, 0x73, - 0x54, 0xe6, 0x94, 0xfd, 0x25, 0x2c, 0xf8, 0x3b, - 0xfe, 0x58, 0xf7, 0xd5, 0xba, 0x37, 0xec, 0x5d -); - -static const struct gss_krb5_test_param rfc3961_kdf_test_params[] = { - { - .desc = "des3-hmac-sha1 key derivation case 1", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test1_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test1_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 2", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test2_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test2_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 3", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test3_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test3_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 4", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test4_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test4_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 5", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test5_base_key, - .usage = &des3_dk_usage_kerberos, - .expected_result = &des3_dk_test5_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 6", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test6_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test6_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 7", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test7_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test7_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 8", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test8_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test8_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 9", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test9_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test9_derived_key, - }, -}; - -/* Creates the function rfc3961_kdf_gen_params */ -KUNIT_ARRAY_PARAM(rfc3961_kdf, rfc3961_kdf_test_params, gss_krb5_get_desc); - static struct kunit_case rfc3961_test_cases[] = { { .name = "RFC 3961 n-fold", .run_case = rfc3961_nfold_case, .generate_params = rfc3961_nfold_gen_params, }, - { - .name = "RFC 3961 key derivation", - .run_case = kdf_case, - .generate_params = rfc3961_kdf_gen_params, - }, {} }; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 7d6d4ae4a3c96..4fbc50a0a2c4b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -69,83 +69,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif - -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -/* read_token is a mic token, and message_buffer is the data that the mic was - * supposedly taken over. 
*/ -u32 -gss_krb5_verify_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, - struct xdr_netobj *read_token) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - s32 now; - int direction; - u32 seqnum; - unsigned char *ptr = (unsigned char *)read_token->data; - int bodysize; - u8 *cksumkey; - - dprintk("RPC: krb5_read_token\n"); - - if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, - read_token->len)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? */ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != ctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != SEAL_ALG_NONE) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, message_buffer, 0, - cksumkey, KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - ctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > ctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, - &direction, &seqnum)) - return GSS_S_FAILURE; - - if ((ctx->initiate && direction != 0xff) || - (!ctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - return GSS_S_COMPLETE; -} -#endif - u32 gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, struct xdr_netobj *read_token) diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 6d6b082380b23..b3e1738ff6bfa 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -40,293 +40,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) - -static inline int -gss_krb5_padding(int blocksize, int length) -{ - return blocksize - (length % blocksize); -} - -static inline void -gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) -{ - int padding = gss_krb5_padding(blocksize, buf->len - offset); - char *p; - struct kvec *iov; - - if (buf->page_len || buf->tail[0].iov_len) - iov = &buf->tail[0]; - else - iov = &buf->head[0]; - p = iov->iov_base + iov->iov_len; - iov->iov_len += padding; - buf->len += padding; - memset(p, padding, padding); -} - -static inline int -gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) -{ - u8 *ptr; - u8 pad; - size_t len = buf->len; - - if (len <= buf->head[0].iov_len) { - pad = *(u8 *)(buf->head[0].iov_base + len - 1); - if (pad > buf->head[0].iov_len) - return -EINVAL; - buf->head[0].iov_len -= pad; - goto out; - } else - len -= buf->head[0].iov_len; - if (len <= buf->page_len) { - unsigned int last = (buf->page_base + len - 1) - >>PAGE_SHIFT; - unsigned int offset = (buf->page_base + len - 1) - & (PAGE_SIZE - 1); - ptr = kmap_atomic(buf->pages[last]); - pad = *(ptr + offset); - kunmap_atomic(ptr); - goto out; - } else - len -= buf->page_len; - BUG_ON(len > buf->tail[0].iov_len); - pad = *(u8 *)(buf->tail[0].iov_base + len - 1); -out: - /* XXX: NOTE: we do not adjust the page lengths--they 
represent - * a range of data in the real filesystem page cache, and we need - * to know that range so the xdr code can properly place read data. - * However adjusting the head length, as we do above, is harmless. - * In the case of a request that fits into a single page, the server - * also uses length and head length together to determine the original - * start of the request to copy the request for deferal; so it's - * easier on the server if we adjust head and tail length in tandem. - * It's not really a problem that we don't fool with the page and - * tail lengths, though--at worst badly formed xdr might lead the - * server to attempt to parse the padding. - * XXX: Document all these weird requirements for gss mechanism - * wrap/unwrap functions. */ - if (pad > blocksize) - return -EINVAL; - if (buf->len > pad) - buf->len -= pad; - else - return -EINVAL; - return 0; -} - -/* Assumptions: the head and tail of inbuf are ours to play with. - * The pages, however, may be real pages in the page cache and we replace - * them with scratch pages from **pages before writing to them. */ -/* XXX: obviously the above should be documentation of wrap interface, - * and shouldn't be in this kerberos-specific file. */ - -/* XXX factor out common code with seal/unseal. */ - -u32 -gss_krb5_wrap_v1(struct krb5_ctx *kctx, int offset, - struct xdr_buf *buf, struct page **pages) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - int blocksize = 0, plainlen; - unsigned char *ptr, *msg_start; - time64_t now; - int headlen; - struct page **tmp_pages; - u32 seq_send; - u8 *cksumkey; - u32 conflen = crypto_sync_skcipher_blocksize(kctx->enc); - - dprintk("RPC: %s\n", __func__); - - now = ktime_get_real_seconds(); - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - gss_krb5_add_padding(buf, offset, blocksize); - BUG_ON((buf->len - offset) % blocksize); - plainlen = conflen + buf->len - offset; - - headlen = g_token_size(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - - (buf->len - offset); - - ptr = buf->head[0].iov_base + offset; - /* shift data to make room for header. */ - xdr_extend_head(buf, offset, headlen); - - /* XXX Would be cleverer to encrypt while copying. */ - BUG_ON((buf->len - offset - headlen) % blocksize); - - g_make_token_header(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + - kctx->gk5e->cksumlength + plainlen, &ptr); - - - /* ptr now at header described in rfc 1964, section 1.2.1: */ - ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); - ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - - msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; - - /* - * signalg and sealalg are stored as if they were converted from LE - * to host endian, even though they're opaque pairs of bytes according - * to the RFC. 
- */ - *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); - *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); - ptr[6] = 0xff; - ptr[7] = 0xff; - - krb5_make_confounder(msg_start, conflen); - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - /* XXXJBF: UGH!: */ - tmp_pages = buf->pages; - buf->pages = pages; - if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - buf->pages = tmp_pages; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&kctx->seq_send); - - /* XXX would probably be more efficient to compute checksum - * and encrypt at the same time: */ - if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) - return GSS_S_FAILURE; - - if (gss_encrypt_xdr_buf(kctx->enc, buf, - offset + headlen - conflen, pages)) - return GSS_S_FAILURE; - - return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -u32 -gss_krb5_unwrap_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - time64_t now; - int direction; - s32 seqnum; - unsigned char *ptr; - int bodysize; - void *data_start, *orig_start; - int data_len; - int blocksize; - u32 conflen = crypto_sync_skcipher_blocksize(kctx->enc); - int crypt_offset; - u8 *cksumkey; - unsigned int saved_len = buf->len; - - dprintk("RPC: gss_unwrap_kerberos\n"); - - ptr = (u8 *)buf->head[0].iov_base + offset; - if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, - len - offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? */ - - /* get the sign and seal algorithms */ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != kctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != kctx->gk5e->sealalg) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - /* - * Data starts after token header and checksum. ptr points - * to the beginning of the token header - */ - crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - - (unsigned char *)buf->head[0].iov_base; - - buf->len = len; - if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(kctx, ptr, 8, buf, crypt_offset, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - kctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > kctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, - ptr + 8, &direction, &seqnum)) - return GSS_S_BAD_SIG; - - if ((kctx->initiate && direction != 0xff) || - (!kctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - /* Copy the data back to the right position. XXX: Would probably be - * better to copy and encrypt at the same time. 
*/ - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + - conflen; - orig_start = buf->head[0].iov_base + offset; - data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; - memmove(orig_start, data_start, data_len); - buf->head[0].iov_len -= (data_start - orig_start); - buf->len = len - (data_start - orig_start); - - if (gss_krb5_remove_padding(buf, blocksize)) - return GSS_S_DEFECTIVE_TOKEN; - - /* slack must include room for krb5 padding */ - *slack = XDR_QUADLEN(saved_len - buf->len); - /* The GSS blob always precedes the RPC message payload */ - *align = *slack; - return GSS_S_COMPLETE; -} - -#endif - /* * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need * to do more than that, we shift repeatedly. Kevin Coffman reports diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index c4a566737085c..18734e70c5ddb 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -986,7 +986,7 @@ svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) return -EINVAL; } -static int +static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; @@ -1634,7 +1634,7 @@ svcauth_gss_decode_credbody(struct xdr_stream *xdr, * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ -static int +static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; @@ -1945,9 +1945,6 @@ static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error - * - * XXX: These return values do not match the return values documented - * for the auth_ops ->release method in linux/sunrpc/svcauth.h. 
*/ static int svcauth_gss_release(struct svc_rqst *rqstp) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d7c697af3762f..8d75290f1a31d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -534,6 +534,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) .servername = args->servername, .bc_xprt = args->bc_xprt, .xprtsec = args->xprtsec, + .connect_timeout = args->connect_timeout, + .reconnect_timeout = args->reconnect_timeout, }; char servername[48]; struct rpc_clnt *clnt; @@ -2602,6 +2604,7 @@ call_decode(struct rpc_task *task) case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); + xdr_finish_decode(&xdr); return; case -EAGAIN: task->tk_status = 0; @@ -3069,6 +3072,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, } xprt->resvport = resvport; xprt->reuseport = reuseport; + + if (xprtargs->connect_timeout) + connect_timeout = xprtargs->connect_timeout; + if (xprtargs->reconnect_timeout) + reconnect_timeout = xprtargs->reconnect_timeout; if (xprt->ops->set_connect_timeout != NULL) xprt->ops->set_connect_timeout(xprt, connect_timeout, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 587811a002c98..812fda9d45dd6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -513,9 +513,9 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, INIT_LIST_HEAD(&pool->sp_all_threads); spin_lock_init(&pool->sp_lock); + percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); - percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL); } return serv; @@ -588,9 +588,9 @@ svc_destroy(struct kref *ref) for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; + percpu_counter_destroy(&pool->sp_messages_arrived); percpu_counter_destroy(&pool->sp_sockets_queued); percpu_counter_destroy(&pool->sp_threads_woken); - percpu_counter_destroy(&pool->sp_threads_timedout); } kfree(serv->sv_pools); kfree(serv); @@ -689,23 +689,44 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) return rqstp; } -/* - * Choose a pool in which to create a new thread, for svc_set_num_threads +/** + * svc_pool_wake_idle_thread - Awaken an idle thread in @pool + * @pool: service thread pool + * + * Can be called from soft IRQ or process context. Finding an idle + * service thread and marking it BUSY is atomic with respect to + * other calls to svc_pool_wake_idle_thread(). 
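+ *
+ * A typical caller, sketched here for illustration only (this mirrors
+ * the svc_xprt_enqueue() hunk later in this patch; it is not new code):
+ *
+ *	spin_lock_bh(&pool->sp_lock);
+ *	list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+ *	spin_unlock_bh(&pool->sp_lock);
+ *	svc_pool_wake_idle_thread(pool);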
+ * */ -static inline struct svc_pool * -choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +void svc_pool_wake_idle_thread(struct svc_pool *pool) { - if (pool != NULL) - return pool; + struct svc_rqst *rqstp; + + rcu_read_lock(); + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) + continue; + + WRITE_ONCE(rqstp->rq_qtime, ktime_get()); + wake_up_process(rqstp->rq_task); + rcu_read_unlock(); + percpu_counter_inc(&pool->sp_threads_woken); + trace_svc_wake_up(rqstp->rq_task->pid); + return; + } + rcu_read_unlock(); - return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; + set_bit(SP_CONGESTED, &pool->sp_flags); } -/* - * Choose a thread to kill, for svc_set_num_threads - */ -static inline struct task_struct * -choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +static struct svc_pool * +svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +static struct task_struct * +svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) { unsigned int i; struct task_struct *task = NULL; @@ -713,7 +734,6 @@ choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) if (pool != NULL) { spin_lock_bh(&pool->sp_lock); } else { - /* choose a pool in round-robin fashion */ for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; spin_lock_bh(&pool->sp_lock); @@ -728,21 +748,15 @@ choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) if (!list_empty(&pool->sp_all_threads)) { struct svc_rqst *rqstp; - /* - * Remove from the pool->sp_all_threads list - * so we don't try to kill it again. - */ rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); set_bit(RQ_VICTIM, &rqstp->rq_flags); list_del_rcu(&rqstp->rq_all); task = rqstp->rq_task; } spin_unlock_bh(&pool->sp_lock); - return task; } -/* create new threads */ static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -754,13 +768,12 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) do { nrservs--; - chosen_pool = choose_pool(serv, pool, &state); - + chosen_pool = svc_pool_next(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); + rqstp = svc_prepare_thread(serv, chosen_pool, node); if (IS_ERR(rqstp)) return PTR_ERR(rqstp); - task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { @@ -779,15 +792,6 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } -/* - * Create or destroy enough new threads to make the number - * of threads the given number. If `pool' is non-NULL, applies - * only to threads in that pool, otherwise round-robins between - * all pools. Caller must ensure that mutual exclusion between this and - * server startup or shutdown. 
- */ - -/* destroy old threads */ static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -795,9 +799,8 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) struct task_struct *task; unsigned int state = serv->sv_nrthreads-1; - /* destroy old threads */ do { - task = choose_victim(serv, pool, &state); + task = svc_pool_victim(serv, pool, &state); if (task == NULL) break; rqstp = kthread_data(task); @@ -809,6 +812,23 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } +/** + * svc_set_num_threads - adjust number of threads per RPC service + * @serv: RPC service to adjust + * @pool: Specific pool from which to choose threads, or NULL + * @nrservs: New number of threads for @serv (0 or less means kill all threads) + * + * Create or destroy threads to make the number of threads for @serv the + * given number. If @pool is non-NULL, change only threads in that pool; + * otherwise, round-robin between all pools for @serv. @serv's + * sv_nrthreads is adjusted for each thread created or destroyed. + * + * Caller must ensure mutual exclusion between this and server startup or + * shutdown. + * + * Returns zero on success or a negative errno if an error occurred while + * starting a thread. + */ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -1277,8 +1297,9 @@ svc_process_common(struct svc_rqst *rqstp) const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; - int auth_res, rc; + enum svc_auth_status auth_res; unsigned int aoffset; + int rc; __be32 *p; /* Will be turned off by GSS integrity and privacy services */ @@ -1333,6 +1354,9 @@ svc_process_common(struct svc_rqst *rqstp) goto dropit; case SVC_COMPLETE: goto sendit; + default: + pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); + goto err_system_err; } if (progp == NULL) @@ -1370,6 +1394,8 @@ svc_process_common(struct svc_rqst *rqstp) rc = process.dispatch(rqstp); if (procp->pc_release) procp->pc_release(rqstp); + xdr_finish_decode(xdr); + if (!rc) goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) @@ -1516,7 +1542,6 @@ void svc_process(struct svc_rqst *rqstp) out_drop: svc_drop(rqstp); } -EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 62c7919ea6106..4cfe9640df481 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -434,6 +434,7 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) smp_rmb(); xpt_flags = READ_ONCE(xprt->xpt_flags); + trace_svc_xprt_enqueue(xprt, xpt_flags); if (xpt_flags & BIT(XPT_BUSY)) return false; if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE))) @@ -456,7 +457,6 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) void svc_xprt_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; - struct svc_rqst *rqstp = NULL; if (!svc_xprt_ready(xprt)) return; @@ -476,21 +476,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); spin_unlock_bh(&pool->sp_lock); - /* find a thread for this xprt */ - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - percpu_counter_inc(&pool->sp_threads_woken); - rqstp->rq_qtime = ktime_get(); - wake_up_process(rqstp->rq_task); - goto out_unlock; - } - set_bit(SP_CONGESTED, &pool->sp_flags); - rqstp = NULL; -out_unlock: - 
rcu_read_unlock(); - trace_svc_xprt_enqueue(xprt, rqstp); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -581,7 +567,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) svc_xprt_put(xprt); } -/* +/** + * svc_wake_up - Wake up a service thread for non-transport work + * @serv: RPC service + * * Some svc_serv's will have occasional work to do, even when a xprt is not * waiting to be serviced. This function is there to "kick" a task in one of * those services so that it can wake up and do that work. Note that we only @@ -590,27 +579,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) */ void svc_wake_up(struct svc_serv *serv) { - struct svc_rqst *rqstp; - struct svc_pool *pool; - - pool = &serv->sv_pools[0]; + struct svc_pool *pool = &serv->sv_pools[0]; - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* skip any that aren't queued */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - rcu_read_unlock(); - wake_up_process(rqstp->rq_task); - trace_svc_wake_up(rqstp->rq_task->pid); - return; - } - rcu_read_unlock(); - - /* No free entries available */ set_bit(SP_TASK_PENDING, &pool->sp_flags); - smp_wmb(); - trace_svc_wake_up(0); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_wake_up); @@ -679,7 +651,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static int svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; @@ -701,10 +673,10 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) /* Made progress, don't sleep yet */ continue; - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { + set_current_state(TASK_IDLE); + if (kthread_should_stop()) { set_current_state(TASK_RUNNING); - return -EINTR; + return false; } trace_svc_alloc_arg_err(pages, ret); memalloc_retry_wait(GFP_KERNEL); @@ -723,7 +695,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) arg->tail[0].iov_len = 0; rqstp->rq_xid = xdr_zero; - return 0; + return true; } static bool @@ -732,7 +704,7 @@ rqst_should_sleep(struct svc_rqst *rqstp) struct svc_pool *pool = rqstp->rq_pool; /* did someone call svc_wake_up? */ - if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) + if (test_bit(SP_TASK_PENDING, &pool->sp_flags)) return false; /* was a socket queued? */ @@ -740,7 +712,7 @@ rqst_should_sleep(struct svc_rqst *rqstp) return false; /* are we shutting down? */ - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) return false; /* are we freezing? */ @@ -750,10 +722,9 @@ rqst_should_sleep(struct svc_rqst *rqstp) return true; } -static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp) { struct svc_pool *pool = rqstp->rq_pool; - long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); @@ -762,18 +733,14 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (rqstp->rq_xprt) goto out_found; - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... 
- */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); smp_mb__before_atomic(); clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) - time_left = schedule_timeout(timeout); + schedule(); else __set_current_state(TASK_RUNNING); @@ -781,17 +748,16 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) set_bit(RQ_BUSY, &rqstp->rq_flags); smp_mb__after_atomic(); + clear_bit(SP_TASK_PENDING, &pool->sp_flags); rqstp->rq_xprt = svc_xprt_dequeue(pool); if (rqstp->rq_xprt) goto out_found; - if (!time_left) - percpu_counter_inc(&pool->sp_threads_timedout); - - if (signalled() || kthread_should_stop()) - return ERR_PTR(-EINTR); - return ERR_PTR(-EAGAIN); + if (kthread_should_stop()) + return NULL; + return NULL; out_found: + clear_bit(SP_TASK_PENDING, &pool->sp_flags); /* Normally we will wait up to 5 seconds for any required * cache information to be provided. */ @@ -867,37 +833,35 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) return len; } -/* - * Receive the next request on any transport. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. +/** + * svc_recv - Receive and process the next request on any transport + * @rqstp: an idle RPC service thread + * + * This code is carefully organised not to touch any cachelines in + * the shared svc_serv structure, only cachelines in the local + * svc_pool. */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +void svc_recv(struct svc_rqst *rqstp) { struct svc_xprt *xprt = NULL; struct svc_serv *serv = rqstp->rq_server; - int len, err; + int len; - err = svc_alloc_arg(rqstp); - if (err) + if (!svc_alloc_arg(rqstp)) goto out; try_to_freeze(); cond_resched(); - err = -EINTR; - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) goto out; - xprt = svc_get_next_xprt(rqstp, timeout); - if (IS_ERR(xprt)) { - err = PTR_ERR(xprt); + xprt = svc_get_next_xprt(rqstp); + if (!xprt) goto out; - } len = svc_handle_xprt(rqstp, xprt); /* No data, incomplete (TCP) read, or accept() */ - err = -EAGAIN; if (len <= 0) goto out_release; @@ -909,13 +873,14 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; + percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived); rqstp->rq_stime = ktime_get(); - return len; + svc_process(rqstp); +out: + return; out_release: rqstp->rq_res.len = 0; svc_xprt_release(rqstp); -out: - return err; } EXPORT_SYMBOL_GPL(svc_recv); @@ -1456,12 +1421,11 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) return 0; } - seq_printf(m, "%u %llu %llu %llu %llu\n", - pool->sp_id, - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_threads_woken), - percpu_counter_sum_positive(&pool->sp_threads_timedout)); + seq_printf(m, "%u %llu %llu %llu 0\n", + pool->sp_id, + percpu_counter_sum_positive(&pool->sp_messages_arrived), + percpu_counter_sum_positive(&pool->sp_sockets_queued), + percpu_counter_sum_positive(&pool->sp_threads_woken)); return 0; } diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 67d8245a08afb..aa4429d0b8106 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -60,8 +60,19 @@ svc_put_auth_ops(struct auth_ops *aops) module_put(aops->owner); } -int -svc_authenticate(struct svc_rqst *rqstp) 
+/**
+ * svc_authenticate - Initialize an outgoing credential
+ * @rqstp: RPC execution context
+ *
+ * Return values:
+ *   %SVC_OK: XDR encoding of the result can begin
+ *   %SVC_DENIED: Credential or verifier is not valid
+ *   %SVC_GARBAGE: Failed to decode credential or verifier
+ *   %SVC_COMPLETE: GSS context lifetime event; no further action
+ *   %SVC_DROP: Drop this request; no further action
+ *   %SVC_CLOSE: Like drop, but also close transport connection
+ */
+enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp)
 {
 	struct auth_ops	*aops;
 	u32		flavor;
@@ -89,16 +100,28 @@ svc_authenticate(struct svc_rqst *rqstp)
 }
 EXPORT_SYMBOL_GPL(svc_authenticate);
 
-int svc_set_client(struct svc_rqst *rqstp)
+/**
+ * svc_set_client - Assign an appropriate 'auth_domain' as the client
+ * @rqstp: RPC execution context
+ *
+ * Return values:
+ *   %SVC_OK: Client was found and assigned
+ *   %SVC_DENIED: Client was explicitly denied
+ *   %SVC_DROP: Ignore this request
+ *   %SVC_CLOSE: Ignore this request and close the connection
+ */
+enum svc_auth_status svc_set_client(struct svc_rqst *rqstp)
 {
 	rqstp->rq_client = NULL;
 	return rqstp->rq_authop->set_client(rqstp);
 }
 EXPORT_SYMBOL_GPL(svc_set_client);
 
-/* A request, which was authenticated, has now executed.
- * Time to finalise the credentials and verifier
- * and release and resources
+/**
+ * svc_authorise - Finalize credentials/verifier and release resources
+ * @rqstp: RPC execution context
+ *
+ * Returns zero on success, or a negative errno.
  */
 int svc_authorise(struct svc_rqst *rqstp)
 {
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 174783f804fa5..04b45588ae6fe 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -665,7 +665,7 @@ static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp)
 	}
 }
 
-int
+enum svc_auth_status
 svcauth_unix_set_client(struct svc_rqst *rqstp)
 {
 	struct sockaddr_in *sin;
@@ -736,7 +736,6 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 	rqstp->rq_auth_stat = rpc_auth_ok;
 	return SVC_OK;
 }
-
 EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
 
 /**
@@ -751,7 +750,7 @@ EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
  *
  * rqstp->rq_auth_stat is set as mandated by RFC 5531.
  */
-static int
+static enum svc_auth_status
 svcauth_null_accept(struct svc_rqst *rqstp)
 {
 	struct xdr_stream *xdr = &rqstp->rq_arg_stream;
@@ -828,7 +827,7 @@ struct auth_ops svcauth_null = {
  *
  * rqstp->rq_auth_stat is set as mandated by RFC 5531.
  */
-static int
+static enum svc_auth_status
 svcauth_tls_accept(struct svc_rqst *rqstp)
 {
 	struct xdr_stream *xdr = &rqstp->rq_arg_stream;
@@ -913,7 +912,7 @@ struct auth_ops svcauth_tls = {
 *
 * rqstp->rq_auth_stat is set as mandated by RFC 5531.
*/ -static int +static enum svc_auth_status svcauth_unix_accept(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 8c9a8ee76aa00..998687421fa6a 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -36,6 +36,8 @@ #include #include #include +#include + #include #include #include @@ -695,9 +697,10 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) .msg_name = &rqstp->rq_addr, .msg_namelen = rqstp->rq_addrlen, .msg_control = cmh, + .msg_flags = MSG_SPLICE_PAGES, .msg_controllen = sizeof(buffer), }; - unsigned int sent; + unsigned int count; int err; svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); @@ -710,22 +713,23 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (err < 0) - goto out_unlock; + count = xdr_buf_to_bvec(rqstp->rq_bvec, + ARRAY_SIZE(rqstp->rq_bvec), xdr); - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + count, 0); + err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + count, 0); + err = sock_sendmsg(svsk->sk_sock, &msg); } - xdr_free_bvec(xdr); + trace_svcsock_udp_send(xprt, err); -out_unlock: + mutex_unlock(&xprt->xpt_mutex); - if (err < 0) - return err; - return sent; + return err; out_notconn: mutex_unlock(&xprt->xpt_mutex); @@ -1089,6 +1093,9 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) /* If we have more data, signal svc_xprt_enqueue() to try again */ svsk->sk_tcplen = 0; svsk->sk_marker = xdr_zero; + + smp_wmb(); + tcp_set_rcvlowat(svsk->sk_sk, 1); } /** @@ -1178,10 +1185,17 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) goto err_delete; if (len == want) svc_tcp_fragment_received(svsk); - else + else { + /* Avoid more ->sk_data_ready() calls until the rest + * of the message has arrived. This reduces service + * thread wake-ups on large incoming messages. */ + tcp_set_rcvlowat(svsk->sk_sk, + svc_sock_reclen(svsk) - svsk->sk_tcplen); + trace_svcsock_tcp_recv_short(&svsk->sk_xprt, svc_sock_reclen(svsk), svsk->sk_tcplen - sizeof(rpc_fraghdr)); + } goto err_noclose; error: if (len != -EAGAIN) @@ -1198,75 +1212,51 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) return 0; /* record not complete */ } -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, - int flags) -{ - struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, }; - - iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len); - return sock_sendmsg(sock, &msg); -} - /* * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. * - * In addition, the logic assumes that * .bv_len is never larger - * than PAGE_SIZE. + * Note that the send is non-blocking. The caller has incremented + * the reference count on each page backing the RPC message, and + * the network layer will "put" these pages when transmission is + * complete. + * + * This is safe for our RPC services because the memory backing + * the head and tail components is never kmalloc'd. These always + * come from pages in the svc_rqst::rq_pages array. 
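+ *
+ * For illustration, the resulting rq_bvec layout is (sketch only,
+ * not part of the code change):
+ *
+ *	rq_bvec[0]:        4-byte RPC-over-TCP record marker
+ *	rq_bvec[1..count]: head kvec, page payload, and tail kvec of
+ *	                   rq_res, as laid out by xdr_buf_to_bvec()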
*/ -static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, +static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, rpc_fraghdr marker, unsigned int *sentp) { - const struct kvec *head = xdr->head; - const struct kvec *tail = xdr->tail; - struct kvec rm = { - .iov_base = &marker, - .iov_len = sizeof(marker), - }; struct msghdr msg = { - .msg_flags = 0, + .msg_flags = MSG_SPLICE_PAGES, }; + unsigned int count; + void *buf; int ret; *sentp = 0; - ret = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (ret < 0) - return ret; - - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != rm.iov_len) - return -EAGAIN; - - ret = svc_tcp_send_kvec(sock, head, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != head->iov_len) - goto out; - if (xdr_buf_pagecount(xdr)) - xdr->bvec[0].bv_offset = offset_in_page(xdr->page_base); - - msg.msg_flags = MSG_SPLICE_PAGES; - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec, - xdr_buf_pagecount(xdr), xdr->page_len); - ret = sock_sendmsg(sock, &msg); + /* The stream record marker is copied into a temporary page + * fragment buffer so that it can be included in rq_bvec. + */ + buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &marker, sizeof(marker)); + bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + + count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, + ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + 1 + count, sizeof(marker) + rqstp->rq_res.len); + ret = sock_sendmsg(svsk->sk_sock, &msg); if (ret < 0) return ret; *sentp += ret; - - if (tail->iov_len) { - ret = svc_tcp_send_kvec(sock, tail, 0); - if (ret < 0) - return ret; - *sentp += ret; - } - -out: return 0; } @@ -1292,23 +1282,17 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; - atomic_inc(&svsk->sk_sendqlen); mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - tcp_sock_set_cork(svsk->sk_sk, true); - err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); - xdr_free_bvec(xdr); + err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; - if (atomic_dec_and_test(&svsk->sk_sendqlen)) - tcp_sock_set_cork(svsk->sk_sk, false); mutex_unlock(&xprt->xpt_mutex); return sent; out_notconn: - atomic_dec(&svsk->sk_sendqlen); mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: @@ -1317,7 +1301,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) (err < 0) ? "got error" : "sent", (err < 0) ? 
err : sent, xdr->len);
 	svc_xprt_deferred_close(xprt);
-	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -EAGAIN;
 }
@@ -1644,6 +1627,7 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct page_frag_cache *pfc = &svsk->sk_frag_cache;
 	struct socket *sock = svsk->sk_sock;
 
 	trace_svcsock_free(svsk, sock);
@@ -1653,5 +1637,8 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sockfd_put(sock);
 	else
 		sock_release(sock);
+	if (pfc->va)
+		__page_frag_cache_drain(virt_to_head_page(pfc->va),
+					pfc->pagecnt_bias);
 	kfree(svsk);
 }
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 2a22e78af116e..62e07c330a66f 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -164,6 +164,56 @@ xdr_free_bvec(struct xdr_buf *buf)
 	buf->bvec = NULL;
 }
 
+/**
+ * xdr_buf_to_bvec - Copy components of an xdr_buf into a bio_vec array
+ * @bvec: bio_vec array to populate
+ * @bvec_size: element count of @bvec
+ * @xdr: xdr_buf to be copied
+ *
+ * Returns the number of entries consumed in @bvec.
+ */
+unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size,
+			     const struct xdr_buf *xdr)
+{
+	const struct kvec *head = xdr->head;
+	const struct kvec *tail = xdr->tail;
+	unsigned int count = 0;
+
+	if (head->iov_len) {
+		bvec_set_virt(bvec++, head->iov_base, head->iov_len);
+		++count;
+	}
+
+	if (xdr->page_len) {
+		unsigned int offset, len, remaining;
+		struct page **pages = xdr->pages;
+
+		offset = offset_in_page(xdr->page_base);
+		remaining = xdr->page_len;
+		while (remaining > 0) {
+			len = min_t(unsigned int, remaining,
+				    PAGE_SIZE - offset);
+			bvec_set_page(bvec++, *pages++, len, offset);
+			remaining -= len;
+			offset = 0;
+			if (unlikely(++count > bvec_size))
+				goto bvec_overflow;
+		}
+	}
+
+	if (tail->iov_len) {
+		bvec_set_virt(bvec, tail->iov_base, tail->iov_len);
+		if (unlikely(++count > bvec_size))
+			goto bvec_overflow;
+	}
+
+	return count;
+
+bvec_overflow:
+	pr_warn_once("%s: bio_vec array overflow\n", __func__);
+	return count - 1;
+}
+
 /**
  * xdr_inline_pages - Prepare receive buffer for a large reply
  * @xdr: xdr_buf into which reply will be placed
@@ -1288,6 +1338,14 @@ static unsigned int xdr_set_tail_base(struct xdr_stream *xdr,
 	return xdr_set_iov(xdr, buf->tail, base, len);
 }
 
+static void xdr_stream_unmap_current_page(struct xdr_stream *xdr)
+{
+	if (xdr->page_kaddr) {
+		kunmap_local(xdr->page_kaddr);
+		xdr->page_kaddr = NULL;
+	}
+}
+
 static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
 				      unsigned int base, unsigned int len)
 {
@@ -1305,12 +1363,18 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
 	if (len > maxlen)
 		len = maxlen;
 
+	xdr_stream_unmap_current_page(xdr);
 	xdr_stream_page_set_pos(xdr, base);
 	base += xdr->buf->page_base;
 
 	pgnr = base >> PAGE_SHIFT;
 	xdr->page_ptr = &xdr->buf->pages[pgnr];
-	kaddr = page_address(*xdr->page_ptr);
+
+	if (PageHighMem(*xdr->page_ptr)) {
+		xdr->page_kaddr = kmap_local_page(*xdr->page_ptr);
+		kaddr = xdr->page_kaddr;
+	} else
+		kaddr = page_address(*xdr->page_ptr);
 
 	pgoff = base & ~PAGE_MASK;
 	xdr->p = (__be32*)(kaddr + pgoff);
@@ -1364,6 +1428,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
 		     struct rpc_rqst *rqst)
 {
 	xdr->buf = buf;
+	xdr->page_kaddr = NULL;
 	xdr_reset_scratch_buffer(xdr);
 	xdr->nwords = XDR_QUADLEN(buf->len);
 	if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 &&
@@ -1396,6 +1461,16 @@ void xdr_init_decode_pages(struct xdr_stream *xdr,
struct xdr_buf *buf, } EXPORT_SYMBOL_GPL(xdr_init_decode_pages); +/** + * xdr_finish_decode - Clean up the xdr_stream after decoding data. + * @xdr: pointer to xdr_stream struct + */ +void xdr_finish_decode(struct xdr_stream *xdr) +{ + xdr_stream_unmap_current_page(xdr); +} +EXPORT_SYMBOL(xdr_finish_decode); + static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { unsigned int nwords = XDR_QUADLEN(nbytes); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5e5ff6784ef5f..da409450dfc05 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -593,7 +593,6 @@ void xprt_rdma_cleanup(void); int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *); -int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 268a2cc61acd1..71cd916e384f1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2237,9 +2237,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct net *net = sock_net(sock->sk); + unsigned long connect_timeout; + unsigned long syn_retries; unsigned int keepidle; unsigned int keepcnt; unsigned int timeo; + unsigned long t; spin_lock(&xprt->transport_lock); keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); @@ -2257,6 +2261,35 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); + + /* Connect timeout */ + connect_timeout = max_t(unsigned long, + DIV_ROUND_UP(xprt->connect_timeout, HZ), 1); + syn_retries = max_t(unsigned long, + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries), 1); + for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout; t++) + ; + if (t <= syn_retries) + tcp_sock_set_syncnt(sock->sk, t - 1); +} + +static void xs_tcp_do_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct rpc_timeout to; + unsigned long initval; + + memcpy(&to, xprt->timeout, sizeof(to)); + /* Arbitrary lower limit */ + initval = max_t(unsigned long, connect_timeout, XS_TCP_INIT_REEST_TO); + to.to_initval = initval; + to.to_maxval = initval; + to.to_retries = 0; + memcpy(&transport->tcp_timeout, &to, sizeof(transport->tcp_timeout)); + xprt->timeout = &transport->tcp_timeout; + xprt->connect_timeout = connect_timeout; } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2264,25 +2297,12 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, unsigned long reconnect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct rpc_timeout to; - unsigned long initval; spin_lock(&xprt->transport_lock); if (reconnect_timeout < xprt->max_reconnect_timeout) xprt->max_reconnect_timeout = reconnect_timeout; - if (connect_timeout < xprt->connect_timeout) { - memcpy(&to, xprt->timeout, sizeof(to)); - initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1); - /* Arbitrary lower limit */ - if (initval < XS_TCP_INIT_REEST_TO << 1) - initval = XS_TCP_INIT_REEST_TO << 1; - to.to_initval = initval; - 
to.to_maxval = initval; - memcpy(&transport->tcp_timeout, &to, - sizeof(transport->tcp_timeout)); - xprt->timeout = &transport->tcp_timeout; - xprt->connect_timeout = connect_timeout; - } + if (connect_timeout < xprt->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, connect_timeout); set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); } @@ -3335,8 +3355,13 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + if (args->reconnect_timeout) + xprt->max_reconnect_timeout = args->reconnect_timeout; + xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); + if (args->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, args->connect_timeout); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle);
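
The SYN-retry computation added to xs_tcp_set_socket_timeouts() above works
because TCP retransmits an unanswered SYN with exponential backoff starting
at one second, so an initial SYN plus N retransmits covers roughly
(2^(N+1) - 1) seconds. The loop finds the smallest t with 2^t >= the
requested connect timeout in seconds and requests a SYN count of t - 1,
provided t does not exceed the tcp_syn_retries sysctl. A standalone sketch
of the same arithmetic (hypothetical helper name, illustration only; not
part of the patch):

/* Returns the SYN count to request, or -1 to keep the sysctl
 * default when the timeout cannot be covered within syn_retries.
 */
static int xs_syncnt_sketch(unsigned long connect_timeout_secs,
			    unsigned long syn_retries)
{
	unsigned long t;

	for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout_secs; t++)
		;
	if (t > syn_retries)
		return -1;
	/* Example: 90s -> t = 7 (1 << 7 = 128 >= 90), SYN count 6,
	 * covering 1 + 2 + 4 + ... + 64 = 127 seconds of backoff.
	 */
	return (int)t - 1;
}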