diff --git a/COPYING.SWsoft b/COPYING.SWsoft new file mode 100644 index 0000000..059256d --- /dev/null +++ b/COPYING.SWsoft @@ -0,0 +1,350 @@ + +Nothing in this license should be construed as a grant by SWsoft of any rights +beyond the rights specified in the GNU General Public License, and nothing in +this license should be construed as a waiver by SWsoft of its patent, copyright +and/or trademark rights, beyond the waiver required by the GNU General Public +License. This license is expressly inapplicable to any product that is not +within the scope of the GNU General Public License + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile index 16e3fbb..fcffc7e 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 27 EXTRAVERSION = +VZVERSION = 037test001 NAME = Rotary Wombat # *DOCUMENTATION* @@ -347,7 +348,7 @@ KBUILD_AFLAGS := -D__ASSEMBLY__ KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null) KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) -export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION +export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS @@ -1002,7 +1003,8 @@ define filechk_utsrelease.h echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ exit 1; \ fi; \ - (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";) + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \ + echo \#define VZVERSION \"$(VZVERSION)\";) endef define filechk_version.h diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e9842f6..643f220 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -191,7 +191,7 @@ int __cpuexit __cpu_disable(void) local_flush_tlb_all(); read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (p->mm) cpu_clear(cpu, p->mm->cpu_vm_mask); } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 48e496f..8a2572b 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -612,6 +612,7 @@ source "arch/ia64/kvm/Kconfig" source "lib/Kconfig" +source "kernel/bc/Kconfig" # # Use the generic interrupt handling code in kernel/irq/: # @@ -639,6 +640,8 @@ source "arch/ia64/hp/sim/Kconfig" source "arch/ia64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index 4f0c30c..067cb28 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -17,6 +17,8 @@ #include #include +#include + #include "ia32priv.h" #include "elfcore32.h" @@ -132,6 
+134,12 @@ ia64_elf32_init (struct pt_regs *regs) up_write(¤t->mm->mmap_sem); } + if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, + NULL, UB_SOFT)) + goto skip; + /* * Install LDT as anonymous memory. This gives us all-zero segment descriptors * until a task modifies them via modify_ldt(). @@ -152,7 +160,12 @@ ia64_elf32_init (struct pt_regs *regs) } } up_write(¤t->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); + +skip: ia64_psr(regs)->ac = 0; /* turn off alignment checking */ regs->loadrs = 0; diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index b9ac1a6..9504729 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -20,11 +20,13 @@ #include #include +#include + #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -41,7 +43,7 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -59,7 +61,7 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -87,7 +89,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) struct page *page; void *pg; - pg = quicklist_alloc(0, GFP_KERNEL, NULL); + pg = quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); if (!pg) return NULL; page = virt_to_page(pg); diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index f88fa05..695c23f 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -361,7 +361,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(!get_dumpable(current->mm) || !current->mm->vps_dumpable)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. 
\ diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index d535833..5b5eb9c 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -308,6 +308,16 @@ #define __NR_dup3 1316 #define __NR_pipe2 1317 #define __NR_inotify_init1 1318 +#define __NR_fairsched_vcpus 1499 +#define __NR_fairsched_mknod 1500 +#define __NR_fairsched_rmnod 1501 +#define __NR_fairsched_chwt 1502 +#define __NR_fairsched_mvpr 1503 +#define __NR_fairsched_rate 1504 +#define __NR_getluid 1505 +#define __NR_setluid 1506 +#define __NR_setublimit 1507 +#define __NR_ubstat 1508 #ifdef __KERNEL__ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 0dd6c14..d96ff73 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -510,6 +510,74 @@ GLOBAL_ENTRY(clone) br.ret.sptk.many rp END(clone) +GLOBAL_ENTRY(ia64_ret_from_resume) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + /* + * We need to call schedule_tail() to complete the scheduling process. + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the + * address of the previously executing task. + */ + br.call.sptk.many rp=ia64_invoke_schedule_tail +} + br.call.sptk.many rp=ia64_invoke_resume + ;; + adds sp=256,sp + ;; + /* Return from interrupt, we are all right. */ +(pNonSys) br ia64_leave_kernel + ;; + /* Tricky part follows. We must restore correct syscall + * register frame before doing normal syscall exit job. + * It would the most natural to keep sw->ar_pfs correct, + * then we would be here with correct register frame. + * Unfortunately, IA64 has a feature. Registers were in backstore + * after context switch, and the first br.ret does _NOT_ fetch + * output registers. + * It is quite natural: look, if caller has output regs in his + * frame, they should be consumed. If callee does not have (enough of) + * input/local registers (1 in this case), the situation is unusual. + * Practical evidence: they are filled with something random crap. + * The only case, when this is essential in mainstream kernel + * is sys_clone(). The result is that new process gets some kernel + * information in its register frame. Which is a security problem, btw. + * + * So, we set sw->ar_pfs to pretend the whole frame is of local + * regs. And we have to repartition the frame it manually, using + * information from pt->cr_ifs (the register is invalid in this + * case, but it holds correct pfm). + */ + adds r3=PT(CR_IFS)+16,sp + ;; + ld8 r2=[r3],-(PT(CR_IFS)-PT(R8)) + ;; + extr.u r2=r2,0,37 + mov r8=ar.ec + ;; + extr.u r8=r8,0,5 + ;; + shl r8=r8,52 + ;; + or r2=r2,r8 + ;; + mov ar.pfs=r2 + ;; + movl r2=ia64_leave_syscall + ;; + mov rp=r2 + /* Plus, we should fetch r8 and r10 from pt_regs. Something else? */ + ld8 r8=[r3],PT(R10)-PT(R8) + ;; + ld8 r10=[r3] + ;; + br.ret.sptk.many rp +END(ia64_ret_from_resume) + /* * Invoke a system call, but do some tracing before and after the call. 
* We MUST preserve the current register frame throughout this routine @@ -1264,6 +1332,34 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail) br.ret.sptk.many rp END(ia64_invoke_schedule_tail) +GLOBAL_ENTRY(ia64_invoke_resume) + alloc loc1=ar.pfs,0,3,1,0 + mov loc0=rp + adds out0=16,sp + ;; + ld8 r8=[out0] + ;; + cmp.eq p6,p0=r8,r0 + ;; +(p6) br.cond.sptk 1f + ;; + mov loc2=gp + ;; + ld8 r10=[r8],8 + ;; + ld8 gp=[r8] + ;; + mov b7=r10 + ;; + br.call.sptk.many rp=b7 + ;; + mov gp=loc2 +1: + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(ia64_invoke_resume) + /* * Setup stack and call do_notify_resume_user(), keeping interrupts * disabled. @@ -1698,5 +1794,18 @@ sys_call_table: data8 sys_pipe2 data8 sys_inotify_init1 +.rept 1499-1313 + data8 sys_ni_syscall +.endr + data8 sys_fairsched_vcpus + data8 sys_fairsched_mknod // 1500 + data8 sys_fairsched_rmnod + data8 sys_fairsched_chwt + data8 sys_fairsched_mvpr + data8 sys_fairsched_rate + data8 sys_getluid // 1505 + data8 sys_setluid + data8 sys_setublimit + data8 sys_ubstat .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index c1625c7..634b102 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -90,53 +90,6 @@ ENTRY(fsys_getpid) FSYS_RETURN END(fsys_getpid) -ENTRY(fsys_getppid) - .prologue - .altrp b6 - .body - add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 - ;; - ld8 r17=[r17] // r17 = current->group_leader - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - ;; - - ld4 r9=[r9] - add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent - ;; - and r9=TIF_ALLWORK_MASK,r9 - -1: ld8 r18=[r17] // r18 = current->group_leader->real_parent - ;; - cmp.ne p8,p0=0,r9 - add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid - ;; - - /* - * The .acq is needed to ensure that the read of tgid has returned its data before - * we re-check "real_parent". - */ - ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid -#ifdef CONFIG_SMP - /* - * Re-read current->group_leader->real_parent. - */ - ld8 r19=[r17] // r19 = current->group_leader->real_parent -(p8) br.spnt.many fsys_fallback_syscall - ;; - cmp.ne p6,p0=r18,r19 // did real_parent change? - mov r19=0 // i must not leak kernel bits... -(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check - ;; - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... -#else - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... - mov r19=0 // i must not leak kernel bits... 
-#endif - FSYS_RETURN -END(fsys_getppid) - ENTRY(fsys_set_tid_address) .prologue .altrp b6 @@ -767,7 +720,7 @@ fsyscall_table: data8 0 // chown data8 0 // lseek // 1040 data8 fsys_getpid // getpid - data8 fsys_getppid // getppid + data8 0 // getppid data8 0 // mount data8 0 // umount data8 0 // setuid // 1045 diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 66e491d..4308d48 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1097,7 +1097,7 @@ GLOBAL_ENTRY(start_kernel_thread) mov out1 = r11;; br.call.sptk.many rp = kernel_thread_helper;; mov out0 = r8 - br.call.sptk.many rp = sys_exit;; + br.call.sptk.many rp = do_exit;; 1: br.sptk.few 1b // not reached END(start_kernel_thread) diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 6da1f20..24950d6 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -75,6 +75,8 @@ EXPORT_SYMBOL(xor_ia64_4); EXPORT_SYMBOL(xor_ia64_5); #endif +EXPORT_SYMBOL(empty_zero_page); + #include EXPORT_SYMBOL(ia64_pal_call_phys_stacked); EXPORT_SYMBOL(ia64_pal_call_phys_static); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 7dd96c1..d849ed0 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1608,10 +1608,10 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi } printk("\n\n"); if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { + do_each_thread_all (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); show_stack(t, NULL); - } while_each_thread (g, t); + } while_each_thread_all (g, t); read_unlock(&tasklist_lock); } /* FIXME: This will not restore zapped printk locks. */ diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index fc8f350..057bbb3 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4176,12 +4176,12 @@ pfm_check_task_exist(pfm_context_t *ctx) read_lock(&tasklist_lock); - do_each_thread (g, t) { + do_each_thread_ve (g, t) { if (t->thread.pfm_context == ctx) { ret = 0; goto out; } - } while_each_thread (g, t); + } while_each_thread_ve (g, t); out: read_unlock(&tasklist_lock); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 3ab8373..af4e88a 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -391,6 +392,9 @@ ia64_load_extra (struct task_struct *task) #endif } +extern char ia64_ret_from_resume; +EXPORT_SYMBOL(ia64_ret_from_resume); + /* * Copy the state of an ia-64 thread. 
* @@ -464,7 +468,6 @@ copy_thread (int nr, unsigned long clone_flags, child_ptregs->r12 = user_stack_base + user_stack_size - 16; child_ptregs->ar_bspstore = user_stack_base; child_ptregs->ar_rnat = 0; - child_ptregs->loadrs = 0; } } else { /* @@ -676,16 +679,25 @@ out: return error; } +extern void start_kernel_thread (void); +EXPORT_SYMBOL(start_kernel_thread); + pid_t kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) { - extern void start_kernel_thread (void); unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; struct { struct switch_stack sw; struct pt_regs pt; } regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ regs.pt.r1 = helper_fptr[1]; /* set GP */ diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 2a9943b..e44debf 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -10,6 +10,7 @@ * Derived from the x86 and Alpha versions. */ #include +#include #include #include #include @@ -105,6 +106,8 @@ ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat) # undef GET_BITS } +EXPORT_SYMBOL(ia64_get_scratch_nat_bits); +EXPORT_SYMBOL(__ia64_save_fpu); /* * Set the NaT bits for the scratch registers according to NAT and @@ -461,6 +464,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, *val = ret; return 0; } +EXPORT_SYMBOL(ia64_peek); long ia64_poke (struct task_struct *child, struct switch_stack *child_stack, @@ -525,6 +529,7 @@ ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt, *cfmp = cfm; return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); } +EXPORT_SYMBOL(ia64_get_user_rbs_end); /* * Synchronize (i.e, write) the RSE backing store living in kernel @@ -820,20 +825,20 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt, if (write_access) { nat_bits = *data; scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); - if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { - dprintk("ptrace: failed to set ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + *info->pri_unat_loc = scratch_unat; + else + info->sw->caller_unat = scratch_unat; for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); unw_set_gr(info, regnum, dummy, (nat_bits >> regnum) & 1); } } else { - if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { - dprintk("ptrace: failed to read ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + scratch_unat = *info->pri_unat_loc; + else + scratch_unat = info->sw->caller_unat; nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 19c5a78..cc6c4e6 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -464,6 +465,12 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) if (!user_mode(&scr->pt)) return; + if (try_to_freeze() && !signal_pending(current)) { + if ((long) scr->pt.r10 != -1) + restart = 0; + goto no_signal; + } + if (current_thread_info()->status & TS_RESTORE_SIGMASK) oldset = ¤t->saved_sigmask; else @@ -519,8 +526,10 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) if 
(IS_IA32_PROCESS(&scr->pt)) { scr->pt.r8 = scr->pt.r1; scr->pt.cr_iip -= 2; - } else + } else { ia64_decrement_ip(&scr->pt); + scr->pt.r10 = 0; + } restart = 0; /* don't restart twice if handle_signal() fails... */ } } @@ -542,6 +551,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) } /* Did we come from a system call? */ +no_signal: if (restart) { /* Restart the system call - no handlers present */ if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR @@ -561,6 +571,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) ia64_decrement_ip(&scr->pt); if (errno == ERESTART_RESTARTBLOCK) scr->pt.r15 = __NR_restart_syscall; + scr->pt.r10 = 0; } } } diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index bcbb6d8..40c8320 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -204,7 +204,7 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un /* Careful about overflows.. */ len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { + if (len > TASK_SIZE) { addr = -EINVAL; goto out; } diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 65c10a4..9f0cdde 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -41,6 +41,8 @@ struct fsyscall_gtod_data_t fsyscall_gtod_data = { struct itc_jitter_data_t itc_jitter_data; volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); #ifdef CONFIG_IA64_DEBUG_IRQ @@ -358,6 +360,8 @@ ia64_init_itm (void) /* avoid softlock up message when cpu is unplug and plugged again. */ touch_softlockup_watchdog(); + cpu_khz = local_cpu_data->proc_freq / 1000; + /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index ff0e7c1..7288a9f 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -1291,7 +1291,7 @@ within_logging_rate_limit (void) { static unsigned long count, last_time; - if (time_after(jiffies, last_time + 5 * HZ)) + if (time_after(jiffies, last_time + 60 * HZ)) count = 0; if (count < 5) { last_time = jiffies; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 23088be..da13815 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -148,7 +148,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if ((vma->vm_flags & mask) != mask) goto bad_area; - survive: /* * If for any reason at all we couldn't handle the fault, make * sure we exit gracefully rather than endlessly redo the @@ -276,13 +275,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; + if (user_mode(regs)) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. 
+ */ + force_sig(SIGKILL, current); + return; } - printk(KERN_CRIT "VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); goto no_context; } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 200100e..226b5cc 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -37,6 +37,8 @@ #include #include +#include + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); extern void ia64_tlb_init (void); @@ -111,6 +113,10 @@ ia64_init_addr_space (void) ia64_set_rbs_bot(); + if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, + NULL, UB_SOFT)) + goto skip; + /* * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore * the problem. When the process attempts to write to the register backing store @@ -127,11 +133,16 @@ ia64_init_addr_space (void) if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); kmem_cache_free(vm_area_cachep, vma); + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); return; } up_write(¤t->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); +skip: /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 587da5e..a9d6b81 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -833,8 +833,12 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig" source "lib/Kconfig" +source "kernel/bc/Kconfig" + source "arch/powerpc/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" config KEYS_COMPAT diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index 812a1d8..c0f7a7f 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -26,7 +26,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], + GFP_KERNEL_UBC | __GFP_SOFT_UBC); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -42,7 +43,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], - GFP_KERNEL|__GFP_REPEAT); + GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -88,10 +89,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } +static inline pte_t *do_pte_alloc(gfp_t flags) +{ + return (pte_t *)__get_free_page(flags); +} + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + return do_pte_alloc(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, @@ -100,7 +106,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; pte_t *pte; - pte = pte_alloc_one_kernel(mm, address); + pte = do_pte_alloc(GFP_KERNEL_UBC | __GFP_REPEAT | __GFP_ZERO); if (!pte) return NULL; page = virt_to_page(pte); diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index f6cc7a4..34fc004 100644 --- a/arch/powerpc/include/asm/systbl.h 
+++ b/arch/powerpc/include/asm/systbl.h @@ -322,3 +322,19 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) +SYS_SKIP(319, 400) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(fairsched_mknod) /* 400 */ +SYSCALL(fairsched_rmnod) +SYSCALL(fairsched_chwt) +SYSCALL(fairsched_mvpr) +SYSCALL(fairsched_rate) +SYSCALL(fairsched_vcpus) +SYS_SKIP(406, 410) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(getluid) /* 410 */ +SYSCALL(setluid) +SYSCALL(setublimit) +SYSCALL(ubstat) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index e07d0c7..3fea592 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -342,9 +342,14 @@ #define __NR_pipe2 317 #define __NR_inotify_init1 318 +#define __NR_getluid 410 +#define __NR_setluid 411 +#define __NR_setublimit 412 +#define __NR_ubstat 413 + #ifdef __KERNEL__ -#define __NR_syscalls 319 +#define __NR_syscalls 414 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 7a6dfbc..28c26b4 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -835,7 +835,7 @@ _GLOBAL(abs) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 4dd70cf..2e10116 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -415,7 +415,7 @@ _GLOBAL(scom970_write) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) std r29,-24(r1) std r30,-16(r1) stdu r1,-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 957bded..ca7410c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -50,6 +50,8 @@ #include #include +#include + extern unsigned long _get_SP(void); #ifndef CONFIG_SMP @@ -501,8 +503,9 @@ void show_regs(struct pt_regs * regs) printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); - printk("REGS: %p TRAP: %04lx %s (%s)\n", - regs, regs->trap, print_tainted(), init_utsname()->release); + printk("REGS: %p TRAP: %04lx %s (%s %s)\n", + regs, regs->trap, print_tainted(), init_utsname()->release, + VZVERSION); printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); @@ -1057,6 +1060,20 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); +long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + extern long ppc_kernel_thread(int (*fn)(void *), void *arg, + unsigned long flags); + + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + + return ppc_kernel_thread(fn, arg, flags); +} + #ifdef CONFIG_PPC64 void ppc64_runlatch_on(void) { diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 93219c3..a9e16bb 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -43,5 +43,9 @@ .p2align 3 #endif +#define SYS_SKIP(from, to) .rept to - from \ + SYSCALL(sys_ni_syscall) \ + .endr + _GLOBAL(sys_call_table) #include diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 565b7a2..8400dec 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -309,7 +309,6 @@ good_area: * make sure we exit 
gracefully rather than endlessly redo * the fault. */ - survive: ret = handle_mm_fault(mm, vma, address, is_write); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) @@ -349,14 +348,12 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); if (user_mode(regs)) - do_group_exit(SIGKILL); + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. Den + */ + force_sig(SIGKILL, current); return SIGKILL; do_sigbus: diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 036fe2f..807473a 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -168,8 +168,8 @@ struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC|SLAB_UBC|SLAB_NO_CHARGE, pgd_ctor); + pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC|SLAB_UBC|SLAB_NO_CHARGE, pmd_ctor); } #ifdef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 2001abd..ea128b6 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -83,7 +83,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_ZERO, PGDIR_ORDER); return ret; } @@ -117,6 +118,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) #else gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; #endif + flags |= (__GFP_UBC | __GFP_SOFT_UBC); ptepage = alloc_pages(flags, 0); if (!ptepage) diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index 19f6bfd..4f23f43 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -46,6 +46,8 @@ static void *spu_syscall_table[] = { #define PPC_SYS_SPU(func) ppc_##func, #define SYSX_SPU(f, f3264, f32) f, +#define SYS_SKIP(from, to) [from ... to] = sys_ni_syscall, + #include }; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8d41908..2e2f811 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -587,6 +587,8 @@ source "fs/Kconfig" source "arch/s390/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 00b9b4d..6194a6a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -573,8 +573,19 @@ out: */ int __cpuinit start_secondary(void *cpuvoid) { - /* Setup the cpu */ - cpu_init(); + /* Setup the cpu */ + cpu_init(); + +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; + /* + * Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. + */ + VE_TASK_INFO(idle)->sleep_time = 0; +#endif + preempt_disable(); /* Enable TOD clock interrupts on the secondary cpu. 
*/ init_cpu_timer(); @@ -831,6 +842,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpu) if (cpu != smp_processor_id()) smp_create_idle(cpu); + +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif } void __init smp_prepare_boot_cpu(void) diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index b9dbd2d..d0a8c54 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -670,7 +670,7 @@ asids_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, void int len=0; struct task_struct *p; read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_ve(p) { int pid = p->pid; if (!pid) diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 5bdfa2c..dd89e73 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -16,7 +16,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC, NULL); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -28,7 +28,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_REPEAT, NULL); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -48,7 +48,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; void *pg; - pg = quicklist_alloc(0, GFP_KERNEL, NULL); + pg = quicklist_alloc(0, GFP_KERNEL_UBC, NULL); if (!pg) return NULL; page = virt_to_page(pg); diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index c0a737d..5ef8b1d 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -163,14 +163,14 @@ register struct thread_info *current_thread_info_reg asm("g6"); struct thread_info *ret; \ \ ret = (struct thread_info *) \ - __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER); \ + __get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER); \ if (ret) \ memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER); \ ret; \ }) #else #define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER)) + ((struct thread_info *)__get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER)) #endif #define free_thread_info(ti) \ @@ -237,6 +237,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_ABI_PENDING 12 #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 +#define TIF_FREEZE 15 /* Freeze request (atomic PF_FREEZE) */ #define _TIF_SYSCALL_TRACE (1<comm, task_pid_nr(current), str, ++die_counter); + printk("VE:EXCVE %d:%d, CPU %d, VCPU %d:%d\n", + VEID(VE_TASK_INFO(current)->owner_env), VEID(get_exec_env()), + smp_processor_id(), + task_vsched_id(current), task_cpu(current)); notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); __asm__ __volatile__("flushw"); show_regs(regs); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864..518d26d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1777,6 +1777,7 @@ config SYSVIPC_COMPAT endmenu +source "kernel/Kconfig.openvz" source "net/Kconfig" @@ -1795,3 +1796,5 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" source "lib/Kconfig" + +source "kernel/bc/Kconfig" diff --git a/arch/x86/ia32/ia32entry.S 
b/arch/x86/ia32/ia32entry.S index ffc1bb4..0c8651c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -612,7 +612,7 @@ ia32_sys_call_table: .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ + .quad quiet_ni_syscall /* vm86old */ .quad compat_sys_wait4 .quad sys_swapoff /* 115 */ .quad compat_sys_sysinfo @@ -665,7 +665,7 @@ ia32_sys_call_table: .quad sys_mremap .quad sys_setresuid16 .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index d3c6408..3b2163f 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -817,20 +817,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, char __user *buf, size_t len) { diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792b..37f57b0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -225,6 +225,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%ebp) popl %eax CFI_ADJUST_CFA_OFFSET -4 +ret_from_fork_tail: pushl $0x0202 # Reset kernel eflags CFI_ADJUST_CFA_OFFSET 4 popfl @@ -233,6 +234,25 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +ENTRY(i386_ret_from_resume) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + movl (%esp),%eax + testl %eax,%eax + jz 1f + pushl %esp + call *%eax + addl $4,%esp +1: + addl $256,%esp + jmp ret_from_fork_tail + CFI_ENDPROC + /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89434d4..f422ac6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -279,7 +279,12 @@ ENTRY(ret_from_fork) popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -4 call schedule_tail +ret_from_fork_tail: GET_THREAD_INFO(%rcx) + btr $TIF_RESUME,TI_flags(%rcx) + jc x86_64_ret_from_resume + +ret_from_fork_check: testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jnz rff_trace rff_action: @@ -295,6 +300,19 @@ rff_trace: call syscall_trace_leave GET_THREAD_INFO(%rcx) jmp rff_action + +x86_64_ret_from_resume: + movq (%rsp),%rax + testq %rax,%rax + jz 1f + movq %rsp,%rdi + call *%rax +1: + addq $256,%rsp + cmpq $0,ORIG_RAX(%rsp) + jge ret_from_fork_tail + RESTORE_REST + jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -1155,7 +1173,7 @@ ENTRY(kernel_thread) xorl %r9d,%r9d # clone now - call do_fork + call do_fork_kthread movq %rax,RAX(%rsp) xorl %edi,%edi diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8..ee119de 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -144,6 +144,7 @@ int init_fpu(struct task_struct *tsk) set_stopped_child_used_math(tsk); return 0; } +EXPORT_SYMBOL(init_fpu); int fpregs_active(struct task_struct *target, const struct 
user_regset *regset) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b68e21f..c0d3285 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -38,9 +40,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = (void *)__get_free_page(GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL_UBC); if (!newldt) return -ENOMEM; @@ -110,6 +112,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) } return retval; } +EXPORT_SYMBOL_GPL(init_new_context); /* * No need to lock the MM as we are the last user diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index abb78a2..2f01bd2 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -378,6 +378,21 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); +void smp_show_regs(struct pt_regs *regs, void *info) +{ + static DEFINE_SPINLOCK(show_regs_lock); + + if (regs == NULL) + return; + + spin_lock(&show_regs_lock); + bust_spinlocks(1); + printk("----------- IPI show regs -----------"); + show_regs(regs); + bust_spinlocks(0); + spin_unlock(&show_regs_lock); +} + notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { @@ -423,10 +438,10 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... + * wait a few IRQs (30 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 30 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. 
*/ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 31f40b2..f269e6b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +60,9 @@ #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +EXPORT_SYMBOL(ret_from_fork); +asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); +EXPORT_SYMBOL_GPL(i386_ret_from_resume); DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); @@ -173,16 +178,17 @@ void __show_registers(struct pt_regs *regs, int all) } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + printk("Pid: %d, comm: %s %s (%s %.*s %s)\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, VZVERSION); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + if (decode_call_traces) + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -218,6 +224,8 @@ void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); show_trace(NULL, regs, ®s->sp, regs->bp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } /* @@ -226,6 +234,7 @@ void show_regs(struct pt_regs *regs) * the "args". */ extern void kernel_thread_helper(void); +EXPORT_SYMBOL(kernel_thread_helper); /* * Create a kernel thread @@ -234,6 +243,13 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.bx = (unsigned long) fn; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e12e0e4..df9a2c1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,8 +26,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -52,8 +54,6 @@ #include #include -asmlinkage extern void ret_from_fork(void); - unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -162,13 +162,14 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk("Pid: %d, comm: %.20s %s %s %.*s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, VZVERSION); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip, 1); + if (decode_call_traces) + printk_address(regs->ip, 1); printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", @@ -216,7 +217,9 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); + show_trace(NULL, regs, ®s->sp, regs->bp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } /* @@ -857,3 +860,20 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long 
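/*
 * Editor's aside (not part of the patch): the process_32.c hunk above, and
 * do_fork_kthread() for x86_64 just below, enforce the same policy - kernel
 * threads may not be created on behalf of a task running inside a container
 * (VE) unless the ve_allow_kthreads knob is set; the host environment (VE0)
 * is always allowed.  A compile-only sketch of that check, using the helpers
 * the patch relies on (the wrapper name here is illustrative only):
 */
struct ve_struct;
extern int ve_allow_kthreads;
struct ve_struct *get_exec_env(void);
int ve_is_super(struct ve_struct *ve);

static inline int kthread_creation_allowed(void)
{
    /* VE0 (the host) may always spawn kernel threads */
    return ve_allow_kthreads || ve_is_super(get_exec_env());
}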
range_end = mm->brk + 0x02000000; return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } + +long do_fork_kthread(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + if (ve_allow_kthreads || ve_is_super(get_exec_env())) + return do_fork(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr); + + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; +} diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcd..9636847 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -593,6 +594,9 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (current_thread_info()->status & TS_RESTORE_SIGMASK) oldset = ¤t->saved_sigmask; else @@ -622,6 +626,7 @@ static void do_signal(struct pt_regs *regs) return; } +no_signal: /* Did we come from a system call? */ if ((long)regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ca316b5..aab9142 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -489,6 +490,9 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (current_thread_info()->status & TS_RESTORE_SIGMASK) oldset = ¤t->saved_sigmask; else @@ -517,6 +521,7 @@ static void do_signal(struct pt_regs *regs) return; } +no_signal: /* Did we come from a system call? */ if (current_syscall(regs) >= 0) { /* Restart the system call - no handlers present */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4..a4cd6cc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -140,6 +141,89 @@ void native_send_call_func_ipi(cpumask_t mask) send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } +static DEFINE_SPINLOCK(nmi_call_lock); +static struct nmi_call_data_struct { + smp_nmi_function func; + void *info; + atomic_t started; + atomic_t finished; + cpumask_t cpus_called; + int wait; +} *nmi_call_data; + +static int smp_nmi_callback(struct pt_regs *regs, int cpu) +{ + smp_nmi_function func; + void *info; + int wait; + + func = nmi_call_data->func; + info = nmi_call_data->info; + wait = nmi_call_data->wait; + ack_APIC_irq(); + /* prevent from calling func() multiple times */ + if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) + return 0; + /* + * notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&nmi_call_data->started); + /* at this point the nmi_call_data structure is out of scope */ + irq_enter(); + func(regs, info); + irq_exit(); + if (wait) + atomic_inc(&nmi_call_data->finished); + + return 1; +} + +/* + * This function tries to call func(regs, info) on each cpu. + * Func must be fast and non-blocking. + * May be called with disabled interrupts and from any context. 
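/*
 * Editor's aside (not part of the patch): the signal_32.c/signal_64.c hunks
 * above add a freezer check to do_signal(): if the task has just been thawed
 * (try_to_freeze() returned true) and no signal is actually pending, signal
 * delivery is skipped and control falls through to the syscall-restart code
 * at the new no_signal: label.  A minimal userspace model of that control
 * flow, with stubs standing in for the kernel helpers:
 */
#include <stdbool.h>
#include <stdio.h>

static bool try_to_freeze(void)  { return false; } /* true only right after thawing */
static bool signal_pending(void) { return false; }
static bool get_signal(void)     { return false; } /* a signal was dequeued for delivery */

static void do_signal_model(void)
{
    if (!(try_to_freeze() && !signal_pending())) {
        if (get_signal()) {
            puts("deliver signal to user space");
            return;                 /* delivered: no restart handling */
        }
    }
    /* the 'no_signal:' label in the patch */
    puts("no pending signal: restart interrupted syscall if needed, restore sigmask");
}

int main(void) { do_signal_model(); return 0; }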
+ */ +int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + struct nmi_call_data_struct data; + int cpus; + + cpus = num_online_cpus() - 1; + if (!cpus) + return 0; + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.started, 0); + atomic_set(&data.finished, 0); + cpus_clear(data.cpus_called); + /* prevent this cpu from calling func if NMI happens */ + cpu_set(smp_processor_id(), data.cpus_called); + + if (!spin_trylock(&nmi_call_lock)) + return -1; + + nmi_call_data = &data; + set_nmi_ipi_callback(smp_nmi_callback); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(APIC_DM_NMI); + while (atomic_read(&data.started) != cpus) + barrier(); + + unset_nmi_ipi_callback(); + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&nmi_call_lock); + + return 0; +} + static void stop_this_cpu(void *dummy) { local_irq_disable(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b..b806c31 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -870,6 +870,12 @@ do_rest: initial_code = (unsigned long)start_secondary; stack_start.sp = (void *) c_idle.idle->thread.sp; +#ifdef CONFIG_VE + /* Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. */ + VE_TASK_INFO(c_idle.idle)->sleep_time = 0; +#endif + /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395f..5493e66 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,22 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + .long sys_fairsched_vcpus /* 505 */ + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall + .long sys_ni_syscall diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ece..75f4b4e 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -204,6 +204,8 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); + void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index dcbf7a1..d8ab2ef 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -242,6 +242,8 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); + void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 03df8e4..7f2e838 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -203,6 +203,8 @@ print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) { printk(data); print_symbol(msg, symbol); + if (decode_call_traces) + print_symbol("%s\n", symbol); printk("\n"); } @@ -240,7 +242,10 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) 
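/*
 * Editor's aside (not part of the patch): the syscall_table_32.S hunk above
 * extends the i386 table with fairsched calls at 500-505 and the user
 * beancounter calls getluid/setluid/setublimit/ubstat at 510-513.  A minimal
 * userspace probe for one of them; the number is taken from the table above,
 * while the exact return semantics (presumably the caller's login/beancounter
 * id) are an assumption, since sys_getluid itself is not in this hunk:
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_getluid 510   /* per the i386 table extension above */

int main(void)
{
    long luid = syscall(__NR_getluid);
    if (luid < 0)
        perror("getluid");
    else
        printf("current luid (beancounter id, presumably): %ld\n", luid);
    return 0;
}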
{ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); + if (decode_call_traces) + printk("%s =======================\n", log_lvl); + else + printk("%s ==", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -271,9 +276,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } - printk("\n%sCall Trace:\n", log_lvl); + if (decode_call_traces) + printk("\n%s Call Trace:\n", log_lvl); + else + printk("\n%s Call Trace: ", log_lvl); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + if (!decode_call_traces) + printk("\n"); } void show_stack(struct task_struct *task, unsigned long *sp) @@ -302,6 +312,8 @@ void dump_stack(void) init_utsname()->version); show_trace(current, NULL, &stack, bp); + if (!decode_call_traces) + printk("\n"); } EXPORT_SYMBOL(dump_stack); @@ -313,8 +325,9 @@ void show_registers(struct pt_regs *regs) print_modules(); __show_registers(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, task_pid_nr(current), + VEID(current->ve_task_info.owner_env), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the @@ -739,6 +752,21 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } +/* + * Voyager doesn't implement these + */ +void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) +{ +} + +#ifdef CONFIG_SMP +int __attribute__((weak)) +smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + return 0; +} +#endif + static DEFINE_SPINLOCK(nmi_print_lock); void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -756,6 +784,10 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); + smp_nmi_call_function(smp_show_regs, NULL, 1); + bust_spinlocks(1); + if (!decode_call_traces) + show_registers(regs); if (do_panic) panic("Non maskable interrupt"); console_silent(); @@ -774,6 +806,13 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGSEGV); } +static int dummy_nmi_callback(struct pt_regs *regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -829,12 +868,24 @@ notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) ++nmi_count(cpu); - if (!ignore_nmis) - default_do_nmi(regs); + if (!ignore_nmis) { + if (!nmi_ipi_callback(regs, cpu)) + default_do_nmi(regs); + } nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + void stop_nmi(void) { acpi_nmi_disable(); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 513caac..67846bf 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -381,7 +381,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (((long) stack & (THREAD_SIZE-1)) == 0) break; } - if (i && ((i % 4) == 0)) + if (i && ((i % 4) == 0) && decode_call_traces) printk("\n"); 
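/*
 * Editor's aside (not part of the patch): a compile-only sketch of how the
 * NMI broadcast machinery added above is used elsewhere in this patch -
 * die_nmi() here, and the SysRq 'showPc' handler later - to make every other
 * CPU print its registers from NMI context and to wait until they have done
 * so.  Prototypes are those introduced by the patch; the wrapper function is
 * illustrative only.
 */
struct pt_regs;
typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info);
int  smp_nmi_call_function(smp_nmi_function func, void *info, int wait);
void smp_show_regs(struct pt_regs *regs, void *info);
void show_regs(struct pt_regs *regs);

void dump_registers_everywhere(struct pt_regs *regs)
{
    show_regs(regs);                                /* current CPU, normal path      */
    smp_nmi_call_function(smp_show_regs, NULL, 1);  /* other CPUs via NMI IPI, wait  */
}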
printk(" %016lx", *stack++); touch_nmi_watchdog(); @@ -425,10 +425,12 @@ void show_registers(struct pt_regs *regs) struct task_struct *cur = cpu_pda(cpu)->pcurrent; sp = regs->sp; - printk("CPU %d ", cpu); + printk("CPU: %d ", cpu); __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, + VEID(VE_TASK_INFO(current)->owner_env), + task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -830,6 +832,13 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) io_check_error(reason, regs); } +static int dummy_nmi_callback(struct pt_regs *regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + asmlinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { @@ -837,12 +846,24 @@ do_nmi(struct pt_regs *regs, long error_code) add_pda(__nmi_count, 1); - if (!ignore_nmis) - default_do_nmi(regs); + if (!ignore_nmis) { + if (!nmi_ipi_callback(regs, smp_processor_id())) + default_do_nmi(regs); + } nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + void stop_nmi(void) { acpi_nmi_disable(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9ffb01c..f523122 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -140,6 +140,10 @@ void __cpuinit check_tsc_sync_source(int cpu) printk(" passed.\n"); } +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif /* * Reset it - just in case we boot another CPU later: */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b545f37..949ee93 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -3,6 +3,7 @@ #include #include +#include #include @@ -17,6 +18,7 @@ EXPORT_SYMBOL(mcount); #endif +EXPORT_SYMBOL(kernel_execve); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__get_user_1); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe..1f51c50 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -405,7 +405,8 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); - printk_address(regs->ip, 1); + if (decode_call_traces) + printk_address(regs->ip, 1); dump_pagetable(address); } @@ -571,7 +572,7 @@ static int vmalloc_fault(unsigned long address) #endif } -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; /* * This routine handles page faults. It determines the address, @@ -678,7 +679,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ if (user_mode_vm(regs)) error_code |= PF_USER; -again: #endif /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -744,7 +744,6 @@ good_area: } #ifdef CONFIG_X86_32 -survive: #endif /* * If for any reason at all we couldn't handle the fault, @@ -804,7 +803,7 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk( + ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, @@ -878,19 +877,14 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); -#ifdef CONFIG_X86_32 - down_read(&mm->mmap_sem); - goto survive; -#else - goto again; -#endif + if (error_code & PF_USER) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. + */ + force_sig(SIGKILL, tsk); + return; } - - printk("VM: killing process %s\n", tsk->comm); - if (error_code & PF_USER) - do_group_exit(SIGKILL); goto no_context; do_sigbus: diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8f307d9..996b1ce 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,7 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } +EXPORT_SYMBOL(pmd_huge); struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d503027..c45bb24 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -14,9 +14,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_REPEAT|__GFP_ZERO, 0); #endif if (pte) pgtable_page_ctor(pte); @@ -230,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmds[PREALLOCATED_PMDS]; unsigned long flags; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC | __GFP_ZERO); if (pgd == NULL) goto out; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 513f330..b964987 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -17,6 +17,8 @@ #include #include +#include + #include #include #include @@ -37,6 +39,8 @@ enum { #else #define VDSO_DEFAULT VDSO_ENABLED #endif +#undef VDSO_DEFAULT +#define VDSO_DEFAULT VDSO_DISABLED #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 @@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } -static struct page *vdso32_pages[1]; +struct page *vdso32_pages[1]; +EXPORT_SYMBOL_GPL(vdso32_pages); #ifdef CONFIG_X86_64 @@ -309,16 +314,30 @@ int __init sysenter_setup(void) return 0; } +EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN); +EXPORT_SYMBOL_GPL(VDSO32_PRELINK); + /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address) { struct mm_struct *mm = current->mm; - unsigned long addr; + unsigned long addr = map_address; int ret = 0; bool compat; + unsigned long flags; - if (vdso_enabled == VDSO_DISABLED) + if (vdso_enabled == VDSO_DISABLED && map_address == 0) { + 
current->mm->context.vdso = NULL; return 0; + } + + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | + mm->def_flags; + + ret = -ENOMEM; + if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT)) + goto err_charge; down_write(&mm->mmap_sem); @@ -328,17 +347,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) map_compat_vdso(compat); - if (compat) - addr = VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (!compat || map_address) { + addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - } + } else + addr = VDSO_HIGH_BASE; - if (compat_uses_vma || !compat) { + if (compat_uses_vma || !compat || map_address) { /* * MAYWRITE to allow gdb to COW and set breakpoints * @@ -364,9 +382,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) up_fail: up_write(&mm->mmap_sem); + if (ret < 0) + ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL); +err_charge: return ret; } +EXPORT_SYMBOL_GPL(arch_setup_additional_pages); #ifdef CONFIG_X86_64 diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 257ba4a..36cd3c0 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -4,6 +4,7 @@ * Subject to the GPL, v.2 */ #include +#include #include #include #include @@ -98,17 +99,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso_enabled && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, vdso_size); + if (map_address) + addr = map_address; + else + addr = vdso_addr(mm->start_stack, vdso_size); addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; @@ -128,6 +135,7 @@ up_fail: up_write(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(arch_setup_additional_pages); static __init int vdso_setup(char *s) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1e2aff8..277481e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -12,6 +12,11 @@ #include #include #include +#include +#include +#include +#include +#include /* * tunables @@ -27,6 +32,7 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_ub_slice = HZ / 2; /* * offset from end of service tree @@ -44,14 +50,12 @@ static int cfq_slice_idle = HZ / 125; ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) -static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); -#define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -60,106 +64,6 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define sample_valid(samples) ((samples) > 80) -/* - * Most of our rbtree usage is for sorting with min extraction, so - * if we cache the leftmost node we don't have 
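/*
 * Editor's aside (not part of the patch): both arch_setup_additional_pages()
 * variants now take a third map_address argument.  map_address == 0 keeps the
 * historical behaviour (kernel-chosen placement, honouring vdso_enabled and
 * charging the page to the beancounter on 32-bit); a non-zero map_address
 * forces the vDSO to that exact address even when vdso_enabled is 0.  The
 * restore-time caller below is an assumption - in OpenVZ trees it lives in
 * the checkpoint/restore code, not in these hunks.  Compile-only sketch:
 */
struct linux_binprm;
int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
                                unsigned long map_address);

int restore_vdso_at(struct linux_binprm *bprm, unsigned long saved_addr)
{
    /* saved_addr is the vDSO address recorded when the process was dumped */
    return arch_setup_additional_pages(bprm, 0, saved_addr);
}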
to walk down the tree - * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should - * move this into the elevator for the rq sorting as well. - */ -struct cfq_rb_root { - struct rb_root rb; - struct rb_node *left; -}; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } - -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; - unsigned int busy_queues; - - int rq_in_driver; - int sync_flight; - int hw_tag; - - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; - struct cfq_io_context *active_cic; - - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - - sector_t last_position; - unsigned long last_end_request; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_fifo_expire[2]; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; - unsigned int cfq_slice[2]; - unsigned int cfq_slice_async_rq; - unsigned int cfq_slice_idle; - - struct list_head cic_list; -}; - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - atomic_t ref; - /* various state flags, see below */ - unsigned int flags; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - unsigned long slice_end; - long slice_resid; - - /* pending metadata requests */ - int meta_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - pid_t pid; -}; - enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -209,6 +113,67 @@ CFQ_CFQQ_FNS(sync); static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, struct io_context *, gfp_t); +static void cfq_put_queue(struct cfq_queue *cfqq); + +static void __cfq_put_async_queues(struct cfq_bc_data *cfq_bc) +{ + int i; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) { + if (cfq_bc->async_cfqq[0][i]) { + cfq_put_queue(cfq_bc->async_cfqq[0][i]); + cfq_bc->async_cfqq[0][i] = NULL; + } + if (cfq_bc->async_cfqq[1][i]) { + cfq_put_queue(cfq_bc->async_cfqq[1][i]); + cfq_bc->async_cfqq[1][i] = NULL; + } + } + if (cfq_bc->async_idle_cfqq) { + cfq_put_queue(cfq_bc->async_idle_cfqq); + cfq_bc->async_idle_cfqq = NULL; + } +} + +#ifdef CONFIG_BC_IO_SCHED +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) +{ + int mode; + + mode = sync ? cfqd->virt_mode : cfqd->write_virt_mode; + return mode ? 
&get_io_ub()->iopriv : &get_ub0()->iopriv; +} + +static inline void cfq_put_async_queues(struct cfq_data *cfqd) +{ + struct user_beancounter *ub; + struct cfq_bc_data *cfq_bc; + + rcu_read_lock(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + __cfq_put_async_queues(cfq_bc); + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + rcu_read_unlock(); +} +#else +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) +{ + return NULL; +} + +static inline void cfq_put_async_queues(struct cfq_data *cfqd) +{ + __cfq_put_async_queues(&cfqd->cfq_bc); +} +#endif + static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); @@ -298,6 +263,11 @@ static inline int cfq_slice_used(struct cfq_queue *cfqq) return 1; } +static inline struct user_beancounter *ub_by_iopriv(struct ub_iopriv *iopriv) +{ + return container_of(iopriv, struct user_beancounter, iopriv); +} + /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closest to the head right now. Distance @@ -461,6 +431,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int add_front) { + struct cfq_bc_data *cfq_bc = cfqq->cfq_bc; struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; @@ -468,7 +439,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&cfq_bc->service_tree.rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -488,12 +459,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, if (rb_key == cfqq->rb_key) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + p = &cfq_bc->service_tree.rb.rb_node; while (*p) { struct rb_node **n; @@ -525,11 +496,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + cfq_bc->service_tree.left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &cfq_bc->service_tree.rb); } /* @@ -554,6 +525,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + bc_inc_rqnum(cfqq); cfq_resort_rr_list(cfqd, cfqq); } @@ -564,15 +536,20 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_bc_data *cfq_bc; + cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); + cfq_bc = cfqq->cfq_bc; + if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + bc_dec_rqnum(cfqq); } /* @@ -692,8 +669,7 @@ static void cfq_remove_request(struct request *rq) } } -static int cfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) +static int cfq_merge(struct 
request_queue *q, struct request **req, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; @@ -822,10 +798,16 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) return NULL; - return cfq_rb_first(&cfqd->service_tree); + if (RB_EMPTY_ROOT(&cfq_bc->service_tree.rb)) + return NULL; + + return cfq_rb_first(&cfq_bc->service_tree); } /* @@ -833,9 +815,17 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) */ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { - struct cfq_queue *cfqq; + struct cfq_queue *cfqq = NULL; + struct cfq_bc_data *cfq_bc; + + bc_schedule_active(cfqd); + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) + goto out; cfqq = cfq_get_next_queue(cfqd); +out: __cfq_set_active_queue(cfqd, cfqq); return cfqq; } @@ -935,6 +925,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfq_remove_request(rq); cfqq->dispatched++; + cfqq->cfq_bc->on_dispatch++; elv_dispatch_sort(q, rq); if (cfq_cfqq_sync(cfqq)) @@ -993,7 +984,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq)) + if (cfq_slice_used(cfqq) || bc_expired(cfqd)) goto expire; /* @@ -1092,14 +1083,33 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) * Drain our current requests. Used for barriers and when switching * io schedulers on-the-fly. */ -static int cfq_forced_dispatch(struct cfq_data *cfqd) +static int __cfq_forced_dispatch(struct cfq_bc_data *cfq_bc) { struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = cfq_rb_first(&cfq_bc->service_tree)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); + return dispatched; +} + +static int cfq_forced_dispatch(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + int dispatched; + + dispatched = 0; + /* + * We use here _safe iterating, because + * __cfq_forced_dispatch() produces list_del() implicitly + */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &cfqd->act_cfq_bc_head, act_cfq_bc_list) { + dispatched += __cfq_forced_dispatch(cfq_bc); + } + cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1289,6 +1299,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); + /* + * cic->cfqq[ASYNC] is always NULL and the put of async queues + * happens on appropriate bc death or device unplug + */ if (cic->cfqq[ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); cic->cfqq[ASYNC] = NULL; @@ -1397,6 +1411,10 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) spin_lock_irqsave(cfqd->queue->queue_lock, flags); + /* + * cic->cfqq[ASYNC] is always NULL, ioprio change + * for async queues happens automatically + */ cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; @@ -1426,8 +1444,11 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct ub_iopriv *iopriv; + struct cfq_bc_data *cfq_bc = NULL; retry: + iopriv = cfqq_ub_iopriv(cfqd, is_sync); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = 
cic_to_cfqq(cic, is_sync); @@ -1445,18 +1466,32 @@ retry: */ spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_NOFAIL | __GFP_ZERO, + gfp_mask|__GFP_NOFAIL|__GFP_ZERO, cfqd->queue->node); + if (new_cfqq) { + cfq_bc = bc_findcreate_cfq_bc(iopriv, + cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, new_cfqq); + new_cfqq = NULL; + } + } spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else { cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); + gfp_mask|__GFP_ZERO, cfqd->queue->node); if (!cfqq) goto out; + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, cfqq); + cfqq = NULL; + goto out; + } } + cfqq->cfq_bc = cfq_bc; RB_CLEAR_NODE(&cfqq->rb_node); INIT_LIST_HEAD(&cfqq->fifo); @@ -1486,15 +1521,15 @@ out: } static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_bc_data *cfq_bc, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfq_bc->async_cfqq[0][ioprio]; case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfq_bc->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfq_bc->async_idle_cfqq; default: BUG(); } @@ -1508,9 +1543,16 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, const int ioprio_class = task_ioprio_class(ioc); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; + struct cfq_bc_data *cfq_bc; + struct ub_iopriv *iopriv; + + iopriv = cfqq_ub_iopriv(cfqd, is_sync); if (!is_sync) { - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); + if (!cfq_bc) + return NULL; + async_cfqq = cfq_async_queue_prio(cfq_bc, ioprio_class, ioprio); cfqq = *async_cfqq; } @@ -1894,6 +1936,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + cfqq->cfq_bc->on_dispatch--; if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight--; @@ -2006,6 +2049,7 @@ static void cfq_put_request(struct request *rq) rq->elevator_private = NULL; rq->elevator_private2 = NULL; + put_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv)); cfq_put_queue(cfqq); } } @@ -2022,14 +2066,19 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) const int is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; unsigned long flags; + struct ub_iopriv *iopriv; + struct cfq_bc_data *cfq_bc = NULL; might_sleep_if(gfp_mask & __GFP_WAIT); cic = cfq_get_io_context(cfqd, gfp_mask); + iopriv = cfqq_ub_iopriv(cfqd, is_sync); + if (!is_sync) + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); spin_lock_irqsave(q->queue_lock, flags); - if (!cic) + if (!cic || (!is_sync && cfq_bc == NULL)) goto queue_fail; cfqq = cic_to_cfqq(cic, is_sync); @@ -2050,6 +2099,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) rq->elevator_private = cic; rq->elevator_private2 = cfqq; + get_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv)); return 0; queue_fail: @@ -2127,21 +2177,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) kblockd_flush_work(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - 
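/*
 * Editor's aside (not part of the patch): the per-beancounter CFQ state that
 * replaces cfqd->service_tree and cfqd->async_cfqq above lives in struct
 * cfq_bc_data, one instance per (beancounter, device queue) pair.  Its real
 * definition is in a header added elsewhere in this patch; the layout below
 * is a hedged reconstruction inferred purely from the uses visible here
 * (service_tree, async_cfqq, act_cfq_bc_list, on_dispatch, bc_inc/dec_rqnum,
 * ub_iopriv) - field names not seen above, and all types, are assumptions.
 */
struct list_head { struct list_head *next, *prev; };
struct rb_node;
struct rb_root     { struct rb_node *rb_node; };
struct cfq_rb_root { struct rb_root rb; struct rb_node *left; };
#define CFQ_PRIO_LISTS 8                 /* IOPRIO_BE_NR */
struct cfq_data; struct cfq_queue; struct ub_iopriv;

struct cfq_bc_data {
    struct list_head   cfq_bc_list;      /* entry in the beancounter's per-device list */
    struct list_head   act_cfq_bc_list;  /* entry in cfqd->act_cfq_bc_head (active BCs) */
    struct cfq_data   *cfqd;             /* owning device queue                         */
    struct ub_iopriv  *ub_iopriv;        /* back pointer to the beancounter's io priv   */
    struct cfq_rb_root service_tree;     /* per-BC tree of busy sync queues             */
    unsigned long      rqnum;            /* busy queues here (bc_inc/dec_rqnum)         */
    unsigned long      on_dispatch;      /* requests currently handed to the driver     */
    /* per-BC async queues, replacing cfqd->async_cfqq / async_idle_cfqq */
    struct cfq_queue  *async_cfqq[2][CFQ_PRIO_LISTS];
    struct cfq_queue  *async_idle_cfqq;
};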
cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(elevator_t *e) { struct cfq_data *cfqd = e->elevator_data; @@ -2168,6 +2203,8 @@ static void cfq_exit_queue(elevator_t *e) cfq_shutdown_timer_wq(cfqd); + bc_cfq_exit_queue(cfqd); + kfree(cfqd); } @@ -2175,11 +2212,19 @@ static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL|__GFP_ZERO, q->node); if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + INIT_LIST_HEAD(&cfqd->act_cfq_bc_head); +#ifndef CONFIG_BC_IO_SCHED + cfq_init_cfq_bc(&cfqd->cfq_bc); + /* + * Adding ub0 to active list in order to serve force dispatching + * case uniformally. Note, that nobody removes ub0 from this list. + */ + list_add_tail(&cfqd->cfq_bc.act_cfq_bc_list, &cfqd->act_cfq_bc_head); +#endif INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; @@ -2200,6 +2245,9 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_ub_slice = cfq_ub_slice; + cfqd->virt_mode = 1; + cfqd->write_virt_mode = 1; return cfqd; } @@ -2268,6 +2316,9 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); +SHOW_FUNCTION(cfq_ub_slice_show, cfqd->cfq_ub_slice, 1); +SHOW_FUNCTION(cfq_virt_mode_show, cfqd->virt_mode, 0); +SHOW_FUNCTION(cfq_write_virt_mode_show, cfqd->write_virt_mode, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -2299,6 +2350,9 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_ub_slice_store, &cfqd->cfq_ub_slice, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_virt_mode_store, &cfqd->virt_mode, 0, 1, 0); +STORE_FUNCTION(cfq_write_virt_mode_store, &cfqd->write_virt_mode, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -2314,6 +2368,9 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(ub_slice), + CFQ_ATTR(virt_mode), + CFQ_ATTR(write_virt_mode), __ATTR_NULL }; @@ -2337,6 +2394,7 @@ static struct elevator_type iosched_cfq = { .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, .trim = cfq_free_io_context, + .put_queue = cfq_put_queue, }, .elevator_attrs = cfq_attrs, .elevator_name = "cfq", diff --git a/block/elevator.c b/block/elevator.c index ed6f8f3..c0aad1b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -40,6 +40,9 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); +struct kmem_cache *cfq_pool; +EXPORT_SYMBOL_GPL(cfq_pool); + /* * Merge hash stuff. 
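/*
 * Editor's aside (not part of the patch): the new elevator attributes above
 * (ub_slice, virt_mode, write_virt_mode) are exposed through the usual CFQ
 * sysfs directory.  Per cfqq_ub_iopriv() earlier in this file, virt_mode
 * selects whether *sync* requests are attributed to the issuing beancounter
 * (1, the default set in cfq_init_queue) or to ub0/the host (0);
 * write_virt_mode is the async counterpart.  A minimal example that turns
 * per-BC accounting of sync I/O off - the device name and the presence of
 * the cfq scheduler on it are assumptions:
 */
#include <stdio.h>

int main(void)
{
    const char *path = "/sys/block/sda/queue/iosched/virt_mode"; /* assumes sda runs cfq */
    FILE *f = fopen(path, "w");
    if (!f) {
        perror(path);
        return 1;
    }
    fputs("0\n", f);   /* 0: sync requests accounted to ub0; 1: per-beancounter */
    fclose(f);
    return 0;
}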
*/ @@ -1034,12 +1037,12 @@ void elv_unregister(struct elevator_type *e) */ if (e->ops.trim) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (p->io_context) e->ops.trim(p->io_context); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff --git a/block/genhd.c b/block/genhd.c index e0ce23a..3156ed4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -407,7 +407,7 @@ static int __init genhd_device_init(void) { int error; - block_class.dev_kobj = sysfs_dev_block_kobj; + block_class.dev_kobj = ve_sysfs_dev_block_kobj; error = class_register(&block_class); if (unlikely(error)) return error; @@ -563,6 +563,7 @@ static void disk_release(struct device *dev) struct class block_class = { .name = "block", }; +EXPORT_SYMBOL(block_class); static struct device_type disk_type = { .name = "disk", diff --git a/drivers/base/base.h b/drivers/base/base.h index 31dc0cd..a56363c 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -95,7 +95,12 @@ extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); +#ifndef CONFIG_VE extern struct kset *devices_kset; +#define ve_devices_kset devices_kset +#else +#define ve_devices_kset (get_exec_env()->devices_kset) +#endif #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) extern void module_add_driver(struct module *mod, struct device_driver *drv); diff --git a/drivers/base/class.c b/drivers/base/class.c index cc5e28c..a6f0802 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "base.h" #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) @@ -72,8 +74,14 @@ static struct kobj_type class_ktype = { }; /* Hotplug events for classes go to the class class_subsys */ -static struct kset *class_kset; +struct kset *class_kset; +EXPORT_SYMBOL_GPL(class_kset); +#ifndef CONFIG_VE +#define visible_class_kset class_kset +#else +#define visible_class_kset (get_exec_env()->class_kset) +#endif int class_create_file(struct class *cls, const struct class_attribute *attr) { @@ -157,14 +165,14 @@ int __class_register(struct class *cls, struct lock_class_key *key) /* set the default /sys/dev directory for devices of this class */ if (!cls->dev_kobj) - cls->dev_kobj = sysfs_dev_char_kobj; + cls->dev_kobj = ve_sysfs_dev_char_kobj; #if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ if (cls != &block_class) - cp->class_subsys.kobj.kset = class_kset; + cp->class_subsys.kobj.kset = visible_class_kset; #else - cp->class_subsys.kobj.kset = class_kset; + cp->class_subsys.kobj.kset = visible_class_kset; #endif cp->class_subsys.kobj.ktype = &class_ktype; cp->class = cls; @@ -418,13 +426,20 @@ void class_interface_unregister(struct class_interface *class_intf) class_put(parent); } -int __init classes_init(void) +int classes_init(void) { - class_kset = kset_create_and_add("class", NULL, NULL); - if (!class_kset) + visible_class_kset = kset_create_and_add("class", NULL, NULL); + if (!visible_class_kset) return -ENOMEM; return 0; } +EXPORT_SYMBOL_GPL(classes_init); + +void classes_fini(void) +{ + kset_unregister(visible_class_kset); +} +EXPORT_SYMBOL_GPL(classes_fini); EXPORT_SYMBOL_GPL(class_create_file); EXPORT_SYMBOL_GPL(class_remove_file); diff --git a/drivers/base/core.c b/drivers/base/core.c index d021c98..29d6e7d 100644 --- 
a/drivers/base/core.c +++ b/drivers/base/core.c @@ -22,15 +22,22 @@ #include #include #include +#include +#include #include "base.h" #include "power/power.h" int (*platform_notify)(struct device *dev) = NULL; int (*platform_notify_remove)(struct device *dev) = NULL; +#ifndef CONFIG_VE static struct kobject *dev_kobj; +#define ve_dev_kobj dev_kobj struct kobject *sysfs_dev_char_kobj; struct kobject *sysfs_dev_block_kobj; +#else +#define ve_dev_kobj (get_exec_env()->dev_kobj) +#endif #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) @@ -419,8 +426,9 @@ static ssize_t show_dev(struct device *dev, struct device_attribute *attr, static struct device_attribute devt_attr = __ATTR(dev, S_IRUGO, show_dev, NULL); -/* kset to create /sys/devices/ */ +#ifndef CONFIG_VE struct kset *devices_kset; +#endif /** * device_create_file - create sysfs attribute file for device. @@ -531,7 +539,7 @@ static void klist_children_put(struct klist_node *n) */ void device_initialize(struct device *dev) { - dev->kobj.kset = devices_kset; + dev->kobj.kset = ve_devices_kset; kobject_init(&dev->kobj, &device_ktype); klist_init(&dev->klist_children, klist_children_get, klist_children_put); @@ -569,7 +577,7 @@ static struct kobject *virtual_device_parent(struct device *dev) if (!virtual_dir) virtual_dir = kobject_create_and_add("virtual", - &devices_kset->kobj); + &ve_devices_kset->kobj); return virtual_dir; } @@ -799,7 +807,7 @@ static struct kobject *device_to_dev_kobj(struct device *dev) if (dev->class) kobj = dev->class->dev_kobj; else - kobj = sysfs_dev_char_kobj; + kobj = ve_sysfs_dev_char_kobj; return kobj; } @@ -1142,31 +1150,43 @@ struct device *device_find_child(struct device *parent, void *data, return child; } -int __init devices_init(void) +int devices_init(void) { - devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); - if (!devices_kset) - return -ENOMEM; - dev_kobj = kobject_create_and_add("dev", NULL); - if (!dev_kobj) + ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); + if (!ve_devices_kset) + goto dev_kset_err; + ve_dev_kobj = kobject_create_and_add("dev", NULL); + if (!ve_dev_kobj) goto dev_kobj_err; - sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); - if (!sysfs_dev_block_kobj) + ve_sysfs_dev_block_kobj = kobject_create_and_add("block", ve_dev_kobj); + if (!ve_sysfs_dev_block_kobj) goto block_kobj_err; - sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); - if (!sysfs_dev_char_kobj) + ve_sysfs_dev_char_kobj = kobject_create_and_add("char", ve_dev_kobj); + if (!ve_sysfs_dev_char_kobj) goto char_kobj_err; return 0; char_kobj_err: - kobject_put(sysfs_dev_block_kobj); + kobject_put(ve_sysfs_dev_block_kobj); block_kobj_err: - kobject_put(dev_kobj); + kobject_put(ve_dev_kobj); dev_kobj_err: - kset_unregister(devices_kset); + kset_unregister(ve_devices_kset); +dev_kset_err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(devices_init); + +void devices_fini(void) +{ + kobject_put(ve_sysfs_dev_char_kobj); + kobject_put(ve_sysfs_dev_block_kobj); + kobject_put(ve_dev_kobj); + kset_unregister(ve_devices_kset); +} +EXPORT_SYMBOL_GPL(devices_fini); + EXPORT_SYMBOL_GPL(device_for_each_child); EXPORT_SYMBOL_GPL(device_find_child); @@ -1493,7 +1513,12 @@ void device_shutdown(void) { struct device *dev, *devn; - list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list, + if (!ve_is_super(get_exec_env())) { + printk("BUG: device_shutdown call from inside VE\n"); + return; + } + + list_for_each_entry_safe_reverse(dev, 
devn, &ve_devices_kset->list, kobj.entry) { if (dev->bus && dev->bus->shutdown) { dev_dbg(dev, "shutdown\n"); @@ -1503,7 +1528,8 @@ void device_shutdown(void) dev->driver->shutdown(dev); } } - kobject_put(sysfs_dev_char_kobj); - kobject_put(sysfs_dev_block_kobj); - kobject_put(dev_kobj); + + kobject_put(ve_sysfs_dev_char_kobj); + kobject_put(ve_sysfs_dev_block_kobj); + kobject_put(ve_dev_kobj); } diff --git a/drivers/base/sys.c b/drivers/base/sys.c index 75dd6e2..6c4dee7 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -467,7 +469,7 @@ int sysdev_resume(void) int __init system_bus_init(void) { - system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); + system_kset = kset_create_and_add("system", NULL, &ve_devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index 7b3a212..da591f4 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -162,6 +162,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 1] = static int sysrq_down; static int sysrq_alt_use; #endif +int sysrq_key_scancode = KEY_SYSRQ; static int sysrq_alt; /* @@ -1067,6 +1068,9 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, { int code; + if (keycode == sysrq_key_scancode && sysrq_alt) + goto sysrq; + switch (keycode) { case KEY_PAUSE: put_queue(vc, 0xe1); @@ -1085,6 +1089,7 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, break; case KEY_SYSRQ: +sysrq: /* * Real AT keyboards (that's what we're trying * to emulate here emit 0xe0 0x2a 0xe0 0x37 when @@ -1181,7 +1186,8 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ - if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { + if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) && + (sysrq_down || (down == 1 && sysrq_alt))) { if (!sysrq_down) { sysrq_down = down; sysrq_alt_use = sysrq_alt; diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 76b2793..b8c76ba 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -29,16 +29,22 @@ #include #include +#include + /* These are global because they are accessed in tty_io.c */ #ifdef CONFIG_UNIX98_PTYS struct tty_driver *ptm_driver; -static struct tty_driver *pts_driver; +struct tty_driver *pts_driver; +EXPORT_SYMBOL(ptm_driver); +EXPORT_SYMBOL(pts_driver); #endif static void pty_close(struct tty_struct * tty, struct file * filp) { if (!tty) return; + + ub_pty_uncharge(tty); if (tty->driver->subtype == PTY_TYPE_MASTER) { if (tty->count > 1) printk("master pty_close: count = %d!!\n", tty->count); @@ -58,8 +64,12 @@ static void pty_close(struct tty_struct * tty, struct file * filp) if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS - if (tty->driver == ptm_driver) + if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { + struct ve_struct *old_env; + old_env = set_exec_env(tty->owner_env); devpts_pty_kill(tty->index); + (void)set_exec_env(old_env); + } #endif tty_vhangup(tty->link); } @@ -212,6 +222,10 @@ static int pty_open(struct tty_struct *tty, struct file * filp) if (tty->link->count != 1) goto out; + retval = -ENOMEM; + if (ub_pty_charge(tty)) + goto out; + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); 
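/*
 * Editor's aside (not part of the patch): the pty.c hunks above pair every
 * successful pty_open() with a beancounter charge and every pty_close() with
 * the matching uncharge, so a container cannot exhaust the host's PTYs; the
 * open fails with -ENOMEM once the limit is hit.  Which UBC resource is
 * charged is not visible in this hunk and is therefore an assumption.
 * Compile-only sketch of the pairing:
 */
struct tty_struct;
int  ub_pty_charge(struct tty_struct *tty);
void ub_pty_uncharge(struct tty_struct *tty);

int pty_open_accounting(struct tty_struct *tty)
{
    if (ub_pty_charge(tty))
        return -12;     /* -ENOMEM, mirroring pty_open() above */
    /* ... proceed with the open; pty_close() later calls ub_pty_uncharge(tty) ... */
    return 0;
}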
set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); @@ -239,7 +253,9 @@ static const struct tty_operations pty_ops = { /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS -static struct tty_driver *pty_driver, *pty_slave_driver; +struct tty_driver *pty_driver, *pty_slave_driver; +EXPORT_SYMBOL(pty_driver); +EXPORT_SYMBOL(pty_slave_driver); static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) @@ -452,6 +468,9 @@ static void __init unix98_pty_init(void) pty_table[1].data = &ptm_driver->refcount; register_sysctl_table(pty_root_table); +#ifdef CONFIG_VE + get_ve0()->ptm_driver = ptm_driver; +#endif } #else static inline void unix98_pty_init(void) { } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 8fdfe9c..111b7a6 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -241,9 +243,16 @@ static struct sysrq_key_op sysrq_showallcpus_op = { static void sysrq_handle_showregs(int key, struct tty_struct *tty) { struct pt_regs *regs = get_irq_regs(); + + bust_spinlocks(1); if (regs) show_regs(regs); + bust_spinlocks(0); +#if defined(__i386__) || defined(__x86_64__) + smp_nmi_call_function(smp_show_regs, NULL, 1); +#endif } + static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, .help_msg = "showPc", @@ -277,6 +286,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { static void sysrq_handle_showmem(int key, struct tty_struct *tty) { show_mem(); + show_slab_info(); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, @@ -292,7 +302,7 @@ static void send_sig_all(int sig) { struct task_struct *p; - for_each_process(p) { + for_each_process_all(p) { if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); @@ -354,7 +364,267 @@ static struct sysrq_key_op sysrq_unrt_op = { /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); -static struct sysrq_key_op *sysrq_key_table[36] = { +#define SYSRQ_KEY_TABLE_LENGTH 37 +static struct sysrq_key_op **sysrq_key_table; +static struct sysrq_key_op *sysrq_default_key_table[]; + +#ifdef CONFIG_SYSRQ_DEBUG +#define SYSRQ_NAMELEN_MAX 64 +#define SYSRQ_DUMP_LINES 32 + +static struct sysrq_key_op *sysrq_debug_key_table[]; +static struct sysrq_key_op *sysrq_input_key_table[]; +static unsigned long *dump_address; +static int orig_console_loglevel; +static void (*sysrq_input_return)(char *) = NULL; + +static void dump_mem(void) +{ + unsigned long value[4]; + mm_segment_t old_fs; + int line, err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = 0; + + for (line = 0; line < SYSRQ_DUMP_LINES; line++) { + err |= __get_user(value[0], dump_address++); + err |= __get_user(value[1], dump_address++); + err |= __get_user(value[2], dump_address++); + err |= __get_user(value[3], dump_address++); + if (err) { + printk("Invalid address %p\n", dump_address - 4); + break; + } +#if BITS_PER_LONG == 32 + printk("0x%p: %08lx %08lx %08lx %08lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#else + printk("0x%p: %016lx %016lx %016lx %016lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#endif + } + set_fs(old_fs); +} + +static void write_mem(unsigned long val) +{ + mm_segment_t old_fs; + unsigned long old_val; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (__get_user(old_val, dump_address)) { + printk("Invalid address %p\n", dump_address); + goto out; + } + +#if 
BITS_PER_LONG == 32 + printk("Changing [%p] from %08lx to %08lx\n", + dump_address, old_val, val); +#else + printk("Changing [%p] from %016lx to %016lx\n", + dump_address, old_val, val); +#endif + __put_user(val, dump_address); +out: + set_fs(old_fs); +} + +static void handle_read(int key, struct tty_struct *tty) +{ + static int pos; + static int upper_case; + static char str[SYSRQ_NAMELEN_MAX]; + + if (key == 0) { + /* actually 0 is not shift only... */ + upper_case = 1; + return; + } + + if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) { + /* enter */ + sysrq_key_table = sysrq_debug_key_table; + str[pos] = '\0'; + pos = upper_case = 0; + printk("\n"); + if (sysrq_input_return == NULL) + printk("No return handler!!!\n"); + else + sysrq_input_return(str); + return; + }; + + /* check for alowed symbols */ + if (key == '-') { + if (upper_case) + key = '_'; + goto correct; + }; + if (key >= 'a' && key <= 'z') { + if (upper_case) + key = key - 'a' + 'A'; + goto correct; + }; + if (key >= '0' && key <= '9') + goto correct; + + upper_case = 0; + return; + +correct: + str[pos] = key; + printk("%c", (char)key); + pos++; + upper_case = 0; +} + +static struct sysrq_key_op input_read = { + .handler = handle_read, + .help_msg = "", + .action_msg = NULL, +}; + +static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read, +}; + +static void return_dump_mem(char *str) +{ + unsigned long address; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '\0') { + printk("Bad address [%s]\n", str); + return; + } + + dump_address = (unsigned long *)address; + dump_mem(); +} + +static void handle_dump_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_dump_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_dump_mem = { + .handler = handle_dump_mem, + .help_msg = "Dump", + .action_msg = "Enter address:", +}; + +static void return_resolve(char *str) +{ + unsigned long address; + + address = kallsyms_lookup_name(str); + printk("%s : %lx\n", str, address); + if (address) { + dump_address = (unsigned long *)address; + printk("Now you can dump it via X\n"); + } +} + +static void handle_resolve(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_resolve; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_resolve = { + .handler = handle_resolve, + .help_msg = "Resolve", + .action_msg = "Enter symbol name:", +}; + +static void return_write_mem(char *str) +{ + unsigned long address; + unsigned long value; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '-') { + printk("Bad address in %s\n", str); + return; + } + value = simple_strtoul(end + 1, &end, 0); + if (*end != '\0') { + printk("Bad value in %s\n", str); + return; + } + + dump_address = (unsigned long *)address; + write_mem(value); +} + +static void handle_write_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_write_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_write_mem = { + .handler = handle_write_mem, + .help_msg = "Writemem", + .action_msg = "Enter address-value:", +}; + +static void handle_next(int key, struct tty_struct *tty) +{ + dump_mem(); +} + +static struct sysrq_key_op debug_next = { + .handler = handle_next, + .help_msg = "neXt", + .action_msg = "continuing", +}; + +static void handle_quit(int key, struct tty_struct *tty) +{ + sysrq_key_table = sysrq_default_key_table; + 
console_loglevel = orig_console_loglevel; +} + +static struct sysrq_key_op debug_quit = { + .handler = handle_quit, + .help_msg = "Quit", + .action_msg = "Tnahk you for using debugger", +}; + +static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [13] = &debug_dump_mem, /* d */ + [26] = &debug_quit, /* q */ + [27] = &debug_resolve, /* r */ + [32] = &debug_write_mem, /* w */ + [33] = &debug_next, /* x */ +}; + +static void sysrq_handle_debug(int key, struct tty_struct *tty) +{ + orig_console_loglevel = console_loglevel; + console_loglevel = 8; + sysrq_key_table = sysrq_debug_key_table; + printk("Welcome sysrq debugging mode\n" + "Press H for help\n"); +} + +static struct sysrq_key_op sysrq_debug_op = { + .handler = sysrq_handle_debug, + .help_msg = "debuG", + .action_msg = "Select desired action", +}; +#endif + +static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = { &sysrq_loglevel_op, /* 0 */ &sysrq_loglevel_op, /* 1 */ &sysrq_loglevel_op, /* 2 */ @@ -377,7 +647,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_term_op, /* e */ &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ +#ifdef CONFIG_SYSRQ_DEBUG + &sysrq_debug_op, /* g */ +#else NULL, /* g */ +#endif NULL, /* h */ &sysrq_kill_op, /* i */ NULL, /* j */ @@ -404,9 +678,12 @@ static struct sysrq_key_op *sysrq_key_table[36] = { NULL, /* x */ /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ - NULL /* z */ + NULL, /* z */ + NULL, /* for debugger */ }; +static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table; + /* key2index calculation, -1 on invalid index */ static int sysrq_key_table_key2index(int key) { @@ -416,6 +693,10 @@ static int sysrq_key_table_key2index(int key) retval = key - '0'; else if ((key >= 'a') && (key <= 'z')) retval = key + 10 - 'a'; +#ifdef CONFIG_SYSRQ_DEBUG + else if (key == 0 || key == 0x0d || key == '-') + retval = SYSRQ_KEY_TABLE_LENGTH - 1; +#endif else retval = -1; return retval; @@ -457,7 +738,6 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) spin_lock_irqsave(&sysrq_key_table_lock, flags); orig_log_level = console_loglevel; console_loglevel = 7; - printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); if (op_p) { @@ -466,16 +746,17 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) * should not) and is the invoked operation enabled? */ if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { - printk("%s\n", op_p->action_msg); + if (op_p->action_msg) + printk("%s\n", op_p->action_msg); console_loglevel = orig_log_level; op_p->handler(key, tty); } else { printk("This sysrq operation is disabled.\n"); } } else { - printk("HELP : "); + printk("SysRq HELP : "); /* Only print the help msg once per handler */ - for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { + for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) { if (sysrq_key_table[i]) { int j; diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index e4dce87..186e878 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -96,6 +96,8 @@ #include #include #include +#include +#include #include #include @@ -106,6 +108,7 @@ #include #include +#include #undef TTY_DEBUG_HANGUP @@ -130,6 +133,7 @@ EXPORT_SYMBOL(tty_std_termios); into this file */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ +EXPORT_SYMBOL(tty_drivers); /* Mutex to protect creating and releasing a tty. 
This is shared with vt.c for deeply disgusting hack reasons */ @@ -137,7 +141,11 @@ DEFINE_MUTEX(tty_mutex); EXPORT_SYMBOL(tty_mutex); #ifdef CONFIG_UNIX98_PTYS +#ifdef CONFIG_VE +#define ptm_driver (get_exec_env()->ptm_driver) +#else extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +#endif static int ptmx_open(struct inode *, struct file *); #endif @@ -173,7 +181,7 @@ static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty); static struct tty_struct *alloc_tty_struct(void) { - return kzalloc(sizeof(struct tty_struct), GFP_KERNEL); + return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC); } static void tty_buffer_free_all(struct tty_struct *); @@ -675,9 +683,29 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index) if (device < base || device >= base + p->num) continue; *index = device - base; - return p; +#ifdef CONFIG_VE + if (in_interrupt()) + goto found; + if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR +#ifdef CONFIG_UNIX98_PTYS + && (p->majormajor>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && + (p->majormajor>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) +#endif + ) + goto found; + if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env())) + goto found; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + continue; +#endif + goto found; } return NULL; + +found: + return p; } #ifdef CONFIG_CONSOLE_POLL @@ -1632,13 +1660,21 @@ static void tty_line_name(struct tty_driver *driver, int index, char *p) */ static int init_dev(struct tty_driver *driver, int idx, - struct tty_struct **ret_tty) + struct tty_struct *i_tty, struct tty_struct **ret_tty) { struct tty_struct *tty, *o_tty; struct ktermios *tp, **tp_loc, *o_tp, **o_tp_loc; struct ktermios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; + struct ve_struct * owner; int retval = 0; + owner = driver->owner_env; + + if (i_tty) { + tty = i_tty; + goto fast_track; + } + /* check whether we're reopening an existing tty */ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { tty = devpts_get_tty(idx); @@ -1688,6 +1724,7 @@ static int init_dev(struct tty_driver *driver, int idx, tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); + tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { tp_loc = &tty->termios; @@ -1698,14 +1735,14 @@ static int init_dev(struct tty_driver *driver, int idx, } if (!*tp_loc) { - tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL); + tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!tp) goto free_mem_out; *tp = driver->init_termios; } if (!*ltp_loc) { - ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL); + ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!ltp) goto free_mem_out; } @@ -1719,6 +1756,7 @@ static int init_dev(struct tty_driver *driver, int idx, o_tty->ops = driver->ops; o_tty->index = idx; tty_line_name(driver->other, idx, o_tty->name); + o_tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { o_tp_loc = &o_tty->termios; @@ -1729,14 +1767,14 @@ static int init_dev(struct tty_driver *driver, int idx, } if (!*o_tp_loc) { - o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL); + o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!o_tp) goto free_mem_out; *o_tp = driver->other->init_termios; } if (!*o_ltp_loc) { - o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL); + o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!o_ltp) goto free_mem_out; } @@ -1752,6 +1790,10 @@ static int init_dev(struct tty_driver *driver, int idx, 
*o_ltp_loc = o_ltp; o_tty->termios = *o_tp_loc; o_tty->termios_locked = *o_ltp_loc; +#ifdef CONFIG_VE + if (driver->other->refcount == 0) + (void)get_ve(owner); +#endif driver->other->refcount++; if (driver->subtype == PTY_TYPE_MASTER) o_tty->count++; @@ -1775,6 +1817,10 @@ static int init_dev(struct tty_driver *driver, int idx, *ltp_loc = ltp; tty->termios = *tp_loc; tty->termios_locked = *ltp_loc; +#ifdef CONFIG_VE + if (driver->refcount == 0) + (void)get_ve(owner); +#endif /* Compatibility until drivers always set this */ tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios); tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios); @@ -1888,7 +1934,8 @@ static void release_one_tty(struct tty_struct *tty, int idx) tty->magic = 0; tty->driver->refcount--; - + if (tty->driver->refcount == 0) + put_ve(tty->owner_env); file_list_lock(); list_del_init(&tty->tty_files); file_list_unlock(); @@ -2171,7 +2218,7 @@ static void release_dev(struct file *filp) static int __tty_open(struct inode *inode, struct file *filp) { - struct tty_struct *tty; + struct tty_struct *tty, *c_tty; int noctty, retval; struct tty_driver *driver; int index; @@ -2184,6 +2231,7 @@ retry_open: noctty = filp->f_flags & O_NOCTTY; index = -1; retval = 0; + c_tty = NULL; mutex_lock(&tty_mutex); @@ -2195,6 +2243,7 @@ retry_open: } driver = tty->driver; index = tty->index; + c_tty = tty; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ goto got_driver; @@ -2202,6 +2251,12 @@ retry_open: #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR, 0)) { extern struct tty_driver *console_driver; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_driver; index = fg_console; noctty = 1; @@ -2209,6 +2264,12 @@ retry_open: } #endif if (device == MKDEV(TTYAUX_MAJOR, 1)) { +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_device(&index); if (driver) { /* Don't let /dev/console block */ @@ -2226,7 +2287,7 @@ retry_open: return -ENODEV; } got_driver: - retval = init_dev(driver, index, &tty); + retval = init_dev(driver, index, c_tty, &tty); mutex_unlock(&tty_mutex); if (retval) return retval; @@ -2323,7 +2384,7 @@ static int __ptmx_open(struct inode *inode, struct file *filp) return index; mutex_lock(&tty_mutex); - retval = init_dev(ptm_driver, index, &tty); + retval = init_dev(ptm_driver, index, NULL, &tty); mutex_unlock(&tty_mutex); if (retval) @@ -2589,6 +2650,8 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!ve_is_super(get_exec_env())) + return -EACCES; if (file->f_op->write == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); @@ -3160,7 +3223,7 @@ void __do_SAK(struct tty_struct *tty) /* Now kill any processes that happen to have the * tty open. 
*/ - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): task_session_nr(p)==tty->session\n", @@ -3192,7 +3255,7 @@ void __do_SAK(struct tty_struct *tty) spin_unlock(&p->files->file_lock); } task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); #endif } @@ -3527,6 +3590,7 @@ int tty_register_driver(struct tty_driver *driver) } mutex_lock(&tty_mutex); + driver->owner_env = get_exec_env(); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); @@ -3725,3 +3789,43 @@ static int __init tty_init(void) return 0; } module_init(tty_init); + +#ifdef CONFIG_UNIX98_PTYS +int init_ve_tty_class(void) +{ + struct class * ve_tty_class; + struct device * ve_ptmx_dev_class; + + ve_tty_class = class_create(THIS_MODULE, "tty"); + if (IS_ERR(ve_tty_class)) + return -ENOMEM; + + ve_ptmx_dev_class = device_create(ve_tty_class, NULL, + MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); + if (IS_ERR(ve_ptmx_dev_class)) { + class_destroy(ve_tty_class); + return PTR_ERR(ve_ptmx_dev_class); + } + + get_exec_env()->tty_class = ve_tty_class; + return 0; +} + +void fini_ve_tty_class(void) +{ + struct class *ve_tty_class = get_exec_env()->tty_class; + + device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2)); + class_destroy(ve_tty_class); +} +#else +int init_ve_tty_class(void) +{ + return 0; +} +void fini_ve_tty_class(void) +{ +} +#endif +EXPORT_SYMBOL(init_ve_tty_class); +EXPORT_SYMBOL(fini_ve_tty_class); diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 7629c90..6e9042c 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -27,6 +27,10 @@ gianfar_driver-objs := gianfar.o \ obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o ucc_geth_driver-objs := ucc_geth.o ucc_geth_mii.o ucc_geth_ethtool.o +obj-$(CONFIG_VE_NETDEV) += vznetdev.o +vznetdev-objs := open_vznet.o venet_core.o +obj-$(CONFIG_VE_ETHDEV) += vzethdev.o + # # link order important here # diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 3b43bfd..b59c38a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -72,6 +72,12 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) { struct pcpu_lstats *pcpu_lstats, *lb_stats; +#ifdef CONFIG_VE + if (unlikely(get_exec_env()->disable_net)) { + kfree_skb(skb); + return 0; + } +#endif skb_orphan(skb); skb->protocol = eth_type_trans(skb,dev); @@ -173,7 +179,8 @@ static void loopback_setup(struct net_device *dev) | NETIF_F_NO_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX - | NETIF_F_NETNS_LOCAL; + | NETIF_F_NETNS_LOCAL + | NETIF_F_VIRTUAL; dev->ethtool_ops = &loopback_ethtool_ops; dev->header_ops = ð_header_ops; dev->init = loopback_dev_init; diff --git a/drivers/net/open_vznet.c b/drivers/net/open_vznet.c new file mode 100644 index 0000000..79bf640 --- /dev/null +++ b/drivers/net/open_vznet.c @@ -0,0 +1,244 @@ +/* + * open_vznet.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +/* + * Virtual Networking device used to change VE ownership on packets + */ + +#include +#include +#include + +#include +#include +#include +#include + +void veip_stop(struct ve_struct *ve) +{ + struct list_head *p, *tmp; + + write_lock_irq(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_safe(p, tmp, &ve->veip->ip_lh) { + struct ip_entry_struct *ptr; + ptr = list_entry(p, struct ip_entry_struct, ve_list); + ptr->active_env = NULL; + list_del(&ptr->ve_list); + list_del(&ptr->ip_hash); + kfree(ptr); + } + veip_put(ve->veip); + ve->veip = NULL; + if (!ve_is_super(ve)) + module_put(THIS_MODULE); +unlock: + write_unlock_irq(&veip_hash_lock); +} + +int veip_start(struct ve_struct *ve) +{ + int err, get; + + err = 0; + write_lock_irq(&veip_hash_lock); + get = ve->veip == NULL; + ve->veip = veip_findcreate(ve->veid); + if (ve->veip == NULL) + err = -ENOMEM; + write_unlock_irq(&veip_hash_lock); + if (err == 0 && get && !ve_is_super(ve)) + __module_get(THIS_MODULE); + return err; +} + +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry, *found; + int err; + + entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + + if (ve->veip == NULL) { + /* This can happen if we load venet AFTER ve was started */ + err = veip_start(ve); + if (err < 0) + goto out; + } + + write_lock_irq(&veip_hash_lock); + err = -EADDRINUSE; + found = venet_entry_lookup(addr); + if (found != NULL) + goto out_unlock; + + entry->active_env = ve; + entry->addr = *addr; + ip_entry_hash(entry, ve->veip); + + err = 0; + entry = NULL; +out_unlock: + write_unlock_irq(&veip_hash_lock); +out: + if (entry != NULL) + kfree(entry); + return err; +} + +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *found; + int err; + + err = -EADDRNOTAVAIL; + write_lock_irq(&veip_hash_lock); + found = venet_entry_lookup(addr); + if (found == NULL) + goto out; + if (found->active_env->veid != veid) + goto out; + + err = 0; + found->active_env = NULL; + + list_del(&found->ip_hash); + list_del(&found->ve_list); + kfree(found); +out: + write_unlock_irq(&veip_hash_lock); + return err; +} + +static int skb_extract_addr(struct sk_buff *skb, + struct ve_addr_struct *addr, int dir) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + addr->family = AF_INET; + addr->key[0] = 0; + addr->key[1] = 0; + addr->key[2] = 0; + addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr); + return 0; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case __constant_htons(ETH_P_IPV6): + addr->family = AF_INET6; + memcpy(&addr->key, dir ? 
+ ipv6_hdr(skb)->daddr.s6_addr32 : + ipv6_hdr(skb)->saddr.s6_addr32, + sizeof(addr->key)); + return 0; +#endif + } + + return -EAFNOSUPPORT; +} + +static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir) +{ + struct ip_entry_struct *entry; + struct ve_addr_struct addr; + + if (skb_extract_addr(skb, &addr, dir) < 0) + return NULL; + + entry = venet_entry_lookup(&addr); + if (entry == NULL) + return NULL; + + return entry->active_env; +} + +int venet_change_skb_owner(struct sk_buff *skb) +{ + struct ve_struct *ve, *ve_old; + + ve_old = skb->owner_env; + + read_lock(&veip_hash_lock); + if (!ve_is_super(ve_old)) { + /* from VE to host */ + ve = venet_find_ve(skb, 0); + if (ve == NULL) + goto out_drop; + if (!ve_accessible_strict(ve, ve_old)) + goto out_source; + skb->owner_env = get_ve0(); + } else { + /* from host to VE */ + ve = venet_find_ve(skb, 1); + if (ve == NULL) + goto out_drop; + skb->owner_env = ve; + } + read_unlock(&veip_hash_lock); + + return 0; + +out_drop: + read_unlock(&veip_hash_lock); + return -ESRCH; + +out_source: + read_unlock(&veip_hash_lock); + if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) { + printk(KERN_WARNING "Dropped packet, source wrong " + "veid=%u src-IP=%u.%u.%u.%u " + "dst-IP=%u.%u.%u.%u\n", + skb->owner_env->veid, + NIPQUAD(ip_hdr(skb)->saddr), + NIPQUAD(ip_hdr(skb)->daddr)); + } + return -EACCES; +} + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct ip_entry_struct *entry; + char s[40]; + + p = (struct list_head *)v; + if (p == ip_entry_hash_table) { + seq_puts(m, "Version: 2.5\n"); + return 0; + } + entry = list_entry(p, struct ip_entry_struct, ip_hash); + veaddr_print(s, sizeof(s), &entry->addr); + seq_printf(m, "%39s %10u\n", s, 0); + return 0; +} +#endif + +__exit void veip_cleanup(void) +{ + int i; + + write_lock_irq(&veip_hash_lock); + for (i = 0; i < VEIP_HASH_SZ; i++) + while (!list_empty(ip_entry_hash_table + i)) { + struct ip_entry_struct *entry; + + entry = list_first_entry(ip_entry_hash_table + i, + struct ip_entry_struct, ip_hash); + list_del(&entry->ip_hash); + kfree(entry); + } + write_unlock_irq(&veip_hash_lock); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 6daea0c..bebd95e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -80,33 +80,6 @@ static int debug; #define DBG1( a... ) #endif -#define FLT_EXACT_COUNT 8 -struct tap_filter { - unsigned int count; /* Number of addrs. Zero means disabled */ - u32 mask[2]; /* Mask of the hashed addrs */ - unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; -}; - -struct tun_struct { - struct list_head list; - unsigned int flags; - int attached; - uid_t owner; - gid_t group; - - wait_queue_head_t read_wait; - struct sk_buff_head readq; - - struct net_device *dev; - struct fasync_struct *fasync; - - struct tap_filter txflt; - -#ifdef TUN_DEBUG - int debug; -#endif -}; - /* TAP filterting */ static void addr_hash_set(u32 *mask, const u8 *addr) { @@ -213,19 +186,18 @@ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) /* Network device part of the driver */ -static unsigned int tun_net_id; -struct tun_net { - struct list_head dev_list; -}; +unsigned int tun_net_id; +EXPORT_SYMBOL(tun_net_id); static const struct ethtool_ops tun_ethtool_ops; /* Net device open. 
*/ -static int tun_net_open(struct net_device *dev) +int tun_net_open(struct net_device *dev) { netif_start_queue(dev); return 0; } +EXPORT_SYMBOL(tun_net_open); /* Net device close. */ static int tun_net_close(struct net_device *dev) @@ -306,7 +278,7 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu) } /* Initialize net device. */ -static void tun_net_init(struct net_device *dev) +void tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -336,6 +308,7 @@ static void tun_net_init(struct net_device *dev) break; } } +EXPORT_SYMBOL(tun_net_init); /* Character device part */ @@ -666,7 +639,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, return ret; } -static void tun_setup(struct net_device *dev) +void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -683,6 +656,7 @@ static void tun_setup(struct net_device *dev) dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } +EXPORT_SYMBOL(tun_setup); static struct tun_struct *tun_get_by_name(struct tun_net *tn, const char *name) { @@ -715,7 +689,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) current->euid != tun->owner) || (tun->group != -1 && current->egid != tun->group)) && - !capable(CAP_NET_ADMIN)) + !capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; } else if (__dev_get_by_name(net, ifr->ifr_name)) @@ -790,6 +765,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) file->private_data = tun; tun->attached = 1; get_net(dev_net(tun->dev)); + tun->bind_file = file; /* Make sure persistent devices do not get stuck in * xoff state. @@ -1053,13 +1029,14 @@ out: return ret; } -static int tun_chr_open(struct inode *inode, struct file * file) +int tun_chr_open(struct inode *inode, struct file * file) { cycle_kernel_lock(); DBG1(KERN_INFO "tunX: tun_chr_open\n"); file->private_data = NULL; return 0; } +EXPORT_SYMBOL(tun_chr_open); static int tun_chr_close(struct inode *inode, struct file *file) { diff --git a/drivers/net/venet_core.c b/drivers/net/venet_core.c new file mode 100644 index 0000000..8e605b6 --- /dev/null +++ b/drivers/net/venet_core.c @@ -0,0 +1,768 @@ +/* + * venet_core.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Common part for Virtuozzo virtual network devices + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. 
*/ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include +#include + +struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; +rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(veip_lh); + +#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) + +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) +{ + list_add(&entry->ip_hash, + ip_entry_hash_table + + ip_entry_hash_function(entry->addr.key[3])); + list_add(&entry->ve_list, &veip->ip_lh); +} + +void veip_put(struct veip_struct *veip) +{ + if (!list_empty(&veip->ip_lh)) + return; + if (!list_empty(&veip->src_lh)) + return; + if (!list_empty(&veip->dst_lh)) + return; + + list_del(&veip->list); + kfree(veip); +} + +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry; + + list_for_each_entry (entry, ip_entry_hash_table + + ip_entry_hash_function(addr->key[3]), ip_hash) + if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) + return entry; + return NULL; +} + +struct veip_struct *veip_find(envid_t veid) +{ + struct veip_struct *ptr; + + list_for_each_entry(ptr, &veip_lh, list) { + if (ptr->veid != veid) + continue; + return ptr; + } + return NULL; +} + +struct veip_struct *veip_findcreate(envid_t veid) +{ + struct veip_struct *ptr; + + ptr = veip_find(veid); + if (ptr != NULL) + return ptr; + + ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); + if (ptr == NULL) + return NULL; + memset(ptr, 0, sizeof(struct veip_struct)); + INIT_LIST_HEAD(&ptr->ip_lh); + INIT_LIST_HEAD(&ptr->src_lh); + INIT_LIST_HEAD(&ptr->dst_lh); + ptr->veid = veid; + list_add(&ptr->list, &veip_lh); + return ptr; +} + +static int convert_sockaddr(struct sockaddr *addr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in)) + break; + + err = 0; + sin = (struct sockaddr_in *)addr; + veaddr->family = AF_INET; + veaddr->key[0] = 0; + veaddr->key[1] = 0; + veaddr->key[2] = 0; + veaddr->key[3] = sin->sin_addr.s_addr; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in6)) + break; + + err = 0; + sin = (struct sockaddr_in6 *)addr; + veaddr->family = AF_INET6; + memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key)); + break; + } + default: + err = -EAFNOSUPPORT; + } + return err; +} + +int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + char addr[MAX_SOCK_ADDR]; + + err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr *)&addr); + if (err < 0) + goto out; + + err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr); +out: + return err; +} + +void veaddr_print(char *str, int len, struct ve_addr_struct *a) +{ + if (a->family == AF_INET) + snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3])); + else + snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x", + ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF, + ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF, + ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF, + ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF + ); +} + +/* + * Device functions + */ + +static int venet_open(struct net_device *dev) +{ + if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE)) + return -EBUSY; + return 0; +} + +static int venet_close(struct net_device *master) +{ + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); + return 0; +} + +static void 
venet_destructor(struct net_device *dev) +{ + struct venet_stats *stats = (struct venet_stats *)dev->priv; + if (stats == NULL) + return; + free_percpu(stats->real_stats); + kfree(stats); + dev->priv = NULL; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). + */ +static int venet_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + int length; + + stats = venet_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + struct iphdr *iph; + iph = ip_hdr(skb); + if (ipv4_is_multicast(iph->daddr)) + goto outf; + } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h; + ip6h = ipv6_hdr(skb); + if (ipv6_addr_is_multicast(&ip6h->daddr)) + goto outf; + skb_orphan(skb); + } else { + goto outf; + } + + if (venet_change_skb_owner(skb) < 0) + goto outf; + + if (unlikely(skb->owner_env->disable_net)) + goto outf; + + rcv = skb->owner_env->_venet_dev; + if (!rcv) + /* VE going down */ + goto outf; + + dev_hold(rcv); + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + dev_put(rcv); + goto outf; + } + + skb->pkt_type = PACKET_HOST; + skb->dev = rcv; + + skb_reset_mac_header(skb); + memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); + + dst_release(skb->dst); + skb->dst = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + + rcv_stats = venet_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + dev_put(rcv); + } + + return 0; + +outf: + kfree_skb(skb); + ++stats->tx_dropped; + return 0; +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct venet_stats *stats; + + stats = (struct venet_stats *)dev->priv; + memset(&stats->stats, 0, sizeof(struct net_device_stats)); + for (i=0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = venet_stats(dev, i); + stats->stats.rx_bytes += dev_stats->rx_bytes; + stats->stats.tx_bytes += dev_stats->tx_bytes; + stats->stats.rx_packets += dev_stats->rx_packets; + stats->stats.tx_packets += dev_stats->tx_packets; + } + + return &stats->stats; +} + +/* Initialize the rest of the LOOPBACK device. */ +int venet_init_dev(struct net_device *dev) +{ + struct venet_stats *stats; + + dev->hard_start_xmit = venet_xmit; + stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL); + if (stats == NULL) + goto fail; + stats->real_stats = alloc_percpu(struct net_device_stats); + if (stats->real_stats == NULL) + goto fail_free; + dev->priv = stats; + + dev->get_stats = get_stats; + dev->open = venet_open; + dev->stop = venet_close; + dev->destructor = venet_destructor; + + /* + * Fill in the generic fields of the device structure. + */ + dev->type = ARPHRD_VOID; + dev->hard_header_len = ETH_HLEN; + dev->mtu = 1500; /* eth_mtu */ + dev->tx_queue_len = 0; + + memset(dev->broadcast, 0xFF, ETH_ALEN); + + /* New-style flags. 
*/ + dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; + return 0; + +fail_free: + kfree(stats); +fail: + return -ENOMEM; +} + +static int +venet_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + + struct ve_struct *ve; + int ret = 0; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + struct ve_struct *ve_old; + + ve_old = set_exec_env(ve); + read_lock(&dev_base_lock); + for_each_netdev(ve->ve_netns, dev) { + if (dev->hard_start_xmit == venet_xmit) + ret = fop(dev, data); + } + read_unlock(&dev_base_lock); + set_exec_env(ve_old); + + if (ret < 0) + break; + } + read_unlock(&ve_list_lock); + return ret; +} + +static unsigned long common_features; + +static int venet_op_set_sg(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_SG; + else + common_features &= ~NETIF_F_SG; + + return venet_set_op(dev, data, ethtool_op_set_sg); +} + +static int venet_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_IP_CSUM; + else + common_features &= ~NETIF_F_IP_CSUM; + + return venet_set_op(dev, data, ethtool_op_set_tx_csum); +} + +#define venet_op_set_rx_csum venet_op_set_tx_csum + +static struct ethtool_ops venet_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = venet_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = venet_op_set_tx_csum, + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = venet_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, +}; + +static void venet_setup(struct net_device *dev) +{ + dev->init = venet_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED; + + dev->features |= common_features; + + SET_ETHTOOL_OPS(dev, &venet_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +static int veinfo_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve; + struct ip_entry_struct *entry; + + ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); + + seq_printf(m, "%10u %5u %5u", ve->veid, + ve->class_id, atomic_read(&ve->pcounter)); + read_lock(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) { + char addr[40]; + + if (entry->active_env == NULL) + continue; + + veaddr_print(addr, sizeof(addr), &entry->addr); + if (entry->addr.family == AF_INET) + seq_printf(m, " %15s", addr); + else + seq_printf(m, " %39s", addr); + } +unlock: + read_unlock(&veip_hash_lock); + seq_putc(m, '\n'); + return 0; +} + +static struct seq_operations veinfo_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = veinfo_seq_show, +}; + +static int veinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veinfo_seq_op); +} + +static struct file_operations proc_veinfo_operations = { + .open = veinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *veip_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + int i; + + l = *pos; + write_lock_irq(&veip_hash_lock); + if (l == 0) + return ip_entry_hash_table; + for (i = 0; i < VEIP_HASH_SZ; i++) { + list_for_each(p, ip_entry_hash_table + i) { + if (--l == 0) + return p; + } + } + return NULL; +} + +static void *veip_seq_next(struct seq_file *m, 
void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + while (1) { + p = p->next; + if (p < ip_entry_hash_table || + p >= ip_entry_hash_table + VEIP_HASH_SZ) { + (*pos)++; + return p; + } + if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) + return NULL; + } + return NULL; +} + +static void veip_seq_stop(struct seq_file *m, void *v) +{ + write_unlock_irq(&veip_hash_lock); +} + +static struct seq_operations veip_seq_op = { + .start = veip_seq_start, + .next = veip_seq_next, + .stop = veip_seq_stop, + .show = veip_seq_show, +}; + +static int veip_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veip_seq_op); +} + +static struct file_operations proc_veip_operations = { + .open = veip_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr, + int addrlen) +{ + int err; + struct ve_struct *ve; + struct ve_addr_struct addr; + + err = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + err = sockaddr_to_veaddr(uaddr, addrlen, &addr); + if (err < 0) + goto out; + + switch (op) + { + case VE_IP_ADD: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veip_entry_add(ve, &addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_IP_DEL: + err = veip_entry_del(veid, &addr); + break; + default: + err = -EINVAL; + } + +out: + return err; +} + +int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VENETCTL_VE_IP_MAP: { + struct vzctl_ve_ip_map s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); + break; + } + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch(cmd) { + case VENETCTL_COMPAT_VE_IP_MAP: { + struct compat_vzctl_ve_ip_map cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr), + cs.addrlen); + break; + } + default: + err = venet_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo venetcalls = { + .type = VENETCTLTYPE, + .ioctl = venet_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_venet_ioctl, +#endif + .owner = THIS_MODULE, +}; + +int venet_dev_start(struct ve_struct *ve) +{ + struct net_device *dev_venet; + int err; + + dev_venet = alloc_netdev(0, "venet%d", venet_setup); + if (!dev_venet) + return -ENOMEM; + dev_net_set(dev_venet, ve->ve_netns); + err = dev_alloc_name(dev_venet, dev_venet->name); + if (err<0) + goto err; + if ((err = register_netdev(dev_venet)) != 0) + goto err; + ve->_venet_dev = dev_venet; + return 0; +err: + free_netdev(dev_venet); + printk(KERN_ERR "VENET initialization error err=%d\n", err); + return err; +} + +static int venet_start(void *data) +{ + struct ve_struct *env; + int err; + + env = (struct ve_struct *)data; + if (env->veip) + return -EEXIST; + + err = veip_start(env); + if (err != 0) + return err; + + err = venet_dev_start(env); + if (err) + goto err_free; + return 0; + +err_free: + veip_stop(env); + return err; +} + +static void venet_stop(void *data) +{ + struct ve_struct *env; + struct net_device *dev; + + env = (struct ve_struct *)data; + veip_stop(env); + + dev = env->_venet_dev; + if (dev == NULL) + return; + + 
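/*
 * veip_stop() above has already removed every IP-to-VE mapping for this VE,
 * so venet_change_skb_owner() can no longer pick it as a destination and
 * venet_xmit() drops its traffic; the per-VE device can now be unregistered
 * and freed safely.
 */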
unregister_netdev(dev); + env->_venet_dev = NULL; + free_netdev(dev); +} + +static struct ve_hook venet_ve_hook = { + .init = venet_start, + .fini = venet_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int venet_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; +#endif + int i, err; + + if (get_ve0()->_venet_dev != NULL) + return -EEXIST; + + for (i = 0; i < VEIP_HASH_SZ; i++) + INIT_LIST_HEAD(ip_entry_hash_table + i); + + err = venet_start(get_ve0()); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + de = proc_create("veinfo", S_IFREG | S_IRUSR, glob_proc_vz_dir, + &proc_veinfo_operations); + if (de == NULL) + printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); + + de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_veip_operations); + if (de == NULL) + printk(KERN_WARNING "venet: can't make veip proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); + vzioctl_register(&venetcalls); + return 0; +} + +__exit void venet_exit(void) +{ + vzioctl_unregister(&venetcalls); + ve_hook_unregister(&venet_ve_hook); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("veip", proc_vz_dir); + remove_proc_entry("veinfo", glob_proc_vz_dir); +#endif + venet_stop(get_ve0()); + veip_cleanup(); +} + +module_init(venet_init); +module_exit(venet_exit); diff --git a/drivers/net/vzethdev.c b/drivers/net/vzethdev.c new file mode 100644 index 0000000..1414618 --- /dev/null +++ b/drivers/net/vzethdev.c @@ -0,0 +1,692 @@ +/* + * veth.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Virtual ethernet device used to change VE ownership on packets + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. 
*/ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static LIST_HEAD(veth_hwaddr_list); +static DEFINE_RWLOCK(ve_hwaddr_lock); +static DECLARE_MUTEX(hwaddr_sem); + +struct net_device * veth_dev_start(char *dev_addr, char *name); + +struct veth_struct *hwaddr_entry_lookup(char *name) +{ + struct veth_struct *entry; + + list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) { + BUG_ON(entry->pair == NULL); + if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) + return entry; + } + return NULL; +} + +int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name, + char *dev_addr_ve, char *name_ve) +{ + struct net_device *dev_ve; + struct net_device *dev_ve0; + struct ve_struct *old_env; + char dev_name[IFNAMSIZ]; + int err; + + down(&hwaddr_sem); + + if (name[0] == '\0') + snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); + else { + memcpy(dev_name, name, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve0 = veth_dev_start(dev_addr, dev_name); + if (IS_ERR(dev_ve0)) { + err = PTR_ERR(dev_ve0); + goto err; + } + + old_env = set_exec_env(ve); + if (name_ve[0] == '\0') + sprintf(dev_name, "eth%%d"); + else { + memcpy(dev_name, name_ve, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve = veth_dev_start(dev_addr_ve, dev_name); + if (IS_ERR(dev_ve)) { + err = PTR_ERR(dev_ve); + goto err_ve; + } + set_exec_env(old_env); + veth_from_netdev(dev_ve)->pair = dev_ve0; + veth_from_netdev(dev_ve0)->pair = dev_ve; + + write_lock(&ve_hwaddr_lock); + list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + up(&hwaddr_sem); + return 0; + +err_ve: + set_exec_env(old_env); + unregister_netdev(dev_ve0); +err: + up(&hwaddr_sem); + return err; +} + +void veth_pair_del(struct ve_struct *env, struct veth_struct *entry) +{ + struct net_device *dev; + struct ve_struct *old_env; + + write_lock(&ve_hwaddr_lock); + list_del(&entry->hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + dev = entry->pair; + BUG_ON(entry->pair == NULL); + + veth_from_netdev(dev)->pair = NULL; + entry->pair = NULL; + rtnl_lock(); + old_env = set_exec_env(dev->owner_env); + dev_close(dev); + + /* + * Now device from VE0 does not send or receive anything, + * i.e. dev->hard_start_xmit won't be called. 
+ */ + set_exec_env(env); + unregister_netdevice(veth_to_netdev(entry)); + set_exec_env(dev->owner_env); + unregister_netdevice(dev); + set_exec_env(old_env); + rtnl_unlock(); +} + +int veth_entry_del(struct ve_struct *ve, char *name) +{ + struct veth_struct *found; + int err; + + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out; + if (veth_to_netdev(found)->owner_env != ve) + goto out; + + err = 0; + veth_pair_del(ve, found); + +out: + up(&hwaddr_sem); + return err; +} + +int veth_allow_change_mac(envid_t veid, char *name, int allow) +{ + struct ve_struct *ve; + struct veth_struct *found; + int err; + + err = -ESRCH; + ve = get_ve_by_id(veid); + if (!ve) + return err; + + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_ve; + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out_sem; + if (veth_to_netdev(found)->owner_env != ve) + goto out_sem; + + err = 0; + found->allow_mac_change = allow; + +out_sem: + up(&hwaddr_sem); +out_ve: + up_read(&ve->op_sem); + put_ve(ve); + return err; +} + +/* + * Device functions + */ + +static int veth_open(struct net_device *dev) +{ + return 0; +} + +static int veth_close(struct net_device *master) +{ + return 0; +} + +static void veth_destructor(struct net_device *dev) +{ + free_percpu(veth_from_netdev(dev)->real_stats); + free_netdev(dev); +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct net_device_stats *stats; + + stats = &veth_from_netdev(dev)->stats; + memset(stats, 0, sizeof(struct net_device_stats)); + for (i = 0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = veth_stats(dev, i); + stats->rx_bytes += dev_stats->rx_bytes; + stats->tx_bytes += dev_stats->tx_bytes; + stats->rx_packets += dev_stats->rx_packets; + stats->tx_packets += dev_stats->tx_packets; + } + + return stats; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). 
+ */ +static int veth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + struct veth_struct *entry; + int length; + + stats = veth_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + entry = veth_from_netdev(dev); + rcv = entry->pair; + if (!rcv) + /* VE going down */ + goto outf; + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + goto outf; + } + + if (unlikely(rcv->owner_env->disable_net)) + goto outf; + /* Filtering */ + if (ve_is_super(dev->owner_env) && + !veth_from_netdev(rcv)->allow_mac_change) { + /* from VE0 to VEX */ + if (ve_is_super(rcv->owner_env)) + goto out; + if (is_multicast_ether_addr( + ((struct ethhdr *)skb->data)->h_dest)) + goto out; + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, + rcv->dev_addr)) + goto outf; + } else if (!ve_is_super(dev->owner_env) && + !entry->allow_mac_change) { + /* from VE to VE0 */ + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source, + dev->dev_addr)) + goto outf; + } + +out: + skb->owner_env = rcv->owner_env; + + skb->dev = rcv; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, rcv); + + if (skb->protocol != __constant_htons(ETH_P_IP)) + skb_orphan(skb); + + dst_release(skb->dst); + skb->dst = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + rcv_stats = veth_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + } + + return 0; + +outf: + kfree_skb(skb); + stats->tx_dropped++; + return 0; +} + +static int veth_set_mac(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!ve_is_super(dev->owner_env) && + !veth_from_netdev(dev)->allow_mac_change) + return -EPERM; + if (netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + + return 0; +} + +int veth_init_dev(struct net_device *dev) +{ + dev->hard_start_xmit = veth_xmit; + dev->get_stats = get_stats; + dev->open = veth_open; + dev->stop = veth_close; + dev->destructor = veth_destructor; + + ether_setup(dev); + dev->set_mac_address = veth_set_mac; + + /* remove setted by ether_setup() handler */ + dev->change_mtu = NULL; + + dev->tx_queue_len = 0; + + veth_from_netdev(dev)->real_stats = + alloc_percpu(struct net_device_stats); + if (veth_from_netdev(dev)->real_stats == NULL) + return -ENOMEM; + + return 0; +} + +static int +veth_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + struct net_device *pair; + int ret = 0; + + ret = fop(dev, data); + if (ret < 0) + goto out; + + pair = veth_from_netdev(dev)->pair; + if (pair) + ret = fop(pair, data); +out: + return ret; +} + +static int veth_op_set_sg(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_sg); +} + +static int veth_op_set_tx_csum(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_tx_csum); +} + +#define veth_op_set_rx_csum veth_op_set_tx_csum + +static struct ethtool_ops veth_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = veth_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = veth_op_set_tx_csum, + .get_rx_csum = 
ethtool_op_get_tx_csum, + .set_rx_csum = veth_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, +}; + +static void veth_setup(struct net_device *dev) +{ + dev->init = veth_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA; + + SET_ETHTOOL_OPS(dev, &veth_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5] +static int vehwaddr_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct veth_struct *entry; + + p = (struct list_head *)v; + if (p == &veth_hwaddr_list) { + seq_puts(m, "Version: 1.0\n"); + return 0; + } + entry = list_entry(p, struct veth_struct, hwaddr_list); + seq_printf(m, ADDR_FMT " %16s ", + ADDR_ARG(entry->pair->dev_addr), entry->pair->name); + seq_printf(m, ADDR_FMT " %16s %10u %5s\n", + ADDR_ARG(veth_to_netdev(entry)->dev_addr), + veth_to_netdev(entry)->name, + VEID(veth_to_netdev(entry)->owner_env), + entry->allow_mac_change ? "allow" : "deny"); + return 0; +} + +static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos) +{ + read_lock(&ve_hwaddr_lock); + return seq_list_start_head(&veth_hwaddr_list, *pos); +} + +static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &veth_hwaddr_list, pos); +} + +static void vehwaddr_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_hwaddr_lock); +} + +static struct seq_operations vehwaddr_seq_op = { + .start = vehwaddr_seq_start, + .next = vehwaddr_seq_next, + .stop = vehwaddr_seq_stop, + .show = vehwaddr_seq_show, +}; + +static int vehwaddr_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vehwaddr_seq_op); +} + +static struct file_operations proc_vehwaddr_operations = { + .open = vehwaddr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +int real_ve_hwaddr(envid_t veid, int op, + unsigned char *dev_addr, int addrlen, char *name, + unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve) +{ + int err; + struct ve_struct *ve; + char ve_addr[ETH_ALEN]; + + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto out; + + err = -EINVAL; + switch (op) { + case VE_ETH_ADD: + if (addrlen != ETH_ALEN) + goto out; + if (addrlen_ve != ETH_ALEN && addrlen_ve != 0) + goto out; + /* If ve addr is not set then we use dev_addr[3] & 0x80 for it */ + if (addrlen_ve == 0 && (dev_addr[3] & 0x80)) + goto out; + if (addrlen_ve == 0) { + memcpy(ve_addr, dev_addr, ETH_ALEN); + ve_addr[3] |= 0x80; + } else { + memcpy(ve_addr, dev_addr_ve, ETH_ALEN); + } + + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_ETH_DEL: + if (name[0] == '\0') + goto out; + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_del(ve, name); + up_read(&ve->op_sem); + put_ve(ve); + break; + case VE_ETH_ALLOW_MAC_CHANGE: + case VE_ETH_DENY_MAC_CHANGE: + err = veth_allow_change_mac(veid, name, + op == VE_ETH_ALLOW_MAC_CHANGE); + break; + } + +out: + return err; +} + +int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VETHCTL_VE_HWADDR: { + struct vzctl_ve_hwaddr 
s; + + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen, + s.dev_name, s.dev_addr_ve, s.addrlen_ve, + s.dev_name_ve); + } + break; + } + return err; +} + +static struct vzioctlinfo vethcalls = { + .type = VETHCTLTYPE, + .ioctl = veth_ioctl, + .compat_ioctl = veth_ioctl, + .owner = THIS_MODULE, +}; + +struct net_device * veth_dev_start(char *dev_addr, char *name) +{ + struct net_device *dev; + int err; + + if (!is_valid_ether_addr(dev_addr)) + return ERR_PTR(-EADDRNOTAVAIL); + + dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); + if (!dev) + return ERR_PTR(-ENOMEM); + dev->nd_net = get_exec_env()->ve_netns; + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err; + } + if ((err = register_netdev(dev)) != 0) + goto err; + + memcpy(dev->dev_addr, dev_addr, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + return dev; +err: + free_netdev(dev); + printk(KERN_ERR "%s initialization error err=%d\n", name, err); + return ERR_PTR(err); +} + +static int veth_start(void *data) +{ + return 0; +} + +static void veth_stop(void *data) +{ + struct ve_struct *env; + struct veth_struct *entry, *tmp; + + env = (struct ve_struct *)data; + down(&hwaddr_sem); + list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list) + if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env)) + veth_pair_del(env, entry); + up(&hwaddr_sem); +} + +static struct ve_hook veth_ve_hook = { + .init = veth_start, + .fini = veth_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int veth_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; + + de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir, + &proc_vehwaddr_operations); + if (de == NULL) + printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &veth_ve_hook); + vzioctl_register(&vethcalls); + KSYMRESOLVE(veth_open); + KSYMMODRESOLVE(vzethdev); + return 0; +} + +__exit void veth_exit(void) +{ + struct veth_struct *entry; + struct list_head *tmp, *n; + struct ve_struct *ve; + + KSYMMODUNRESOLVE(vzethdev); + KSYMUNRESOLVE(veth_open); + vzioctl_unregister(&vethcalls); + ve_hook_unregister(&veth_ve_hook); +#ifdef CONFIG_PROC_FS + remove_proc_entry("veth", proc_vz_dir); +#endif + + down(&hwaddr_sem); + list_for_each_safe(tmp, n, &veth_hwaddr_list) { + entry = list_entry(tmp, struct veth_struct, hwaddr_list); + ve = get_ve(veth_to_netdev(entry)->owner_env); + + veth_pair_del(ve, entry); + + put_ve(ve); + } + up(&hwaddr_sem); +} + +module_init(veth_init); +module_exit(veth_exit); + +MODULE_AUTHOR("Andrey Mirkin "); +MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); +MODULE_LICENSE("GPL v2"); + diff --git a/fs/Kconfig b/fs/Kconfig index abccb5d..59091ad 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -562,13 +562,22 @@ config QUOTA_NETLINK_INTERFACE config PRINT_QUOTA_WARNING bool "Print quota warnings to console (OBSOLETE)" depends on QUOTA - default y + default n help If you say Y here, quota warnings (about exceeding softlimit, reaching hardlimit, etc.) will be printed to the process' controlling terminal. Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. 
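A minimal user-space sketch of the VETHCTL_VE_HWADDR ioctl added by vzethdev.c above: a management tool fills struct vzctl_ve_hwaddr and issues the ioctl on the Virtuozzo control device. The /dev/vzctl node name and the <linux/veth.h> header location are assumptions, not part of this patch; the VE id and MAC addresses are arbitrary illustration, and CAP_NET_ADMIN is required.

#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/veth.h>	/* assumed header for VETHCTL_VE_HWADDR, VE_ETH_ADD */

/* Create a veth pair for VE 101: "vz101.0" on the host, "eth0" inside the VE. */
static int add_veth_pair(void)
{
	struct vzctl_ve_hwaddr req;
	const unsigned char host_mac[6] = { 0x00, 0x18, 0x51, 0x01, 0x02, 0x03 };
	const unsigned char ve_mac[6]   = { 0x00, 0x18, 0x51, 0x81, 0x02, 0x03 };
	int fd, err;

	memset(&req, 0, sizeof(req));
	req.veid = 101;
	req.op = VE_ETH_ADD;
	memcpy(req.dev_addr, host_mac, sizeof(host_mac));
	req.addrlen = 6;		/* must be ETH_ALEN */
	strcpy(req.dev_name, "vz101.0");
	memcpy(req.dev_addr_ve, ve_mac, sizeof(ve_mac));
	req.addrlen_ve = 6;		/* 0 would derive the VE MAC from dev_addr */
	strcpy(req.dev_name_ve, "eth0");

	fd = open("/dev/vzctl", O_RDWR);	/* control node name is an assumption */
	if (fd < 0)
		return -1;
	err = ioctl(fd, VETHCTL_VE_HWADDR, &req);
	close(fd);
	return err;
}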
+config QUOTA_COMPAT + bool "Compatibility with older quotactl interface" + depends on QUOTA + help + This option enables compatibility layer for older version + of quotactl interface with byte granularity (QUOTAON at 0x0100, + GETQUOTA at 0x0D00). Interface versions older than that one and + with block granularity are still not supported. + config QFMT_V1 tristate "Old quota format support" depends on QUOTA @@ -584,6 +593,40 @@ config QFMT_V2 This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. +config SIM_FS + tristate "VPS filesystem" + depends on VZ_QUOTA + default m + help + This file system is a part of Virtuozzo. It intoduces a fake + superblock and blockdev to VE to hide real device and show + statfs results taken from quota. + +config VZ_QUOTA + tristate "Virtuozzo Disk Quota support" + select QUOTA + select QUOTA_COMPAT + select VZ_DEV + default m + help + Virtuozzo Disk Quota imposes disk quota on directories with their + files and subdirectories in total. Such disk quota is used to + account and limit disk usage by Virtuozzo VPS, but also may be used + separately. + +config VZ_QUOTA_UNLOAD + bool "Unloadable Virtuozzo Disk Quota module" + depends on VZ_QUOTA=m + default n + help + Make Virtuozzo Disk Quota module unloadable. + Doesn't work reliably now. + +config VZ_QUOTA_UGID + bool "Per-user and per-group quota in Virtuozzo quota partitions" + depends on VZ_QUOTA!=n + default y + config QUOTACTL bool depends on XFS_QUOTA || QUOTA diff --git a/fs/Makefile b/fs/Makefile index a1482a5..8a040bf 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -54,9 +54,15 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_VZ_QUOTA) += vzdquota.o +vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o obj-$(CONFIG_DNOTIFY) += dnotify.o +obj-$(CONFIG_SIM_FS) += simfs.o + obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ diff --git a/fs/aio.c b/fs/aio.c index f658441..2742c37 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -43,13 +43,16 @@ #endif /*------ sysctl variables----*/ -static DEFINE_SPINLOCK(aio_nr_lock); +DEFINE_SPINLOCK(aio_nr_lock); +EXPORT_SYMBOL_GPL(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ +EXPORT_SYMBOL_GPL(aio_nr); unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ static struct kmem_cache *kiocb_cachep; -static struct kmem_cache *kioctx_cachep; +struct kmem_cache *kioctx_cachep; +EXPORT_SYMBOL_GPL(kioctx_cachep); static struct workqueue_struct *aio_wq; @@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_routine); static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); -static void aio_kick_handler(struct work_struct *); +void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); /* aio_setup @@ -327,7 +330,7 @@ static void aio_cancel_all(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); } -static void wait_for_all_aios(struct kioctx *ctx) +void wait_for_all_aios(struct kioctx *ctx) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -350,6 +353,7 @@ static void wait_for_all_aios(struct kioctx *ctx) out: spin_unlock_irq(&ctx->ctx_lock); } +EXPORT_SYMBOL_GPL(wait_for_all_aios); /* wait_on_sync_kiocb: * Waits on the given sync kiocb to 
complete. @@ -836,7 +840,7 @@ static inline void aio_run_all_iocbs(struct kioctx *ctx) * space. * Run on aiod's context. */ -static void aio_kick_handler(struct work_struct *work) +void aio_kick_handler(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, wq.work); mm_segment_t oldfs = get_fs(); @@ -857,7 +861,7 @@ static void aio_kick_handler(struct work_struct *work) if (requeue) queue_delayed_work(aio_wq, &ctx->wq, 0); } - +EXPORT_SYMBOL_GPL(aio_kick_handler); /* * Called by kick_iocb to queue the kiocb for retry diff --git a/fs/autofs/init.c b/fs/autofs/init.c index cea5219..1217caf 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = { .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs_fs(void) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index dda510d..1f6e222 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -78,7 +78,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *uid = current->uid; *gid = current->gid; - *pgrp = task_pgrp_nr(current); + *pgrp = task_pgrp_vnr(current); *minproto = *maxproto = AUTOFS_PROTO_VERSION; diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 8aacade..f273f47 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -362,7 +362,7 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry) /* This allows root to remove symlinks */ lock_kernel(); - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) { + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) { unlock_kernel(); return -EACCES; } @@ -556,7 +556,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 69a2f5c..aa0d0b0 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -100,7 +100,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -134,7 +134,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? 
*/ diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 723a1c5..01ac1e0 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = { .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs4_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs4_fs(void) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7bb3e5b..dc86587 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -169,6 +169,8 @@ void autofs4_kill_sb(struct super_block *sb) /* Clean up and release dangling references */ autofs4_force_release(sbi); + put_pid(sbi->oz_pgrp); + sb->s_fs_info = NULL; kfree(sbi); @@ -190,7 +192,7 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",uid=%u", root_inode->i_uid); if (root_inode->i_gid != 0) seq_printf(m, ",gid=%u", root_inode->i_gid); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -235,7 +237,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *uid = current->uid; *gid = current->gid; - *pgrp = task_pgrp_nr(current); + *pgrp = task_pgrp_vnr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -320,6 +322,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + pid_t pgrp; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -332,7 +335,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -371,7 +373,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? 
*/ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, + &pgrp, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } @@ -399,12 +401,20 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pgrp); + + sbi->oz_pgrp = find_get_pid(pgrp); + + if (!sbi->oz_pgrp) { + printk("autofs: could not find process group %d\n", pgrp); + goto fail_dput; + } + pipe = fget(pipefd); if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); - goto fail_dput; + goto fail_put_pid; } if (!pipe->f_op || !pipe->f_op->write) goto fail_fput; @@ -425,6 +435,8 @@ fail_fput: printk("autofs: pipe file descriptor does not contain proper ops\n"); fput(pipe); /* fall through */ +fail_put_pid: + put_pid(sbi->oz_pgrp); fail_dput: dput(root); goto fail_free; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2a41c2a..a3191d3 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -689,7 +689,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EACCES; if (atomic_dec_and_test(&ino->count)) { @@ -883,7 +883,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35216d1..dbdad95 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -148,6 +148,16 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; pktsz = sizeof(*packet); +#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION + /* + * On x86_64 the autofs_v5_packet struct is padded with 4 bytes, + * which broke autofs daemons running in ia32 emulation mode. + * + * Reduce the size when running in 32-bit mode to match what + * userspace expects. + */ + if (test_thread_flag(TIF_IA32)) + pktsz -= 4; +#endif packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 204cfd1..cbc5262 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -375,12 +375,12 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) { - printk(KERN_NOTICE "executable not page aligned\n"); + ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); } if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_path.dentry->d_name.name); } @@ -489,7 +489,7 @@ static int load_aout_library(struct file *file) if (printk_ratelimit()) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "N_TXTOFF is not page aligned.
Please convert library: %s\n", file->f_path.dentry->d_name.name); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 655ed8d..659bd08 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -432,7 +432,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, eppnt = elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { - int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO; int elf_prot = 0; unsigned long vaddr = 0; unsigned long k, map_addr; @@ -814,7 +814,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_EXECPRIO; vaddr = elf_ppnt->p_vaddr; if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { @@ -949,7 +950,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, executable_stack); + retval = arch_setup_additional_pages(bprm, executable_stack, 0); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; diff --git a/fs/block_dev.c b/fs/block_dev.c index aff5421..54756e5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1307,7 +1307,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes_check(sb, 1); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/buffer.c b/fs/buffer.c index ac78d4c..c375336 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -700,6 +700,8 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static int __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { + int acct = 0; + if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -714,12 +716,14 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); + acct = 1; } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; diff --git a/fs/char_dev.c b/fs/char_dev.c index 3cb7cda..228de49 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -22,6 +22,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif diff --git a/fs/compat.c b/fs/compat.c index 075d050..dc2674c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,18 @@ int compat_printk(const char *fmt, ...) #include "read_write.h" +int ve_compat_printk(int dst, const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = ve_vprintk(dst, fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
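The ve_compat_printk() helper added above is a plain varargs front end: it bails out when compat_log is disabled, otherwise captures the caller's arguments into a va_list and forwards them to ve_vprintk(). For readers less familiar with that idiom, here is a minimal, self-contained userspace sketch of the same pattern; demo_log() and demo_vlog() are invented names standing in for the patch's functions, and printing to stderr stands in for the per-VE log.

    #include <stdarg.h>
    #include <stdio.h>

    /* Stand-in for ve_vprintk(): takes an already-built va_list. */
    static int demo_vlog(const char *fmt, va_list ap)
    {
            return vfprintf(stderr, fmt, ap);
    }

    /* Stand-in for ve_compat_printk(): capture the arguments and forward. */
    static int demo_log(const char *fmt, ...)
    {
            va_list ap;
            int ret;

            va_start(ap, fmt);
            ret = demo_vlog(fmt, ap);
            va_end(ap);
            return ret;
    }

    int main(void)
    {
            demo_log("pid %d says %s\n", 42, "hello");
            return 0;
    }

The point of the split is that only the outermost variadic function may use va_start(); everything below it passes the va_list through unchanged, which is exactly what the kernel helper does.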
@@ -244,6 +257,8 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) + error = faudit_statfs(path.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); path_put(&path); } @@ -262,6 +277,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); fput(file); out: @@ -312,6 +329,8 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) + error = faudit_statfs(path.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); path_put(&path); } @@ -333,6 +352,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); out: @@ -1355,6 +1376,10 @@ int compat_do_execve(char * filename, struct file *file; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) diff --git a/fs/dcache.c b/fs/dcache.c index e7a1a99..18ea10d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -27,13 +27,20 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include "internal.h" +#include +#include int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -43,7 +50,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); -static struct kmem_cache *dentry_cache __read_mostly; +struct kmem_cache *dentry_cache __read_mostly; #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) @@ -172,6 +179,7 @@ static struct dentry *d_kill(struct dentry *dentry) list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ + preempt_enable_no_resched(); /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); parent = dentry->d_parent; @@ -210,21 +218,31 @@ static struct dentry *d_kill(struct dentry *dentry) void dput(struct dentry *dentry) { + struct user_beancounter *ub; + unsigned long d_ubsize; + if (!dentry) return; repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); - if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; + preempt_disable(); + if (unlikely(ub_dentry_on)) { + spin_lock(&dcache_lock); + if (!atomic_dec_and_test(&dentry->d_count)) { + ub_dentry_uncharge_locked(dentry); + spin_unlock(&dcache_lock); + goto out_preempt; + } + } else { + if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) + goto out_preempt; + } spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - return; - } + if (atomic_read(&dentry->d_count)) + goto out_unlock; /* * AV: ->d_delete() is _NOT_ allowed to block now. 
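The four compat statfs hunks above all follow the same pattern: right after vfs_statfs() fills in the kstatfs buffer, faudit_statfs() gets a chance to rewrite it, which is how a container can be shown quota-derived disk usage instead of the host device's real numbers (the same idea the SIM_FS option describes). What userspace ultimately sees is just an ordinary statfs(2) result; the short generic program below dumps those fields for a path and is only an illustration, not part of the patch.

    #include <stdio.h>
    #include <sys/vfs.h>            /* statfs(2), struct statfs */

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : "/";
            struct statfs st;

            if (statfs(path, &st) != 0) {
                    perror("statfs");
                    return 1;
            }
            /* These are the fields vfs_statfs() fills in and the hooks
             * above may override before they reach userspace. */
            printf("%s: bsize=%ld blocks=%llu bfree=%llu bavail=%llu "
                   "files=%llu ffree=%llu\n",
                   path, (long)st.f_bsize,
                   (unsigned long long)st.f_blocks,
                   (unsigned long long)st.f_bfree,
                   (unsigned long long)st.f_bavail,
                   (unsigned long long)st.f_files,
                   (unsigned long long)st.f_ffree);
            return 0;
    }

Run inside a VE on a simfs mount, these block and inode counts are expected to reflect the directory quota rather than the backing device.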
@@ -240,8 +258,12 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); } +out_unlock: spin_unlock(&dentry->d_lock); + ub_dentry_uncharge_locked(dentry); spin_unlock(&dcache_lock); +out_preempt: + preempt_enable(); return; unhash_it: @@ -249,9 +271,18 @@ unhash_it: kill_it: /* if dentry was on the d_lru list delete it from there */ dentry_lru_del(dentry); + + ub = dentry->dentry_bc.d_ub; + d_ubsize = dentry->dentry_bc.d_ubsize; dentry = d_kill(dentry); - if (dentry) + preempt_disable(); + if (unlikely(ub_dentry_on)) { + uncharge_dcache(ub, d_ubsize); + put_beancounter(ub); + } + if (dentry) goto repeat; + preempt_enable(); } /** @@ -317,6 +348,7 @@ static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); dentry_lru_del_init(dentry); + ub_dentry_charge_nofail(dentry); return dentry; } @@ -419,6 +451,7 @@ static void prune_one_dentry(struct dentry * dentry) __acquires(dcache_lock) { __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); /* @@ -434,6 +467,7 @@ static void prune_one_dentry(struct dentry * dentry) dentry->d_op->d_delete(dentry); dentry_lru_del_init(dentry); __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); spin_lock(&dcache_lock); } @@ -727,6 +761,8 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; + /* "/" was also charged in d_alloc_root() */ + ub_dentry_uncharge(dentry); atomic_dec(&dentry->d_count); shrink_dcache_for_umount_subtree(dentry); @@ -886,12 +922,18 @@ void shrink_dcache_parent(struct dentry * parent) */ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { + int res = -1; + + KSTAT_PERF_ENTER(shrink_dcache) if (nr) { if (!(gfp_mask & __GFP_FS)) - return -1; + goto out; prune_dcache(nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +out: + KSTAT_PERF_LEAVE(shrink_dcache) + return res; } static struct shrinker dcache_shrinker = { @@ -914,21 +956,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry; char *dname; + dname = NULL; + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) + goto err_name; + } + + ub_dentry_alloc_start(); + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) - return NULL; + goto err_alloc; - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { - kmem_cache_free(dentry_cache, dentry); - return NULL; - } - } else { + preempt_disable(); + if (dname == NULL) dname = dentry->d_iname; - } dentry->d_name.name = dname; + if (ub_dentry_alloc(dentry)) + goto err_charge; + dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); @@ -959,12 +1007,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } spin_lock(&dcache_lock); - if (parent) + if (parent) { list_add(&dentry->d_u.d_child, &parent->d_subdirs); + if (parent->d_flags & DCACHE_VIRTUAL) + dentry->d_flags |= DCACHE_VIRTUAL; + } dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + preempt_enable(); + ub_dentry_alloc_end(); return dentry; + +err_charge: + preempt_enable(); + kmem_cache_free(dentry_cache, dentry); +err_alloc: + if (name->len > DNAME_INLINE_LEN - 1) + kfree(dname); + ub_dentry_alloc_end(); +err_name: + return NULL; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) @@ -1371,12 +1434,12 @@ struct dentry * 
__d_lookup(struct dentry * parent, struct qstr * name) unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; - struct dentry *dentry; + struct dentry *dentry, *found; rcu_read_lock(); + found = NULL; hlist_for_each_entry_rcu(dentry, node, head, d_hash) { struct qstr *qstr; @@ -1416,6 +1479,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) atomic_inc(&dentry->d_count); found = dentry; + + if (ub_dentry_charge(found)) + goto charge_failure; + spin_unlock(&dentry->d_lock); break; next: @@ -1424,6 +1491,14 @@ next: rcu_read_unlock(); return found; + +charge_failure: + spin_unlock(&found->d_lock); + rcu_read_unlock(); + /* dentry is now unhashed, just kill it */ + dput(found); + /* ... and fail lookup */ + return NULL; } /** @@ -1892,6 +1967,16 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) } /** + * d_root_check - checks if dentry is accessible from current's fs root + * @dentry: dentry to be verified + * @vfsmnt: vfsmnt to which the dentry belongs + */ +int d_root_check(struct path *path) +{ + return PTR_ERR(d_path(path, NULL, 0)); +} + +/** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) @@ -1915,18 +2000,21 @@ char *__d_path(const struct path *path, struct path *root, struct vfsmount *vfsmnt = path->mnt; char *end = buffer + buflen; char *retval; + int deleted; + struct vfsmount *oldmnt = vfsmnt; spin_lock(&vfsmount_lock); - prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry) && - (prepend(&end, &buflen, " (deleted)", 10) != 0)) + if (buffer) { + prepend(&end, &buflen, "\0", 1); + if (buflen < 1) goto Elong; + } + deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; - *retval = '/'; + if (buffer) + *retval = '/'; for (;;) { struct dentry * parent; @@ -1944,20 +2032,43 @@ char *__d_path(const struct path *path, struct path *root, } parent = dentry->d_parent; prefetch(parent); - if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || - (prepend(&end, &buflen, "/", 1) != 0)) + if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || + (prepend(&end, &buflen, "/", 1) != 0))) goto Elong; retval = end; dentry = parent; } out: + if (deleted && buffer && + prepend(&end, &buflen, " (deleted)", 10) != 0) + goto Elong; +out_err: spin_unlock(&vfsmount_lock); - return retval; + return buffer ? retval : NULL; global_root: + /* + * We traversed the tree upward and reached a root, but the given + * lookup terminal point wasn't encountered. It means either that the + * dentry is out of our scope or belongs to an abstract space like + * sock_mnt or pipe_mnt. Check for it. + * + * There are different options to check it. + * We may assume that any dentry tree is unreachable unless it's + * connected to `root' (defined as fs root of init aka child reaper) + * and expose all paths that are not connected to it. + * The other option is to allow exposing of known abstract spaces + * explicitly and hide the path information for other cases. + * This approach is more safe, let's take it. 
2001/04/22 SAW + */ + if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER)) { + retval = ERR_PTR(-EINVAL); + goto out_err; + } + retval += 1; /* hit the slash */ - if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) + if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0) goto Elong; root->mnt = vfsmnt; root->dentry = dentry; @@ -1965,8 +2076,9 @@ global_root: Elong: retval = ERR_PTR(-ENAMETOOLONG); - goto out; + goto out_err; } +EXPORT_SYMBOL(__d_path); /** * d_path - return the path of a dentry @@ -1993,8 +2105,11 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * pipefs and socketfs methods assume valid buffer, d_root_check() + * supplies NULL one for access checks. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (buf && path->dentry->d_op && path->dentry->d_op->d_dname) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); read_lock(¤t->fs->lock); @@ -2009,6 +2124,231 @@ char *d_path(const struct path *path, char *buf, int buflen) return res; } +#ifdef CONFIG_VE +#include +#include +#include +#include +#include + +static void mark_sub_tree_virtual(struct dentry *d) +{ + struct dentry *orig_root; + + orig_root = d; + while (1) { + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_VIRTUAL; + spin_unlock(&d->d_lock); + + if (!list_empty(&d->d_subdirs)) { + d = list_entry(d->d_subdirs.next, + struct dentry, d_u.d_child); + continue; + } + if (d == orig_root) + break; + while (d == list_entry(d->d_parent->d_subdirs.prev, + struct dentry, d_u.d_child)) { + d = d->d_parent; + if (d == orig_root) + goto out; + } + d = list_entry(d->d_u.d_child.next, + struct dentry, d_u.d_child); + } +out: + return; +} + +void mark_tree_virtual(struct path *path) +{ + struct vfsmount *orig_rootmnt; + struct vfsmount *m = path->mnt; + struct dentry *d = path->dentry; + + spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); + orig_rootmnt = m; + while (1) { + mark_sub_tree_virtual(d); + if (!list_empty(&m->mnt_mounts)) { + m = list_entry(m->mnt_mounts.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + continue; + } + if (m == orig_rootmnt) + break; + while (m == list_entry(m->mnt_parent->mnt_mounts.prev, + struct vfsmount, mnt_child)) { + m = m->mnt_parent; + if (m == orig_rootmnt) + goto out; + } + m = list_entry(m->mnt_child.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + } +out: + spin_unlock(&vfsmount_lock); + spin_unlock(&dcache_lock); +} +EXPORT_SYMBOL(mark_tree_virtual); + +static struct vz_rate_info area_ri = { 20, 10*HZ }; +#define VE_AREA_ACC_CHECK 0x0001 +#define VE_AREA_ACC_DENY 0x0002 +#define VE_AREA_EXEC_CHECK 0x0010 +#define VE_AREA_EXEC_DENY 0x0020 +#define VE0_AREA_ACC_CHECK 0x0100 +#define VE0_AREA_ACC_DENY 0x0200 +#define VE0_AREA_EXEC_CHECK 0x1000 +#define VE0_AREA_EXEC_DENY 0x2000 +int ve_area_access_check = 0; + +static void print_connection_info(struct task_struct *tsk) +{ + struct files_struct *files; + struct fdtable *fdt; + int fd; + + files = get_files_struct(tsk); + if (!files) + return; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + struct file *file; + struct inode *inode; + struct socket *socket; + struct sock *sk; + struct inet_sock *inet; + + file = fdt->fd[fd]; + if (file == NULL) + continue; + + inode = file->f_dentry->d_inode; + if 
(!S_ISSOCK(inode->i_mode)) + continue; + + socket = SOCKET_I(inode); + if (socket == NULL) + continue; + + sk = socket->sk; + if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + || sk->sk_type != SOCK_STREAM) + continue; + + inet = inet_sk(sk); + printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", + NIPQUAD(inet->daddr), ntohs(inet->dport), + inet->num); + } + spin_unlock(&files->file_lock); + put_files_struct(files); +} + +static void check_alert(struct path *path, char *str) +{ + struct task_struct *tsk; + unsigned long page; + struct super_block *sb; + char *p; + + if (!vz_ratelimit(&area_ri)) + return; + + tsk = current; + p = ERR_PTR(-ENOMEM); + page = __get_free_page(GFP_KERNEL); + if (page) { + spin_lock(&dcache_lock); + p = __d_path(path, &tsk->fs->root, (char *)page, PAGE_SIZE); + spin_unlock(&dcache_lock); + } + if (IS_ERR(p)) + p = "(undefined)"; + + sb = path->dentry->d_sb; + printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" + "Task %d/%d[%s] from VE%d, execenv %d\n", + str, p, sb->s_type->owner_env->veid, + sb->s_type->name, sb->s_dev, + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid, + get_exec_env()->veid); + + free_page(page); + + print_connection_info(tsk); + + read_lock(&tasklist_lock); + tsk = tsk->parent; + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid); + + print_connection_info(tsk); + put_task_struct(tsk); + dump_stack(); +} +#endif + +int check_area_access_ve(struct path *path) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_ACC_CHECK; + alert = path->dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_ACC_DENY; + } else { + check = ve_area_access_check & VE_AREA_ACC_CHECK; + alert = !(path->dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_ACC_DENY; + } + + if (check && alert) + check_alert(path, "Access"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + +#if 0 +int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_EXEC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_EXEC_DENY; + } else { + check = ve_area_access_check & VE_AREA_EXEC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_EXEC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Exec"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} +#endif + /* * Helper function for dentry_operations.d_dname() members */ @@ -2201,10 +2541,12 @@ resume: goto repeat; } atomic_dec(&dentry->d_count); + ub_dentry_uncharge_locked(dentry); } if (this_parent != root) { next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); + ub_dentry_uncharge_locked(this_parent); this_parent = this_parent->d_parent; goto resume; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 488eb42..b4da9f7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #define DEVPTS_SUPER_MAGIC 0x1cd1 @@ -30,18 +31,26 @@ extern int pty_limit; /* Config limit on Unix98 ptys */ static DEFINE_IDA(allocated_ptys); +#ifdef CONFIG_VE +#define __ve_allocated_ptys(ve) 
(*((ve)->allocated_ptys)) +#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) +#else +#define __ve_allocated_ptys(ve) allocated_ptys +#define ve_allocated_ptys allocated_ptys +#endif static DEFINE_MUTEX(allocated_ptys_lock); +struct devpts_config devpts_config = {.mode = 0600}; + +#ifndef CONFIG_VE static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; - -static struct { - int setuid; - int setgid; - uid_t uid; - gid_t gid; - umode_t mode; -} config = {.mode = DEVPTS_DEFAULT_MODE}; +#define config devpts_config +#else +#define devpts_mnt (get_exec_env()->devpts_mnt) +#define devpts_root (get_exec_env()->devpts_root) +#define config (*(get_exec_env()->devpts_config)) +#endif enum { Opt_uid, Opt_gid, Opt_mode, @@ -93,7 +102,8 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) config.mode = option & S_IALLUGO; break; default: - printk(KERN_ERR "devpts: called with bogus options\n"); + ve_printk(VE_LOG, KERN_ERR + "devpts: called with bogus options\n"); return -EINVAL; } } @@ -157,13 +167,15 @@ static int devpts_get_sb(struct file_system_type *fs_type, return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } -static struct file_system_type devpts_fs_type = { +struct file_system_type devpts_fs_type = { .owner = THIS_MODULE, .name = "devpts", .get_sb = devpts_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(devpts_fs_type); + /* * The normal naming convention is simply /dev/pts/; this conforms * to the System V naming convention @@ -183,12 +195,12 @@ int devpts_new_index(void) int ida_ret; retry: - if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { + if (!ida_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { return -ENOMEM; } mutex_lock(&allocated_ptys_lock); - ida_ret = ida_get_new(&allocated_ptys, &index); + ida_ret = ida_get_new(&ve_allocated_ptys, &index); if (ida_ret < 0) { mutex_unlock(&allocated_ptys_lock); if (ida_ret == -EAGAIN) @@ -197,7 +209,7 @@ retry: } if (index >= pty_limit) { - ida_remove(&allocated_ptys, index); + ida_remove(&ve_allocated_ptys, index); mutex_unlock(&allocated_ptys_lock); return -EIO; } @@ -208,7 +220,7 @@ retry: void devpts_kill_index(int idx) { mutex_lock(&allocated_ptys_lock); - ida_remove(&allocated_ptys, idx); + ida_remove(&ve_allocated_ptys, idx); mutex_unlock(&allocated_ptys_lock); } @@ -278,6 +290,17 @@ void devpts_pty_kill(int number) mutex_unlock(&devpts_root->d_inode->i_mutex); } +void prepare_tty(void) +{ +#ifdef CONFIG_VE + get_ve0()->allocated_ptys = &allocated_ptys; + /* + * in this case, tty_register_driver() setups + * owner_env correctly right from the bootup + */ +#endif +} + static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); @@ -286,11 +309,13 @@ static int __init init_devpts_fs(void) if (IS_ERR(devpts_mnt)) err = PTR_ERR(devpts_mnt); } + prepare_tty(); return err; } static void __exit exit_devpts_fs(void) { + /* the code is never called, the argument is irrelevant */ unregister_filesystem(&devpts_fs_type); mntput(devpts_mnt); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9606ee8..84f0486 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -660,7 +660,7 @@ submit_page_section(struct dio *dio, struct page *page, /* * Read accounting is performed in submit_bio() */ - task_io_account_write(len); + task_io_account_write(page, len, 1); } /* diff --git a/fs/dquot.c b/fs/dquot.c index 8ec4d6c..6d70056 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -162,7 +162,9 @@ static struct quota_format_type *find_quota_format(int id) struct quota_format_type 
*actqf; spin_lock(&dq_list_lock); - for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + for (actqf = quota_formats; + actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL); + actqf = actqf->qf_next); if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7cc0eb7..19f36d0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -102,11 +103,6 @@ #define EP_UNACTIVE_PTR ((void *) -1L) -struct epoll_filefd { - struct file *file; - int fd; -}; - /* * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". * It is used to keep track on all tasks that are currently inside the wake_up() code @@ -129,79 +125,6 @@ struct poll_safewake { spinlock_t lock; }; -/* - * Each file descriptor added to the eventpoll interface will - * have an entry of this type linked to the "rbr" RB tree. - */ -struct epitem { - /* RB tree node used to link this structure to the eventpoll RB tree */ - struct rb_node rbn; - - /* List header used to link this structure to the eventpoll ready list */ - struct list_head rdllink; - - /* - * Works together "struct eventpoll"->ovflist in keeping the - * single linked chain of items. - */ - struct epitem *next; - - /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; - - /* Number of active wait queue attached to poll operations */ - int nwait; - - /* List containing poll wait queues */ - struct list_head pwqlist; - - /* The "container" of this item */ - struct eventpoll *ep; - - /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; - - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; -}; - -/* - * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll - * interface. - */ -struct eventpoll { - /* Protect the this structure access */ - spinlock_t lock; - - /* - * This mutex is used to ensure that files are not removed - * while epoll is using them. This is held during the event - * collection loop, the file cleanup path, the epoll file exit - * code and the ctl operations. - */ - struct mutex mtx; - - /* Wait queue used by sys_epoll_wait() */ - wait_queue_head_t wq; - - /* Wait queue used by file->poll() */ - wait_queue_head_t poll_wait; - - /* List of ready file descriptors */ - struct list_head rdllist; - - /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; - - /* - * This is a single linked list that chains all the "struct epitem" that - * happened while transfering ready events to userspace w/out - * holding ->lock. - */ - struct epitem *ovflist; -}; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -229,7 +152,8 @@ struct ep_pqueue { /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). 
*/ -static struct mutex epmutex; +struct mutex epmutex; +EXPORT_SYMBOL_GPL(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -482,10 +406,11 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) } /* File callbacks that implement the eventpoll file behaviour */ -static const struct file_operations eventpoll_fops = { +const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll }; +EXPORT_SYMBOL(eventpoll_fops); /* Fast test to see if the file is an evenpoll file */ static inline int is_file_epoll(struct file *f) @@ -557,7 +482,7 @@ static int ep_alloc(struct eventpoll **pep) * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; @@ -583,6 +508,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +EXPORT_SYMBOL_GPL(ep_find); /* * This is the callback that is passed to the wait queue wakeup @@ -695,7 +621,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) /* * Must be called with "mtx" held. */ -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; @@ -792,6 +718,7 @@ error_unregister: error_return: return error; } +EXPORT_SYMBOL(ep_insert); /* * Modify the interest event mask by dropping an event if the new mask @@ -1089,6 +1016,7 @@ asmlinkage long sys_epoll_create(int size) return sys_epoll_create1(0); } +EXPORT_SYMBOL(sys_epoll_create); /* * The following function implements the controller interface for diff --git a/fs/exec.c b/fs/exec.c index cecee50..64cf4c2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -70,6 +73,8 @@ int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ +int sysctl_at_vsyscall; + static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); @@ -239,9 +244,13 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, + NULL, UB_SOFT)) + goto fail_charge; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL_UBC); if (!vma) - goto err; + goto fail_alloc; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -275,7 +284,9 @@ err: bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); } - +fail_alloc: + ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL); +fail_charge: return err; } @@ -723,10 +734,11 @@ int kernel_read(struct file *file, unsigned long offset, EXPORT_SYMBOL(kernel_read); -static int exec_mmap(struct mm_struct *mm) +static int exec_mmap(struct linux_binprm *bprm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm, *mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -746,6 +758,10 @@ static int exec_mmap(struct mm_struct *mm) return -EINTR; } } + + ret = 0; + mm = bprm->mm; + mm->vps_dumpable = 1; task_lock(tsk); 
active_mm = tsk->active_mm; tsk->mm = mm; @@ -753,15 +769,25 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); + bprm->mm = NULL; /* We're using it now */ + +#ifdef CONFIG_VZ_GENCALLS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP, + bprm) & NOTIFY_FAIL) { + /* similar to binfmt_elf */ + send_sig(SIGKILL, current, 0); + ret = -ENOMEM; + } +#endif if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); mm_update_next_owner(old_mm); mmput(old_mm); - return 0; + return ret; } mmdrop(active_mm); - return 0; + return ret; } /* @@ -859,6 +885,10 @@ static int de_thread(struct task_struct *tsk) transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); +#ifdef CONFIG_VE + list_replace_rcu(&leader->ve_task_info.vetask_list, + &tsk->ve_task_info.vetask_list); +#endif tsk->group_leader = tsk; leader->group_leader = tsk; @@ -976,12 +1006,10 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ - retval = exec_mmap(bprm->mm); + retval = exec_mmap(bprm); if (retval) goto out; - bprm->mm = NULL; /* We're using it now */ - /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -1283,6 +1311,10 @@ int do_execve(char * filename, struct files_struct *displaced; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = unshare_files(&displaced); if (retval) goto out_ret; @@ -1526,7 +1558,7 @@ static int zap_process(struct task_struct *start) signal_wake_up(t, 1); nr++; } - } while_each_thread(start, t); + } while_each_thread_ve(start, t); return nr; } @@ -1581,7 +1613,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, * next_thread(). */ rcu_read_lock(); - for_each_process(g) { + for_each_process_ve(g) { if (g == tsk->group_leader) continue; if (g->flags & PF_KTHREAD) @@ -1596,7 +1628,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } break; } - } while_each_thread(g, p); + } while_each_thread_ve(g, p); } rcu_read_unlock(); done: @@ -1732,7 +1764,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. 
*/ - if (mm->core_state || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm) || mm->vps_dumpable != 1) { up_write(&mm->mmap_sem); goto fail; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 80c97fd..c03ef38 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -257,6 +258,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; + DQUOT_INIT(inode); + de = ext2_find_entry (dir, dentry, &page); if (!de) goto out; @@ -299,6 +302,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + if (new_inode) + DQUOT_INIT(new_inode); + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); if (!old_de) goto out; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fd88c7b..f4ff824 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1401,7 +1401,7 @@ static struct file_system_type ext2_fs_type = { .name = "ext2", .get_sb = ext2_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext2_fs(void) diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 0d0c701..d4d3c11 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -87,7 +87,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { + if (!capable(CAP_SYS_ADMIN)) { mutex_unlock(&inode->i_mutex); err = -EPERM; goto flags_out; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index de13e91..565bca4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1347,7 +1347,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, if (err) ext3_std_error(dir->i_sb, err); brelse(bh); - return 0; + return err; } /* diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f38a5af..75e629a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2938,7 +2938,7 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext3_fs(void) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7a6c2f1..ec237c2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -79,7 +79,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_ADMIN)) goto flags_out; } diff --git a/fs/fcntl.c b/fs/fcntl.c index ac4f7db..016a0cb 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -124,6 +124,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) } return sys_dup3(oldfd, newfd, 0); } +EXPORT_SYMBOL_GPL(sys_dup2); asmlinkage long sys_dup(unsigned int fildes) { @@ -147,6 +148,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) struct inode * inode = filp->f_path.dentry->d_inode; int error = 0; + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + arg &= ~O_DIRECT; + /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. 
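The setfl() hunk above quietly strips O_DIRECT from the requested flags unless the caller has CAP_SYS_RAWIO or the odirect_enable knob permits it, so an unprivileged F_SETFL still returns success. From userspace the only way to notice is to read the flags back with F_GETFL, as in this rough sketch (generic demo code; the temporary file path is arbitrary):

    #define _GNU_SOURCE             /* for O_DIRECT */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : "/tmp/odirect-test";
            int fd = open(path, O_RDWR | O_CREAT, 0644);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            int flags = fcntl(fd, F_GETFL);
            if (fcntl(fd, F_SETFL, flags | O_DIRECT) != 0)
                    perror("F_SETFL O_DIRECT");

            /* With the patch, an unprivileged caller may see the flag
             * silently dropped even though F_SETFL reported success. */
            flags = fcntl(fd, F_GETFL);
            printf("O_DIRECT is %s\n", (flags & O_DIRECT) ? "set" : "not set");

            close(fd);
            return 0;
    }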
diff --git a/fs/file.c b/fs/file.c index f313314..b3cf859 100644 --- a/fs/file.c +++ b/fs/file.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,8 @@ #include #include +#include + struct fdtable_defer { spinlock_t lock; struct work_struct wq; @@ -41,9 +44,9 @@ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); static inline void * alloc_fdmem(unsigned int size) { if (size <= PAGE_SIZE) - return kmalloc(size, GFP_KERNEL); + return kmalloc(size, GFP_KERNEL_UBC); else - return vmalloc(size); + return ub_vmalloc(size); } static inline void free_fdarr(struct fdtable *fdt) @@ -162,7 +165,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC); if (!fdt) goto out; fdt->max_fds = nr; @@ -197,7 +200,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +int expand_fdtable(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -237,6 +240,7 @@ static int expand_fdtable(struct files_struct *files, int nr) } return 1; } +EXPORT_SYMBOL_GPL(expand_fdtable); /* * Expand files. diff --git a/fs/file_table.c b/fs/file_table.c index f45a449..108a41b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -21,9 +21,14 @@ #include #include #include +#include #include +#include +#include +#include + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -37,13 +42,16 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp; static inline void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + put_ve(f->owner_env); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); file_check_state(f); + if (f->f_ub == get_ub0()) + percpu_counter_dec(&nr_files); + ub_file_uncharge(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -97,11 +105,14 @@ struct file *get_empty_filp(void) struct task_struct *tsk; static int old_max; struct file * f; + int acct; + acct = (get_exec_ub() == get_ub0()); /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (acct && get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -114,7 +125,13 @@ struct file *get_empty_filp(void) if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); + if (ub_file_charge(f)) + goto fail_ch; + if (acct) + percpu_counter_inc(&nr_files); + + f->owner_env = get_ve(get_exec_env()); + if (security_file_alloc(f)) goto fail_sec; @@ -141,6 +158,10 @@ fail_sec: file_free(f); fail: return NULL; + +fail_ch: + kmem_cache_free(filp_cachep, f); + return NULL; } EXPORT_SYMBOL(get_empty_filp); diff --git a/fs/filesystems.c b/fs/filesystems.c index f37f872..3dca4a7 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -12,6 +12,9 @@ #include #include #include +#include /* for 'current' */ +#include +#include #include /* @@ -21,8 +24,8 @@ * During the unload module must call unregister_filesystem(). 
* We can access the fields of list element if: * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it + * 2) we hold the reference to the element. + * The latter can be guaranteed by call of try_filesystem(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ @@ -30,24 +33,46 @@ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); +int try_get_filesystem(struct file_system_type *fs) +{ + if (try_module_get(fs->owner)) { + (void)get_ve(fs->owner_env); + return 1; + } + return 0; +} + /* WARNING: This can be used only if we _already_ own a reference */ void get_filesystem(struct file_system_type *fs) { + (void)get_ve(fs->owner_env); __module_get(fs->owner); } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); + put_ve(fs->owner_env); +} + +static inline int check_ve_fstype(struct file_system_type *p, + struct ve_struct *env) +{ + return ((p->fs_flags & FS_VIRTUALIZED) || + ve_accessible_strict(p->owner_env, env)); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type **find_filesystem(const char *name, unsigned len, + struct ve_struct *env) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) + for (p=&file_systems; *p; p=&(*p)->next) { + if (!check_ve_fstype(*p, env)) + continue; if (strlen((*p)->name) == len && strncmp((*p)->name, name, len) == 0) break; + } return p; } @@ -73,8 +98,12 @@ int register_filesystem(struct file_system_type * fs) if (fs->next) return -EBUSY; INIT_LIST_HEAD(&fs->fs_supers); + if (fs->owner_env == NULL) + fs->owner_env = get_ve0(); + if (fs->proto == NULL) + fs->proto = fs; write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); + p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env); if (*p) res = -EBUSY; else @@ -118,6 +147,75 @@ int unregister_filesystem(struct file_system_type * fs) EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_VE +int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template, + struct file_system_type **p_fs_type, struct vfsmount **p_mnt) +{ + struct vfsmount *mnt; + struct file_system_type *local_fs_type; + int ret; + + local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *), + GFP_KERNEL); + if (local_fs_type == NULL) + return -ENOMEM; + + local_fs_type->name = template->name; + local_fs_type->fs_flags = template->fs_flags; + local_fs_type->get_sb = template->get_sb; + local_fs_type->kill_sb = template->kill_sb; + local_fs_type->owner = template->owner; + local_fs_type->owner_env = ve; + local_fs_type->proto = template; + + get_filesystem(local_fs_type); /* get_ve() inside */ + + ret = register_filesystem(local_fs_type); + if (ret) + goto reg_err; + + if (p_mnt == NULL) + goto done; + + mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL); + if (IS_ERR(mnt)) + goto mnt_err; + + *p_mnt = mnt; +done: + *p_fs_type = local_fs_type; + return 0; + +mnt_err: + ret = PTR_ERR(mnt); + unregister_filesystem(local_fs_type); /* does not put */ + +reg_err: + put_filesystem(local_fs_type); + kfree(local_fs_type); + printk(KERN_DEBUG + "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); + return ret; +} + +EXPORT_SYMBOL(register_ve_fs_type); + +void unregister_ve_fs_type(struct file_system_type *local_fs_type, + struct vfsmount *local_fs_mount) +{ + if 
(local_fs_mount == NULL && local_fs_type == NULL) + return; + + unregister_filesystem(local_fs_type); + umount_ve_fs_type(local_fs_type); + if (local_fs_mount) + kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ + put_filesystem(local_fs_type); +} + +EXPORT_SYMBOL(unregister_ve_fs_type); +#endif + static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -131,11 +229,14 @@ static int fs_index(const char __user * __name) err = -EINVAL; read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; if (strcmp(tmp->name,name) == 0) { err = index; break; } + index++; } read_unlock(&file_systems_lock); putname(name); @@ -148,9 +249,15 @@ static int fs_name(unsigned int index, char __user * buf) int len, res; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) - break; + for (tmp = file_systems; tmp; tmp = tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; + if (!index) { + if (try_get_filesystem(tmp)) + break; + } else + index--; + } read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; @@ -168,8 +275,9 @@ static int fs_maxindex(void) int index; read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) + if (check_ve_fstype(tmp, get_exec_env())) + index++; read_unlock(&file_systems_lock); return index; } @@ -205,9 +313,10 @@ int get_filesystem_list(char * buf) read_lock(&file_systems_lock); tmp = file_systems; while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); + if (check_ve_fstype(tmp, get_exec_env())) + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); @@ -221,14 +330,14 @@ struct file_system_type *get_fs_type(const char *name) unsigned len = dot ? dot - name : strlen(name); read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, len, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); if (!fs && (request_module("%.*s", len, name) == 0)) { read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, len, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4f3cab3..755be17 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -10,6 +10,8 @@ #include #include +#include +#include #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -17,7 +19,11 @@ * This is non-NULL when the single instance of the control filesystem * exists. 
Protected by fuse_mutex */ +#ifdef CONFIG_VE +#define fuse_control_sb (get_exec_env()->_fuse_control_sb) +#else static struct super_block *fuse_control_sb; +#endif static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { @@ -211,12 +217,51 @@ static struct file_system_type fuse_ctl_fs_type = { .kill_sb = fuse_ctl_kill_sb, }; +#ifdef CONFIG_VE +static int fuse_ctl_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type != NULL) + return -EBUSY; + + return register_ve_fs_type(ve, &fuse_ctl_fs_type, + &ve->fuse_ctl_fs_type, NULL); +} + +static void fuse_ctl_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL); + ve->fuse_ctl_fs_type = NULL; +} + +static struct ve_hook fuse_ctl_ve_hook = { + .init = fuse_ctl_start, + .fini = fuse_ctl_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + int __init fuse_ctl_init(void) { - return register_filesystem(&fuse_ctl_fs_type); + int err; + + err = register_filesystem(&fuse_ctl_fs_type); + if (err == 0) + ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook); + return err; } void fuse_ctl_cleanup(void) { + ve_hook_unregister(&fuse_ctl_ve_hook); unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3a87607..541375e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -45,7 +45,11 @@ #define FUSE_ALLOW_OTHER (1 << 1) /** List of active connections */ +#ifdef CONFIG_VE +#define fuse_conn_list (get_exec_env()->_fuse_conn_list) +#else extern struct list_head fuse_conn_list; +#endif /** Global mutex protecting fuse_conn_list and the control filesystem */ extern struct mutex fuse_mutex; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d2249f1..eef02a8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -19,13 +19,16 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; +#ifndef CONFIG_VE struct list_head fuse_conn_list; +#endif DEFINE_MUTEX(fuse_mutex); #define FUSE_SUPER_MAGIC 0x65735546 @@ -1033,6 +1036,41 @@ static void fuse_sysfs_cleanup(void) kobject_put(fuse_kobj); } +#ifdef CONFIG_VE +static int fuse_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type != NULL) + return -EBUSY; + + INIT_LIST_HEAD(&ve->_fuse_conn_list); + return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL); +} + +static void fuse_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_fs_type, NULL); + kfree(ve->fuse_fs_type); + ve->fuse_fs_type = NULL; + BUG_ON(!list_empty(&ve->_fuse_conn_list)); +} + +static struct ve_hook fuse_ve_hook = { + .init = fuse_start, + .fini = fuse_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + static int __init fuse_init(void) { int res; @@ -1057,6 +1095,7 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook); return 0; err_sysfs_cleanup: @@ -1073,6 +1112,7 @@ static void __exit fuse_exit(void) { printk(KERN_DEBUG "fuse exit\n"); + ve_hook_unregister(&fuse_ve_hook); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff --git a/fs/inode.c b/fs/inode.c index 0487ddb..156c8fb 100644 --- a/fs/inode.c +++ 
b/fs/inode.c @@ -8,10 +8,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include @@ -22,6 +25,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -97,7 +101,8 @@ static DEFINE_MUTEX(iprune_mutex); */ struct inodes_stat_t inodes_stat; -static struct kmem_cache * inode_cachep __read_mostly; +struct kmem_cache * inode_cachep __read_mostly; + static void wake_up_inode(struct inode *inode) { @@ -108,11 +113,13 @@ static void wake_up_inode(struct inode *inode) wake_up_bit(&inode->i_state, __I_LOCK); } +static struct address_space_operations vfs_empty_aops; +struct inode_operations vfs_empty_iops; +static struct file_operations vfs_empty_fops; +EXPORT_SYMBOL(vfs_empty_iops); + static struct inode *alloc_inode(struct super_block *sb) { - static const struct address_space_operations empty_aops; - static struct inode_operations empty_iops; - static const struct file_operations empty_fops; struct inode *inode; if (sb->s_op->alloc_inode) @@ -127,8 +134,8 @@ static struct inode *alloc_inode(struct super_block *sb) inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; + inode->i_op = &vfs_empty_iops; + inode->i_fop = &vfs_empty_fops; inode->i_nlink = 1; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; @@ -152,15 +159,15 @@ static struct inode *alloc_inode(struct super_block *sb) } spin_lock_init(&inode->i_lock); - lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key); mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key); init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key); - mapping->a_ops = &empty_aops; + mapping->a_ops = &vfs_empty_aops; mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); @@ -311,13 +318,76 @@ static void dispose_list(struct list_head *head) spin_unlock(&inode_lock); } +static void show_header(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + printk("VFS: Busy inodes after unmount. " + "sb = %p, fs type = %s, sb count = %d, " + "sb->s_root = %s\n", sb, + (sb->s_type != NULL) ? sb->s_type->name : "", + sb->s_count, + (sb->s_root != NULL) ? 
+ (char *)sb->s_root->d_name.name : ""); +} + +static void show_inode(struct inode *inode) +{ + struct dentry *d; + struct vfsmount *mnt; + int i; + + printk("inode = %p, inode->i_count = %d, " + "inode->i_nlink = %d, " + "inode->i_mode = %d, " + "inode->i_state = %ld, " + "inode->i_flags = %d, " + "inode->i_devices.next = %p, " + "inode->i_devices.prev = %p, " + "inode->i_ino = %ld\n", + inode, + atomic_read(&inode->i_count), + inode->i_nlink, + inode->i_mode, + inode->i_state, + inode->i_flags, + inode->i_devices.next, + inode->i_devices.prev, + inode->i_ino); + printk("inode dump: "); + for (i = 0; i < sizeof(*inode); i++) + printk("%2.2x ", *((u_char *)inode + i)); + printk("\n"); + list_for_each_entry(d, &inode->i_dentry, d_alias) { + printk(" d_alias %s d_count=%d d_flags=%x\n", + d->d_name.name, atomic_read(&d->d_count), d->d_flags); + for (i = 0; i < sizeof(*d); i++) + printk("%2.2x ", *((u_char *)d + i)); + printk("\n"); + } + + spin_lock(&vfsmount_lock); + list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) { + if (mnt->mnt_sb != inode->i_sb) + continue; + printk("mnt=%p count=%d flags=%x exp_mask=%x\n", + mnt, atomic_read(&mnt->mnt_count), + mnt->mnt_flags, + mnt->mnt_expiry_mark); + for (i = 0; i < sizeof(*mnt); i++) + printk("%2.2x ", *((u_char *)mnt + i)); + printk("\n"); + } + spin_unlock(&vfsmount_lock); +} + /* * Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0, count = 0, once = 1; next = head->next; for (;;) { @@ -344,6 +414,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) continue; } busy = 1; + + if (check) { + if (once) { + once = 0; + show_header(inode); + } + show_inode(inode); + } } /* only unused inodes may be cached with i_count zero */ inodes_stat.nr_unused -= count; @@ -358,7 +436,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) * fails because there are busy inodes then a non zero value is returned. * If the discard is successful all the inodes have been discarded. */ -int invalidate_inodes(struct super_block * sb) +int invalidate_inodes_check(struct super_block * sb, int check) { int busy; LIST_HEAD(throw_away); @@ -366,7 +444,7 @@ int invalidate_inodes(struct super_block * sb) mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away, check); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -375,7 +453,7 @@ int invalidate_inodes(struct super_block * sb) return busy; } -EXPORT_SYMBOL(invalidate_inodes); +EXPORT_SYMBOL(invalidate_inodes_check); static int can_unuse(struct inode *inode) { @@ -465,6 +543,7 @@ static void prune_icache(int nr_to_scan) */ static int shrink_icache_memory(int nr, gfp_t gfp_mask) { + KSTAT_PERF_ENTER(shrink_icache) if (nr) { /* * Nasty deadlock avoidance. 
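/*
 * The hunk above renames invalidate_inodes() into
 * invalidate_inodes_check(sb, check): when check is non-zero, every inode
 * that is still busy at unmount time is dumped through show_header() and
 * show_inode(), which is how the container code diagnoses "VFS: Busy inodes
 * after unmount".  The original, silent behaviour is presumably kept as a
 * thin wrapper over the new entry point, along these lines (the wrapper
 * itself is not part of the hunks shown here):
 */
static inline int invalidate_inodes(struct super_block *sb)
{
	return invalidate_inodes_check(sb, 0);	/* no busy-inode dump */
}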
We may hold various FS locks, @@ -475,6 +554,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask) return -1; prune_icache(nr); } + KSTAT_PERF_LEAVE(shrink_icache) return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -584,7 +664,7 @@ void unlock_new_inode(struct inode *inode) */ mutex_destroy(&inode->i_mutex); mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); + lockdep_set_class(&inode->i_mutex, &type->proto->i_mutex_dir_key); } #endif /* diff --git a/fs/inotify.c b/fs/inotify.c index 690e725..01ddb06 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -32,6 +32,7 @@ #include #include #include +#include static atomic_t inotify_cookie; @@ -69,19 +70,6 @@ static atomic_t inotify_cookie; * inotify_add_watch() to the final put_inotify_watch(). */ -/* - * struct inotify_handle - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_handle { - struct idr idr; /* idr mapping wd -> watch */ - struct mutex mutex; /* protects this bad boy */ - struct list_head watches; /* list of watches */ - atomic_t count; /* reference count */ - u32 last_wd; /* the last wd allocated */ - const struct inotify_operations *in_ops; /* inotify caller operations */ -}; static inline void get_inotify_handle(struct inotify_handle *ih) { @@ -118,6 +106,9 @@ void put_inotify_watch(struct inotify_watch *watch) struct inotify_handle *ih = watch->ih; iput(watch->inode); + path_put(&watch->path); + watch->path.dentry = NULL; + watch->path.mnt = NULL; ih->in_ops->destroy_watch(watch); put_inotify_handle(ih); } @@ -476,6 +467,8 @@ void inotify_init_watch(struct inotify_watch *watch) INIT_LIST_HEAD(&watch->i_list); atomic_set(&watch->count, 0); get_inotify_watch(watch); /* initial get */ + watch->path.dentry = NULL; + watch->path.mnt = NULL; } EXPORT_SYMBOL_GPL(inotify_init_watch); @@ -616,8 +609,8 @@ EXPORT_SYMBOL_GPL(inotify_find_update_watch); * Caller must ensure it only calls inotify_add_watch() once per watch. * Calls inotify_handle_get_wd() so may sleep. */ -s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, - struct inode *inode, u32 mask) +s32 __inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, + struct path *path, struct inode * inode, u32 mask) { int ret = 0; int newly_watched; @@ -645,6 +638,10 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, * Save a reference to the inode and bump the ref count to make it * official. We hold a reference to nameidata, which makes this safe. 
*/ + if (path) { + path_get(path); + watch->path = *path; + } watch->inode = igrab(inode); /* Add the watch to the handle's and the inode's list */ @@ -666,6 +663,18 @@ out: } EXPORT_SYMBOL_GPL(inotify_add_watch); +s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, + struct inode *inode, u32 mask) +{ + return __inotify_add_watch(ih, watch, NULL, inode, mask); +} + +s32 inotify_add_watch_dget(struct inotify_handle *ih, + struct inotify_watch *watch, struct path *p, u32 mask) +{ + return __inotify_add_watch(ih, watch, p, p->dentry->d_inode, mask); +} + /** * inotify_clone_watch - put the watch next to existing one * @old: already installed watch diff --git a/fs/inotify_user.c b/fs/inotify_user.c index d85c7d9..62e2d29 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -66,47 +67,6 @@ static int inotify_max_queued_events __read_mostly; * first event, or to inotify_destroy(). */ -/* - * struct inotify_device - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_device { - wait_queue_head_t wq; /* wait queue for i/o */ - struct mutex ev_mutex; /* protects event queue */ - struct mutex up_mutex; /* synchronizes watch updates */ - struct list_head events; /* list of queued events */ - atomic_t count; /* reference count */ - struct user_struct *user; /* user who opened this dev */ - struct inotify_handle *ih; /* inotify handle */ - struct fasync_struct *fa; /* async notification */ - unsigned int queue_size; /* size of the queue (bytes) */ - unsigned int event_count; /* number of pending events */ - unsigned int max_events; /* maximum number of events */ -}; - -/* - * struct inotify_kernel_event - An inotify event, originating from a watch and - * queued for user-space. A list of these is attached to each instance of the - * device. In read(), this list is walked and all events that can fit in the - * buffer are returned. - * - * Protected by dev->ev_mutex of the device in which we are queued. - */ -struct inotify_kernel_event { - struct inotify_event event; /* the user-space event */ - struct list_head list; /* entry in inotify_device's list */ - char *name; /* filename, if any */ -}; - -/* - * struct inotify_user_watch - our version of an inotify_watch, we add - * a reference to the associated inotify_device. - */ -struct inotify_user_watch { - struct inotify_device *dev; /* associated device */ - struct inotify_watch wdata; /* inotify watch data */ -}; #ifdef CONFIG_SYSCTL @@ -383,8 +343,7 @@ static int find_inode(const char __user *dirname, struct path *path, * * Callers must hold dev->up_mutex. 
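/*
 * The inotify changes above make a watch pin more than the inode: the watch
 * now carries a struct path, __inotify_add_watch() copies the caller's path
 * and takes a path_get() reference on it, inotify_add_watch_dget() is the
 * convenience wrapper that passes the path through, and put_inotify_watch()
 * drops the reference with path_put() when the watch dies.  A condensed
 * sketch of that acquire/release pairing (the two-field structure below is
 * illustrative only, not the real struct inotify_watch):
 */
struct pinned_watch {
	struct path   path;	/* keeps dentry and vfsmount alive */
	struct inode *inode;	/* pinned separately via igrab()/iput() */
};

static void pinned_watch_attach(struct pinned_watch *w, struct path *p)
{
	path_get(p);				/* held for the watch lifetime */
	w->path  = *p;
	w->inode = igrab(p->dentry->d_inode);
}

static void pinned_watch_detach(struct pinned_watch *w)
{
	iput(w->inode);
	path_put(&w->path);			/* mirrors the path_get() above */
	w->path.dentry = NULL;
	w->path.mnt    = NULL;
}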
*/ -static int create_watch(struct inotify_device *dev, struct inode *inode, - u32 mask) +int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask) { struct inotify_user_watch *watch; int ret; @@ -404,12 +363,13 @@ static int create_watch(struct inotify_device *dev, struct inode *inode, atomic_inc(&dev->user->inotify_watches); inotify_init_watch(&watch->wdata); - ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); + ret = inotify_add_watch_dget(dev->ih, &watch->wdata, p, mask); if (ret < 0) free_inotify_user_watch(&watch->wdata); return ret; } +EXPORT_SYMBOL(inotify_create_watch); /* Device Interface */ @@ -565,7 +525,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, return ret; } -static const struct file_operations inotify_fops = { +const struct file_operations inotify_fops = { .poll = inotify_poll, .read = inotify_read, .fasync = inotify_fasync, @@ -573,6 +533,7 @@ static const struct file_operations inotify_fops = { .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; +EXPORT_SYMBOL(inotify_fops); static const struct inotify_operations inotify_user_ops = { .handle_event = inotify_dev_queue_event, @@ -662,6 +623,7 @@ asmlinkage long sys_inotify_init(void) { return sys_inotify_init1(0); } +EXPORT_SYMBOL(sys_inotify_init); asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask) { @@ -698,7 +660,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 m mutex_lock(&dev->up_mutex); ret = inotify_find_update_watch(dev->ih, inode, mask); if (ret == -ENOENT) - ret = create_watch(dev, inode, mask); + ret = inotify_create_watch(dev, &path, mask); mutex_unlock(&dev->up_mutex); path_put(&path); diff --git a/fs/ioprio.c b/fs/ioprio.c index da3cc46..e39b34f 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include static int set_task_ioprio(struct task_struct *task, int ioprio) { @@ -71,8 +73,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) int data = IOPRIO_PRIO_DATA(ioprio); struct task_struct *p, *g; struct user_struct *user; - struct pid *pgrp; int ret; + struct pid *pgrp; + + if (!ve_is_super(get_exec_env())) + return -EPERM; switch (class) { case IOPRIO_CLASS_RT: @@ -130,17 +135,23 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) if (!user) break; - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } while_each_thread_all(g, p); free_uid: if (who) free_uid(user); break; + case IOPRIO_WHO_UBC: + if (class != IOPRIO_CLASS_BE) + return -ERANGE; + + ret = bc_set_ioprio(who, data); + break; default: ret = -EINVAL; } @@ -185,9 +196,9 @@ asmlinkage long sys_ioprio_get(int which, int who) { struct task_struct *g, *p; struct user_struct *user; - struct pid *pgrp; int ret = -ESRCH; int tmpio; + struct pid *pgrp; read_lock(&tasklist_lock); switch (which) { @@ -223,7 +234,7 @@ asmlinkage long sys_ioprio_get(int which, int who) if (!user) break; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (p->uid != user->uid) continue; tmpio = get_task_ioprio(p); @@ -233,7 +244,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who) free_uid(user); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 31668b6..d7f9400 100644 --- a/fs/lockd/clntproc.c +++ 
b/fs/lockd/clntproc.c @@ -156,12 +156,15 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) { struct nlm_rqst *call; int status; + struct ve_struct *ve; nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + ve = set_exec_env(host->owner_env); + nlmclnt_locks_init_private(fl, host); /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); @@ -181,6 +184,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) fl->fl_ops = NULL; dprintk("lockd: clnt proc returns %d\n", status); + (void)set_exec_env(ve); return status; } EXPORT_SYMBOL_GPL(nlmclnt_proc); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c..cfa0cf3 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -53,6 +53,7 @@ static struct nlm_host *nlm_lookup_host(int server, struct nlm_host *host; struct nsm_handle *nsm = NULL; int hash; + struct ve_struct *ve; dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT ", p=%d, v=%u, my role=%s, name=%.*s)\n", @@ -78,10 +79,14 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ + + ve = get_exec_env(); chain = &nlm_hosts[hash]; hlist_for_each_entry(host, pos, chain, h_hash) { if (!nlm_cmp_addr(&host->h_addr, sin)) continue; + if (!ve_accessible_strict(host->owner_env, ve)) + continue; /* See if we have an NSM handle for this client */ if (!nsm) @@ -141,6 +146,7 @@ static struct nlm_host *nlm_lookup_host(int server, spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); + host->owner_env = ve; nrhosts++; out: @@ -454,6 +460,52 @@ nlm_gc_hosts(void) next_gc = jiffies + NLM_HOST_COLLECT; } +#ifdef CONFIG_VE +void ve_nlm_shutdown_hosts(struct ve_struct *ve) +{ + envid_t veid = ve->veid; + int i; + + dprintk("lockd: shutting down host module for ve %d\n", veid); + mutex_lock(&nlm_host_mutex); + + /* Perform a garbage collection pass */ + for (i = 0; i < NLM_HOST_NRHASH; i++) { + struct nlm_host *host; + struct hlist_node *pos; + + hlist_for_each_entry(host, pos, &nlm_hosts[i], h_hash) { + struct rpc_clnt *clnt; + + if (ve != host->owner_env) + continue; + + hlist_del(&host->h_hash); + if (host->h_nsmhandle) + host->h_nsmhandle->sm_monitored = 0; + dprintk("lockd: delete host %s ve %d\n", host->h_name, + veid); + if ((clnt = host->h_rpcclnt) != NULL) { + if (!list_empty(&clnt->cl_tasks)) { + struct rpc_xprt *xprt; + + printk(KERN_WARNING + "lockd: active RPC handle\n"); + rpc_killall_tasks(clnt); + xprt = clnt->cl_xprt; + xprt_disconnect_done(xprt); + xprt->ops->close(xprt); + } else + rpc_shutdown_client(clnt); + } + kfree(host); + nrhosts--; + } + } + + mutex_unlock(&nlm_host_mutex); +} +#endif /* * Manage NSM handles diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5bd9bf0..2a9b08c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -48,11 +49,13 @@ struct nlmsvc_binding * nlmsvc_ops; EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); -static unsigned int nlmsvc_users; -static struct task_struct *nlmsvc_task; -static struct svc_rqst *nlmsvc_rqst; -int nlmsvc_grace_period; -unsigned long nlmsvc_timeout; +#ifndef CONFIG_VE +static unsigned int _nlmsvc_users; +static struct task_struct *_nlmsvc_task; +static struct svc_rqst *_nlmsvc_rqst; +int _nlmsvc_grace_period; +unsigned long _nlmsvc_timeout; +#endif /* * These can be set at insmod time (useful for 
NFS as root filesystem), @@ -175,6 +178,10 @@ lockd(void *vrqstp) */ err = svc_recv(rqstp, timeout); if (err == -EAGAIN || err == -EINTR) { +#ifdef CONFIG_VE + if (!get_exec_env()->is_running) + break; +#endif preverr = err; continue; } @@ -328,12 +335,12 @@ lockd_down(void) } else { printk(KERN_ERR "lockd_down: no users! task=%p\n", nlmsvc_task); - BUG(); + goto out; } if (!nlmsvc_task) { printk(KERN_ERR "lockd_down: no lockd running.\n"); - BUG(); + goto out; } kthread_stop(nlmsvc_task); svc_exit_thread(nlmsvc_rqst); @@ -478,6 +485,29 @@ static int lockd_authenticate(struct svc_rqst *rqstp) return SVC_DENIED; } +#ifdef CONFIG_VE +extern void ve_nlm_shutdown_hosts(struct ve_struct *ve); + +static int ve_lockd_start(void *data) +{ + return 0; +} + +static void ve_lockd_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + + ve_nlm_shutdown_hosts(ve); + flush_scheduled_work(); +} + +static struct ve_hook lockd_hook = { + .init = ve_lockd_start, + .fini = ve_lockd_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, @@ -505,16 +535,20 @@ module_param(nsm_use_hostnames, bool, 0644); static int __init init_nlm(void) { + ve_hook_register(VE_SS_CHAIN, &lockd_hook); #ifdef CONFIG_SYSCTL nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); - return nlm_sysctl_table ? 0 : -ENOMEM; -#else - return 0; + if (nlm_sysctl_table == NULL) { + ve_hook_unregister(&lockd_hook); + return -ENOMEM; + } #endif + return 0; } static void __exit exit_nlm(void) { + ve_hook_unregister(&lockd_hook); /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); #ifdef CONFIG_SYSCTL diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 198b4e5..7c9fb4f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -335,6 +335,9 @@ nlmsvc_is_client(void *data, struct nlm_host *dummy) { struct nlm_host *host = data; + if (!ve_accessible_strict(host->owner_env, get_exec_env())) + return 0; + if (host->h_server) { /* we are destroying locks even though the client * hasn't asked us too, so don't unmonitor the diff --git a/fs/locks.c b/fs/locks.c index 5eb259e..c2f755b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -130,6 +130,8 @@ #include +#include + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) @@ -146,9 +148,25 @@ static LIST_HEAD(blocked_list); static struct kmem_cache *filelock_cache __read_mostly; /* Allocate an empty lock structure. 
*/ -static struct file_lock *locks_alloc_lock(void) +static struct file_lock *locks_alloc_lock(int charge) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl; + + fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + if (fl == NULL) + goto out; + fl->fl_charged = 0; + if (!charge) + goto out; + if (!ub_flock_charge(fl, 1)) + goto out; + + kmem_cache_free(filelock_cache, fl); + fl = NULL; +out: +#endif + return fl; } static void locks_release_private(struct file_lock *fl) @@ -173,6 +191,7 @@ static void locks_free_lock(struct file_lock *fl) BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!list_empty(&fl->fl_link)); + ub_flock_uncharge(fl); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } @@ -276,7 +295,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, if (type < 0) return type; - fl = locks_alloc_lock(); + fl = locks_alloc_lock(type != F_UNLCK); if (fl == NULL) return -ENOMEM; @@ -463,7 +482,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) /* Allocate a file_lock initialised to this type of lease */ static struct file_lock *lease_alloc(struct file *filp, int type) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lock *fl = locks_alloc_lock(1); int error = -ENOMEM; if (fl == NULL) @@ -734,8 +753,13 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto find_conflict; if (request->fl_type != F_UNLCK) { + /* + * Nont F_UNLCK request must be already charged in + * flock_make_lock(). Actually new_fl must be charged not the + * request, but we try to fail earlier. + */ error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(0); if (new_fl == NULL) goto out; error = 0; @@ -787,6 +811,10 @@ find_conflict: } if (request->fl_flags & FL_ACCESS) goto out; + + set_flock_charged(new_fl); + unset_flock_charged(request); + locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; @@ -818,8 +846,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); + if (request->fl_type != F_UNLCK) + new_fl = locks_alloc_lock(1); + else + new_fl = NULL; + new_fl2 = locks_alloc_lock(0); } lock_kernel(); @@ -953,7 +984,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * bail out. */ error = -ENOLCK; /* "no luck" */ - if (right && left == right && !new_fl2) + if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2)) goto out; error = 0; @@ -964,23 +995,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } - if (!new_fl) { - error = -ENOLCK; + error = -ENOLCK; + if (!new_fl) + goto out; + if (right && (left == right) && ub_flock_charge(new_fl, 1)) goto out; - } locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; + error = 0; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. 
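/*
 * The fs/locks.c changes above follow one accounting rule: a file_lock that
 * may actually be inserted is charged to the owner's beancounter with
 * ub_flock_charge() right at allocation time, the allocation is rolled back
 * if the charge is refused, and locks_free_lock() drops the charge with
 * ub_flock_uncharge().  F_UNLCK requests never create a lock, so they are
 * allocated with charge == 0.  A minimal sketch of that shape, assuming the
 * ub_flock_* helpers this patch relies on (a non-zero return from
 * ub_flock_charge() means the charge was refused, as in the hunk above;
 * the helper names here are ours):
 */
static struct file_lock *alloc_charged_lock(int charge)
{
	struct file_lock *fl;

	fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL);
	if (fl == NULL)
		return NULL;

	fl->fl_charged = 0;			/* not accounted yet */
	if (charge && ub_flock_charge(fl, 1)) {
		/* beancounter refused: undo the allocation instead of leaking */
		kmem_cache_free(filelock_cache, fl);
		return NULL;
	}
	return fl;
}

static void free_charged_lock(struct file_lock *fl)
{
	ub_flock_uncharge(fl);		/* harmless if the lock was never charged */
	kmem_cache_free(filelock_cache, fl);
}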
*/ + error = -ENOLCK; + if (added && ub_flock_charge(new_fl2, + request->fl_type != F_UNLCK)) + goto out; + /* FIXME move all fl_charged manipulations in ub code */ + set_flock_charged(new_fl2); left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock(before, left); + error = 0; } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1365,7 +1405,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) if (arg != F_UNLCK) { error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(1); if (new_fl == NULL) goto out; @@ -1608,6 +1648,7 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd) out: return error; } +EXPORT_SYMBOL_GPL(sys_flock); /** * vfs_test_lock - test file byte range lock @@ -1768,7 +1809,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock flock; struct inode *inode; struct file *f; @@ -1886,7 +1927,7 @@ out: int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock64 flock; struct inode *inode; struct file *f; @@ -2156,6 +2197,8 @@ static int locks_show(struct seq_file *f, void *v) struct file_lock *fl, *bfl; fl = list_entry(v, struct file_lock, fl_link); + if (!ve_accessible(fl->fl_file->owner_env, get_exec_env())) + goto out; lock_get_status(f, fl, (long)f->private, ""); @@ -2163,6 +2206,7 @@ static int locks_show(struct seq_file *f, void *v) lock_get_status(f, bfl, (long)f->private, " ->"); f->private++; +out: return 0; } @@ -2272,7 +2316,7 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, + sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC, init_once); return 0; } diff --git a/fs/namei.c b/fs/namei.c index 4ea63ed..a7c0e50 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -141,6 +141,7 @@ char * getname(const char __user * filename) { char *tmp, *result; + /*ub_dentry_checkup();*/ result = ERR_PTR(-ENOMEM); tmp = __getname(); if (tmp) { @@ -431,6 +432,21 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, if (!dentry) dentry = d_lookup(parent, name); + /* + * The revalidation rules are simple: + * d_revalidate operation is called when we're about to use a cached + * dentry rather than call d_lookup. + * d_revalidate method may unhash the dentry itself or return FALSE, in + * which case if the dentry can be released d_lookup will be called. + * + * Additionally, by request of NFS people + * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) + * d_revalidate is called when `/', `.' or `..' are looked up. + * Since re-lookup is impossible on them, we introduce a hack and + * return an error in this case. 
+ * + * 2003/02/19 SAW + */ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) dentry = do_revalidate(dentry, nd); @@ -489,6 +505,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s struct dentry * result; struct inode *dir = parent->d_inode; +repeat: mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created @@ -535,7 +552,7 @@ out_unlock: if (result->d_op && result->d_op->d_revalidate) { result = do_revalidate(result, nd); if (!result) - result = ERR_PTR(-ENOENT); + goto repeat; } return result; } @@ -775,6 +792,13 @@ static __always_inline void follow_dotdot(struct nameidata *nd) read_unlock(&fs->lock); break; } +#ifdef CONFIG_VE + if (nd->path.dentry == get_exec_env()->root_path.dentry && + nd->path.mnt == get_exec_env()->root_path.mnt) { + read_unlock(¤t->fs->lock); + break; + } +#endif read_unlock(&fs->lock); spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { @@ -816,6 +840,10 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: + if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; __follow_mount(path); @@ -853,6 +881,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) struct inode *inode; int err; unsigned int lookup_flags = nd->flags; + int real_components = 0; while (*name=='/') name++; @@ -923,6 +952,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) break; } /* This does the actual lookups.. */ + real_components++; err = do_lookup(nd, &this, &next); if (err) break; @@ -936,6 +966,9 @@ static int __link_path_walk(const char *name, struct nameidata *nd) goto out_dput; if (inode->i_op->follow_link) { + err = -ENOENT; + if (lookup_flags & LOOKUP_STRICT) + goto out_dput; err = do_follow_link(&next, nd); if (err) goto return_err; @@ -984,6 +1017,7 @@ last_component: break; inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) + && !(lookup_flags & LOOKUP_STRICT) && inode && inode->i_op && inode->i_op->follow_link) { err = do_follow_link(&next, nd); if (err) @@ -1005,27 +1039,41 @@ lookup_parent: nd->last_type = LAST_NORM; if (this.name[0] != '.') goto return_base; - if (this.len == 1) + if (this.len == 1) { nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') + goto return_reval; + } else if (this.len == 2 && this.name[1] == '.') { nd->last_type = LAST_DOTDOT; - else - goto return_base; + goto return_reval; + } +return_base: + if (!(nd->flags & LOOKUP_NOAREACHECK)) { + err = check_area_access_ve(&nd->path); + if (err) + break; + } + return 0; return_reval: /* * We bypassed the ordinary revalidation routines. * We may need to check the cached dentry for staleness. */ - if (nd->path.dentry && nd->path.dentry->d_sb && + if (!real_components && nd->path.dentry && nd->path.dentry->d_sb && (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->path.dentry->d_op->d_revalidate( nd->path.dentry, nd)) + /* + * This lookup is for `/' or `.' or `..'. + * The filesystem unhashed the dentry itself + * inside d_revalidate (otherwise, d_invalidate + * wouldn't succeed). As a special courtesy to + * NFS we return an error. 
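/*
 * Two related barriers are added to the path walk above.  First,
 * follow_dotdot() now stops not only at the task's own fs->root but also at
 * the container root kept in get_exec_env()->root_path, so a chain of ".."
 * components can never climb above the VE root.  Second, the LOOKUP_STRICT
 * flag makes do_lookup() and __link_path_walk() fail with -ENOENT rather
 * than cross a mountpoint or follow a symlink, and return_base now runs
 * check_area_access_ve() on the final path unless LOOKUP_NOAREACHECK is set.
 * The ".." barrier itself reduces to this comparison (locking and the
 * non-VE case handled as in the hunk above; the helper name is ours):
 */
static int walk_hit_ve_root(const struct path *pos)
{
#ifdef CONFIG_VE
	struct ve_struct *ve = get_exec_env();

	return pos->dentry == ve->root_path.dentry &&
	       pos->mnt    == ve->root_path.mnt;
#else
	return 0;
#endif
}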
2003/02/19 SAW + */ break; } -return_base: - return 0; + goto return_base; out_dput: path_put_conditional(&next, nd); break; @@ -2045,6 +2093,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev) { return sys_mknodat(AT_FDCWD, filename, mode, dev); } +EXPORT_SYMBOL_GPL(sys_mknod); int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { @@ -2105,6 +2154,7 @@ asmlinkage long sys_mkdir(const char __user *pathname, int mode) { return sys_mkdirat(AT_FDCWD, pathname, mode); } +EXPORT_SYMBOL_GPL(sys_mkdir); /* * We try to drop the dentry early: we should have @@ -2132,6 +2182,7 @@ void dentry_unhash(struct dentry *dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(sys_symlink); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -2212,6 +2263,7 @@ asmlinkage long sys_rmdir(const char __user *pathname) { return do_rmdir(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_rmdir); int vfs_unlink(struct inode *dir, struct dentry *dentry) { @@ -2312,6 +2364,7 @@ asmlinkage long sys_unlink(const char __user *pathname) { return do_unlinkat(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_unlink); int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { @@ -2475,6 +2528,7 @@ asmlinkage long sys_link(const char __user *oldname, const char __user *newname) { return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +EXPORT_SYMBOL(sys_rename); /* * The worst of all namespace operations - renaming directory. "Perverted" @@ -2586,6 +2640,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); const char *old_name; + if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) + return -EXDEV; + if (old_dentry->d_inode == new_dentry->d_inode) return 0; diff --git a/fs/namespace.c b/fs/namespace.c index 6e283c9..746610b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -37,6 +37,7 @@ /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +EXPORT_SYMBOL(vfsmount_lock); static int event; static DEFINE_IDA(mnt_id_ida); @@ -44,7 +45,8 @@ static DEFINE_IDA(mnt_group_ida); static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +struct rw_semaphore namespace_sem; +EXPORT_SYMBOL_GPL(namespace_sem); /* /sys/fs */ struct kobject *fs_kobj; @@ -116,11 +118,12 @@ struct vfsmount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup(name, GFP_KERNEL_UBC); if (!mnt->mnt_devname) goto out_free_id; } + mnt->owner = VEID(get_exec_env()); atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -797,15 +800,48 @@ static void show_type(struct seq_file *m, struct super_block *sb) } } +static int prepare_mnt_root_mangle(struct path *path, + char **path_buf, char **ret_path) +{ + /* skip FS_NOMOUNT mounts (rootfs) */ + if (path->mnt->mnt_sb->s_flags & MS_NOUSER) + return -EACCES; + + *path_buf = (char *)__get_free_page(GFP_KERNEL); + if (!*path_buf) + return -ENOMEM; + + *ret_path = d_path(path, *path_buf, PAGE_SIZE); + if (IS_ERR(*ret_path)) { + free_page((unsigned long)*path_buf); + /* + * This means that the file position will be incremented, i.e. + * the total number of "invisible" vfsmnt will leak. 
+ */ + return -EACCES; + } + return 0; +} + static int show_vfsmnt(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; + int err; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + char *path_buf, *path; - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); + + if (ve_is_super(get_exec_env()) || + !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC)) + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + else + mangle(m, mnt->mnt_sb->s_type->name); seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); + mangle(m, path); + free_page((unsigned long) path_buf); seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); @@ -892,18 +928,27 @@ static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; + char *path_buf, *path; + int err; + + err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); /* device */ if (mnt->mnt_devname) { seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); + if (ve_is_super(get_exec_env())) + mangle(m, mnt->mnt_devname); + else + mangle(m, mnt->mnt_sb->s_type->name); } else seq_puts(m, "no device"); /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); + mangle(m, path); + free_page((unsigned long)path_buf); seq_putc(m, ' '); /* file system type */ @@ -1120,6 +1165,34 @@ static int do_umount(struct vfsmount *mnt, int flags) return retval; } +#ifdef CONFIG_VE +void umount_ve_fs_type(struct file_system_type *local_fs_type) +{ + struct vfsmount *mnt; + struct list_head *p, *q; + LIST_HEAD(kill); + LIST_HEAD(umount_list); + + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); + list_for_each_safe(p, q, ¤t->nsproxy->mnt_ns->list) { + mnt = list_entry(p, struct vfsmount, mnt_list); + if (mnt->mnt_sb->s_type != local_fs_type) + continue; + list_del(p); + list_add(p, &kill); + } + + while (!list_empty(&kill)) { + mnt = list_entry(kill.next, struct vfsmount, mnt_list); + umount_tree(mnt, 1, &umount_list); + } + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} +#endif + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -1143,7 +1216,7 @@ asmlinkage long sys_umount(char __user * name, int flags) goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) goto dput_and_out; retval = do_umount(path.mnt, flags); @@ -1169,7 +1242,7 @@ asmlinkage long sys_oldumount(char __user * name) static int mount_is_safe(struct nameidata *nd) { - if (capable(CAP_SYS_ADMIN)) + if (capable(CAP_VE_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1439,6 +1512,8 @@ static noinline int do_change_type(struct nameidata *nd, int flag) if (nd->path.dentry != nd->path.mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid)) + return -EPERM; down_write(&namespace_sem); if (type == MS_SHARED) { @@ -1462,7 +1537,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) * noinline this do_mount helper to save do_mount stack space. 
*/ static noinline int do_loopback(struct nameidata *nd, char *old_name, - int recurse) + int recurse, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; @@ -1492,6 +1567,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name, if (!mnt) goto out; + mnt->mnt_flags |= mnt_flags; err = graft_tree(mnt, &nd->path); if (err) { LIST_HEAD(umount_list); @@ -1536,7 +1612,7 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, int err; struct super_block *sb = nd->path.mnt->mnt_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!check_mnt(nd->path.mnt)) @@ -1545,6 +1621,9 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, if (nd->path.dentry != nd->path.mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid)) + return -EPERM; + down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(nd->path.mnt, flags); @@ -1577,7 +1656,7 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name) struct path parent_path; struct vfsmount *p; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1585,6 +1664,10 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name) if (err) return err; + err = -EPERM; + if (!ve_accessible_veid(old_nd.path.mnt->owner, get_exec_env()->veid)) + goto out_nosem; + down_write(&namespace_sem); while (d_mountpoint(nd->path.dentry) && follow_down(&nd->path.mnt, &nd->path.dentry)) @@ -1642,6 +1725,7 @@ out: up_write(&namespace_sem); if (!err) path_put(&parent_path); +out_nosem: path_put(&old_nd.path); return err; } @@ -1660,7 +1744,7 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; mnt = do_kern_mount(type, flags, name, data); @@ -1699,6 +1783,11 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, goto unlock; newmnt->mnt_flags = mnt_flags; + + /* make this before graft_tree reveals mnt_root to the world... */ + if (path->dentry->d_flags & DCACHE_VIRTUAL) + newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; + if ((err = graft_tree(newmnt, path))) goto unlock; @@ -1953,7 +2042,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags & MS_REC, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) @@ -2089,6 +2178,7 @@ out1: free_page(type_page); return retval; } +EXPORT_SYMBOL_GPL(sys_mount); /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 
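/*
 * The mount-management hunks above all apply the same two-part permission
 * test.  alloc_vfsmnt() records the creating container in mnt->owner;
 * sys_umount() and the mount(2) sub-operations then require CAP_VE_SYS_ADMIN
 * instead of the global CAP_SYS_ADMIN, and remount, move and
 * propagation-type changes additionally check that the vfsmount is owned by
 * the caller's VE.  Written out as one helper (illustrative only; the real
 * checks stay inline in do_remount(), do_move_mount() and do_change_type()):
 */
static int ve_may_manipulate_mount(struct vfsmount *mnt)
{
	if (!capable(CAP_VE_SYS_ADMIN))
		return -EPERM;		/* not even admin inside the container */
	if (!ve_accessible_veid(mnt->owner, get_exec_env()->veid))
		return -EPERM;		/* mount belongs to another container */
	return 0;
}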
@@ -2131,7 +2221,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) struct fs_struct *fs; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_ve(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -2146,7 +2236,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) put_fs_struct(fs); } else task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); read_unlock(&tasklist_lock); } @@ -2315,7 +2405,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); @@ -2352,3 +2442,4 @@ void __put_mnt_ns(struct mnt_namespace *ns) release_mounts(&umount_list); kfree(ns); } +EXPORT_SYMBOL_GPL(__put_mnt_ns); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7..8d5d75a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -127,6 +127,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ atomic_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; + clp->owner_env = get_exec_env(); memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; @@ -257,6 +258,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) { struct nfs_client *clp; + struct ve_struct *ve = get_exec_env(); spin_lock(&nfs_client_lock); list_for_each_entry(clp, &nfs_client_list, cl_share_link) { @@ -272,6 +274,9 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) if (addr->sa_family != clap->sa_family) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(addr, clap)) continue; @@ -292,6 +297,7 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) { struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; u32 nfsvers = clp->rpc_ops->version; + struct ve_struct *ve = get_exec_env(); spin_lock(&nfs_client_lock); list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { @@ -307,6 +313,9 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) if (sap->sa_family != clap->sa_family) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(sap, clap)) continue; @@ -326,7 +335,9 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + struct ve_struct *ve; + ve = get_exec_env(); list_for_each_entry(clp, &nfs_client_list, cl_share_link) { /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e9b2017..4ffeff3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -51,6 +51,9 @@ #include #include #include +#include +#include +#include #include #include @@ -217,7 +220,8 @@ static struct file_system_type nfs_fs_type = { .name = "nfs", .get_sb = nfs_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; struct file_system_type nfs_xdev_fs_type = { @@ -225,7 +229,8 @@ 
struct file_system_type nfs_xdev_fs_type = { .name = "nfs", .get_sb = nfs_xdev_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; static const struct super_operations nfs_sops = { @@ -292,6 +297,55 @@ static struct shrinker acl_shrinker = { .seeks = DEFAULT_SEEKS, }; +#ifdef CONFIG_VE +static int ve_nfs_start(void *data) +{ + return 0; +} + +static void ve_nfs_stop(void *data) +{ + struct ve_struct *ve; + struct super_block *sb; + + flush_scheduled_work(); + + ve = (struct ve_struct *)data; + /* Basically, on a valid stop we can be here iff NFS was mounted + read-only. In such a case client force-stop is not a problem. + If we are here and NFS is read-write, we are in a FORCE stop, so + force the client to stop. + Lock daemon is already dead. + Only superblock client remains. Den */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + struct rpc_clnt *clnt; + struct rpc_xprt *xprt; + if (sb->s_type != &nfs_fs_type) + continue; + clnt = NFS_SB(sb)->client; + if (!ve_accessible_strict(clnt->cl_xprt->owner_env, ve)) + continue; + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + + xprt = clnt->cl_xprt; + xprt_disconnect_done(xprt); + xprt->ops->close(xprt); + } + spin_unlock(&sb_lock); + + flush_scheduled_work(); +} + +static struct ve_hook nfs_hook = { + .init = ve_nfs_start, + .fini = ve_nfs_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + /* * Register the NFS filesystems */ @@ -312,6 +366,7 @@ int __init register_nfs_fs(void) goto error_2; #endif register_shrinker(&acl_shrinker); + ve_hook_register(VE_SS_CHAIN, &nfs_hook); return 0; #ifdef CONFIG_NFS_V4 @@ -330,6 +385,7 @@ error_0: void __exit unregister_nfs_fs(void) { unregister_shrinker(&acl_shrinker); + ve_hook_unregister(&nfs_hook); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); #endif @@ -1955,6 +2011,11 @@ static int nfs_get_sb(struct file_system_type *fs_type, .mntflags = flags, }; int error = -ENOMEM; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!ve_is_super(ve) && !(ve->features & VE_FEATURE_NFS)) + return -ENODEV; data = kzalloc(sizeof(*data), GFP_KERNEL); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); @@ -2064,6 +2125,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, .mntflags = flags, }; int error; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!ve_is_super(ve) && !(ve->features & VE_FEATURE_NFS)) + return -ENODEV; dprintk("--> nfs_xdev_get_sb()\n"); diff --git a/fs/open.c b/fs/open.c index 07da935..50873f8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +52,21 @@ int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +int faudit_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct faudit_statfs_arg arg; + + arg.sb = sb; + arg.stat = buf; + + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) + != NOTIFY_DONE) + return arg.err; + return 0; +} + +static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt, + struct statfs *buf) { struct kstatfs st; int retval; @@ -60,6 +75,10 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + 
if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -95,7 +114,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) return 0; } -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt, + struct statfs64 *buf) { struct kstatfs st; int retval; @@ -104,6 +124,10 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -130,7 +154,7 @@ asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * b error = user_path(pathname, &path); if (!error) { struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); + error = vfs_statfs_native(path.dentry, path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -149,7 +173,7 @@ asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct stat error = user_path(pathname, &path); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); + error = vfs_statfs64(path.dentry, path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -168,7 +192,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); + error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -189,7 +213,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); + error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -692,6 +716,7 @@ out_release: out: return error; } +EXPORT_SYMBOL_GPL(sys_chown); asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) @@ -930,6 +955,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) return filp; } +int odirect_enable = 0; /* * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an * error. @@ -951,6 +977,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) return ERR_PTR(-EINVAL); } + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + flags &= ~O_DIRECT; + error = -ENFILE; f = get_empty_filp(); if (f == NULL) { @@ -1041,6 +1070,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode) asmlinkage_protect(3, ret, filename, flags, mode); return ret; } +EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ecc3330..a8cc218 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -131,6 +131,7 @@ char *disk_name(struct gendisk *hd, int part, char *buf) return buf; } +EXPORT_SYMBOL(disk_name); const char *bdevname(struct block_device *bdev, char *buf) { diff --git a/fs/pipe.c b/fs/pipe.c index fcba654..2fd4b51 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -22,6 +22,8 @@ #include #include +#include + /* * We use a start+len construction, which provides full use of the * allocated memory. 
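/*
 * vfs_statfs_native() and vfs_statfs64() above gain a vfsmount argument for
 * one purpose: after the filesystem has filled in the raw numbers, the
 * superblock is handed to faudit_statfs(), which runs the VITYPE_FAUDIT
 * notifier chain so the virtualization layer can adjust or reject the
 * statistics a container is about to see.  (The same file also adds the
 * odirect_enable knob: unless it is set, O_DIRECT is silently dropped in
 * dentry_open() for tasks without CAP_SYS_RAWIO.)  The call sequence both
 * statfs flavours now share, condensed into one helper with a name of our
 * own:
 */
static int vfs_statfs_audited(struct dentry *dentry, struct vfsmount *mnt,
			      struct kstatfs *st)
{
	int err;

	err = vfs_statfs(dentry, st);		/* raw numbers from the fs */
	if (err)
		return err;

	/* let the per-VE audit hook rewrite or veto them */
	return faudit_statfs(mnt->mnt_sb, st);
}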
@@ -478,7 +480,7 @@ redo1: int error, atomic = 1; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_UBC); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -821,7 +823,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC); if (pipe) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; @@ -1046,6 +1048,7 @@ int do_pipe(int *fd) { return do_pipe_flags(fd, 0); } +EXPORT_SYMBOL_GPL(do_pipe); /* * sys_pipe() is the normal C calling standard for creating diff --git a/fs/proc/array.c b/fs/proc/array.c index 71c9be5..8359842 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -82,6 +82,8 @@ #include #include +#include + #include #include #include "internal.h" @@ -208,6 +210,15 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, put_group_info(group_info); seq_printf(m, "\n"); + +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) { + seq_printf(m, "envID:\t%d\nVPid:\t%d\n", + p->ve_task_info.owner_env->veid, task_pid_vnr(p)); + seq_printf(m, "PNState:\t%u\nStopState:\t%u\n", + p->pn_state, p->stopped_state); + } +#endif } static void render_sigset_t(struct seq_file *m, const char *header, @@ -247,10 +258,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, } } -static inline void task_sig(struct seq_file *m, struct task_struct *p) +void task_sig(struct seq_file *m, struct task_struct *p) { unsigned long flags; - sigset_t pending, shpending, blocked, ignored, caught; + sigset_t pending, shpending, blocked, ignored, caught, saved; int num_threads = 0; unsigned long qsize = 0; unsigned long qlim = 0; @@ -260,12 +271,14 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); + sigemptyset(&saved); rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; + saved = p->saved_sigmask; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&p->user->sigpending); @@ -283,6 +296,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); render_sigset_t(m, "SigCgt:\t", &caught); + render_sigset_t(m, "SigSvd:\t", &saved); } static void render_cap_t(struct seq_file *m, const char *header, @@ -306,6 +320,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) render_cap_t(m, "CapBnd:\t", &p->cap_bset); } +#ifdef CONFIG_BEANCOUNTERS +static inline void ub_dump_task_info(struct task_struct *tsk, + char *stsk, int ltsk, char *smm, int lmm) +{ + print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); + task_lock(tsk); + if (tsk->mm) + print_ub_uid(tsk->mm->mm_ub, smm, lmm); + else + strncpy(smm, "N/A", lmm); + task_unlock(tsk); +} +#endif + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -319,6 +347,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); +#ifdef CONFIG_BEANCOUNTERS + char tsk_ub_info[64], mm_ub_info[64]; +#endif task_name(m, task); task_state(m, ns, pid, task); @@ -334,6 +365,14 @@ int proc_pid_status(struct seq_file 
*m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, + tsk_ub_info, sizeof(tsk_ub_info), + mm_ub_info, sizeof(mm_ub_info)); + + seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info); + seq_printf(m, "MMUB:\t%s\n", mm_ub_info); +#endif return 0; } @@ -356,6 +395,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; +#ifdef CONFIG_BEANCOUNTERS + char ub_task_info[64]; + char ub_mm_info[64]; +#endif state = *get_task_state(task); vsize = eip = esp = 0; @@ -434,6 +477,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, priority = task_prio(task); nice = task_nice(task); +#ifndef CONFIG_VE /* Temporary variable needed for gcc-2.96 */ /* convert timespec -> nsec*/ start_time = @@ -441,10 +485,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); +#else + start_time = ve_relative_clock(&task->start_time); +#endif + +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info), + ub_mm_info, sizeof(ub_mm_info)); +#endif seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld" +#ifdef CONFIG_VE + " 0 0 0 0 0 0 0 %d %u" +#endif +#ifdef CONFIG_BEANCOUNTERS + " %s %s" +#endif + "\n", pid_nr_ns(pid, ns), tcomm, state, @@ -491,7 +550,16 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime) +#ifdef CONFIG_VE + , task_pid_vnr(task), + VEID(VE_TASK_INFO(task)->owner_env) +#endif +#ifdef CONFIG_BEANCOUNTERS + , ub_task_info, + ub_mm_info +#endif + ); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index a28840b..98f82bc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -187,10 +187,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path) } if (fs) { read_lock(&fs->lock); - *path = fs->pwd; - path_get(&fs->pwd); + result = d_root_check(&fs->pwd); + if (result == 0) { + *path = fs->pwd; + path_get(&fs->pwd); + } read_unlock(&fs->lock); - result = 0; put_fs_struct(fs); } return result; @@ -538,17 +540,31 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) static int proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + int err; + /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ + err = -ENOENT; task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + if (ptrace_may_access(task, PTRACE_MODE_READ)) + err = 0; + else + /* + * This clever ptrace_may_attach() may play a trick + * on us. If the task is zombie it will consider this + * task to be not dumpable at all and will deny any + * ptracing in VE. Not a big deal for ptrace(), but + * following the link will fail with the -EACCESS + * reason. Some software is unable to stand such a + * swindle and refuses to work :( + */ + err = (task->mm ? 
-EACCES : -ENOENT); put_task_struct(task); } - return allowed; + return err; } static int proc_setattr(struct dentry *dentry, struct iattr *attr) @@ -1023,6 +1039,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; + if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env())) + return -EPERM; if (*end == '\n') end++; task = get_proc_task(file->f_path.dentry->d_inode); @@ -1277,6 +1295,7 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) mm->exe_file = new_exe_file; mm->num_exe_file_vmas = 0; } +EXPORT_SYMBOL(set_mm_exe_file); struct file *get_mm_exe_file(struct mm_struct *mm) { @@ -1315,10 +1334,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) exe_file = get_mm_exe_file(mm); mmput(mm); if (exe_file) { - *exe_path = exe_file->f_path; - path_get(&exe_file->f_path); + int result; + + result = d_root_check(&exe_file->f_path); + if (result == 0) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + } fput(exe_file); - return 0; + return result; } else return -ENOENT; } @@ -1326,13 +1350,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - int error = -EACCES; + int error; /* We don't need a base pointer in the /proc filesystem */ path_put(&nd->path); /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); @@ -1367,12 +1392,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { - int error = -EACCES; + int error; struct inode *inode = dentry->d_inode; struct path path; /* Are we allowed to snoop on the tasks file descriptors? 
*/ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &path); @@ -1613,6 +1639,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); + int err = -ENOENT; if (task) { files = get_files_struct(task); @@ -1625,7 +1652,8 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) */ spin_lock(&files->file_lock); file = fcheck_files(files, fd); - if (file) { + err = -EACCES; + if (file && !d_root_check(&file->f_path)) { if (path) { *path = file->f_path; path_get(&file->f_path); @@ -1643,7 +1671,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) spin_unlock(&files->file_lock); put_files_struct(files); } - return -ENOENT; + return err; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2410,7 +2438,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_struct *t = task; task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) + while_each_thread_ve(task, t) task_io_accounting_add(&acct, &t->ioac); unlock_task_sighand(task, &flags); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7821589..44604d5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -228,6 +228,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) struct proc_dir_entry *de = PDE(inode); int error; + if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) && + LPDE(inode) == PDE(inode)) + return -EPERM; + error = inode_change_ok(inode, iattr); if (error) goto out; @@ -236,9 +240,12 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) if (error) goto out; - de->uid = inode->i_uid; - de->gid = inode->i_gid; - de->mode = inode->i_mode; + if (iattr->ia_valid & ATTR_UID) + de->uid = inode->i_uid; + if (iattr->ia_valid & ATTR_GID) + de->gid = inode->i_gid; + if (iattr->ia_valid & ATTR_MODE) + de->mode = inode->i_mode; out: return error; } @@ -369,29 +376,61 @@ static struct dentry_operations proc_dentry_operations = .d_delete = proc_delete_dentry, }; +static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, + const char *name, int namelen) +{ + struct proc_dir_entry *de; + + for (de = dir->subdir; de ; de = de->next) { + if (de->namelen != namelen) + continue; + if (memcmp(de->name, name, namelen)) + continue; + break; + } + return de; +} + /* * Don't create negative dentries here, return -ENOENT by hand * instead. 
*/ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct proc_dir_entry *de, + struct proc_dir_entry *lde, + struct inode *dir, struct dentry *dentry) { struct inode *inode = NULL; int error = -ENOENT; lock_kernel(); spin_lock(&proc_subdir_lock); - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len); + if (lde != NULL) + lde = __proc_lookup(lde, dentry->d_name.name, + dentry->d_name.len); + + if (de == NULL) + de = lde; + + if (de != NULL) { + /* + * de lde meaning inode(g,l) + * ------------------------------------ + * NULL NULL -ENOENT * + * X NULL global X NULL + * NULL X local X X + * X Y both X Y + */ + { unsigned int ino; ino = de->low_ino; de_get(de); + if (lde != NULL) + de_get(lde); spin_unlock(&proc_subdir_lock); error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); + inode = proc_get_inode(dir->i_sb, ino, de, lde); goto out_unlock; } } @@ -406,13 +445,15 @@ out_unlock: } if (de) de_put(de); + if (lde) + de_put(lde); return ERR_PTR(error); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry); } /* @@ -424,13 +465,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir) +int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde, + struct file *filp, void *dirent, filldir_t filldir) { unsigned int ino; int i; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + struct proc_dir_entry *ode = de, *fde = NULL; lock_kernel(); @@ -453,25 +495,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, /* fall through */ default: spin_lock(&proc_subdir_lock); - de = de->subdir; i -= 2; - for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } - if (!i) - break; - de = de->next; - i--; - } - - do { +repeat: + de = de->subdir; + while (de != NULL) { struct proc_dir_entry *next; - /* filldir passes info to user space */ de_get(de); + if (i-- > 0 || (fde != NULL && + __proc_lookup(fde, + de->name, de->namelen))) + goto skip; + + /* filldir passes info to user space */ spin_unlock(&proc_subdir_lock); if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) { @@ -480,10 +516,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, } spin_lock(&proc_subdir_lock); filp->f_pos++; +skip: next = de->next; de_put(de); de = next; - } while (de); + } + + if (fde == NULL && lde != NULL && lde != ode) { + de = lde; + fde = ode; + goto repeat; + } spin_unlock(&proc_subdir_lock); } ret = 1; @@ -495,7 +538,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct inode *inode = filp->f_path.dentry->d_inode; - return proc_readdir_de(PDE(inode), filp, dirent, filldir); + return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, filldir); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8bb03f0..f680759 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -449,7 +449,7 @@ static const struct file_operations 
proc_reg_file_ops_no_compat = { #endif struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, - struct proc_dir_entry *de) + struct proc_dir_entry *de, struct proc_dir_entry *lde) { struct inode * inode; @@ -463,6 +463,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; +#ifdef CONFIG_VE + PROC_I(inode)->lpde = lde; +#endif if (de->mode) { inode->i_mode = de->mode; @@ -509,9 +512,11 @@ int proc_fill_super(struct super_block *s) s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; - - de_get(&proc_root); - root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); + + de_get(get_exec_env()->proc_root); + de_get(&glob_proc_root); + root_inode = proc_get_inode(s, PROC_ROOT_INO, + &glob_proc_root, get_exec_env()->proc_root); if (!root_inode) goto out_no_root; root_inode->i_uid = 0; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4422023..24ffc99 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -12,6 +12,12 @@ #include extern struct proc_dir_entry proc_root; +#ifdef CONFIG_VE +extern struct proc_dir_entry glob_proc_root; +#else +#define glob_proc_root proc_root +#endif + #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); #else @@ -63,7 +69,6 @@ extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; -extern const struct file_operations proc_kmsg_operations; extern const struct inode_operations proc_net_inode_operations; void free_proc_entry(struct proc_dir_entry *de); @@ -85,10 +90,11 @@ static inline int proc_fd(struct inode *inode) return PROC_I(inode)->fd; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, +struct dentry *proc_lookup_de(struct proc_dir_entry *de, + struct proc_dir_entry *lpde, struct inode *ino, struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde, + struct file *filp, void *dirent, filldir_t filldir); struct pde_opener { struct inode *inode; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 9fd5df3..d1cbe06 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -42,7 +44,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, static unsigned int kmsg_poll(struct file *file, poll_table *wait) { - poll_wait(file, &log_wait, wait); + poll_wait(file, &ve_log_wait, wait); if (do_syslog(9, NULL, 0)) return POLLIN | POLLRDNORM; return 0; @@ -55,3 +57,4 @@ const struct file_operations proc_kmsg_operations = { .open = kmsg_open, .release = kmsg_release, }; +EXPORT_SYMBOL_GPL(proc_kmsg_operations); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 29e20c6..89bfb76 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -87,19 +89,39 @@ static int loadavg_read_proc(char *page, char **start, off_t off, int a, b, c; int len; unsigned long seq; + long running, threads; + struct ve_struct *ve; + ve = get_exec_env(); do { seq = read_seqbegin(&xtime_lock); - a = 
avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); + if (ve_is_super(ve)) { + a = avenrun[0] + (FIXED_1/200); + b = avenrun[1] + (FIXED_1/200); + c = avenrun[2] + (FIXED_1/200); +#ifdef CONFIG_VE + } else { + a = ve->avenrun[0] + (FIXED_1/200); + b = ve->avenrun[1] + (FIXED_1/200); + c = ve->avenrun[2] + (FIXED_1/200); +#endif + } } while (read_seqretry(&xtime_lock, seq)); + if (ve_is_super(ve)) { + running = nr_running(); + threads = nr_threads; +#ifdef CONFIG_VE + } else { + running = nr_running_ve(ve); + threads = atomic_read(&ve->pcounter); +#endif + } - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%ld %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, + running, threads, task_active_pid_ns(current)->last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -114,6 +136,13 @@ static int uptime_read_proc(char *page, char **start, off_t off, do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + set_normalized_timespec(&uptime, + uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, + uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + } +#endif cputime_to_timespec(idletime, &idle); len = sprintf(page,"%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, @@ -132,29 +161,50 @@ int __attribute__((weak)) arch_report_meminfo(char *page) static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct sysinfo i; + struct meminfo mi; int len; - unsigned long committed; - unsigned long allowed; + unsigned long dummy; struct vmalloc_info vmi; - long cached; + + get_zone_counts(&mi.active, &mi.inactive, &dummy); /* * display in kilobytes. */ #define K(x) ((x) << (PAGE_SHIFT - 10)) - si_meminfo(&i); - si_swapinfo(&i); - committed = atomic_long_read(&vm_committed_space); - allowed = ((totalram_pages - hugetlb_total_pages()) + si_meminfo(&mi.si); + si_swapinfo(&mi.si); + mi.committed_space = atomic_read(&vm_committed_space); + mi.swapcache = total_swapcache_pages; + mi.allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; - cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; - if (cached < 0) - cached = 0; + mi.cache = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - mi.si.bufferram; + if (mi.cache < 0) + mi.cache = 0; get_vmalloc_info(&vmi); + mi.vmalloc_used = vmi.used >> PAGE_SHIFT; + mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; + mi.vmalloc_total = VMALLOC_TOTAL >> PAGE_SHIFT; + + mi.pi.nr_file_dirty = global_page_state(NR_FILE_DIRTY); + mi.pi.nr_writeback = global_page_state(NR_WRITEBACK); + mi.pi.nr_anon_pages = global_page_state(NR_ANON_PAGES); + mi.pi.nr_file_mapped = global_page_state(NR_FILE_MAPPED); + mi.pi.nr_slab_rec = global_page_state(NR_SLAB_RECLAIMABLE); + mi.pi.nr_slab_unrec = global_page_state(NR_SLAB_UNRECLAIMABLE); + mi.pi.nr_pagetable = global_page_state(NR_PAGETABLE); + mi.pi.nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); + mi.pi.nr_bounce = global_page_state(NR_BOUNCE); + mi.pi.nr_writeback_temp = global_page_state(NR_WRITEBACK_TEMP); + +#ifdef CONFIG_BEANCOUNTERS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * Tagged format, for easy grepping and expansion. 
@@ -194,41 +244,42 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n", - K(i.totalram), - K(i.freeram), - K(i.bufferram), - K(cached), - K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(mi.si.totalram), + K(mi.si.freeram), + K(mi.si.bufferram), + K(mi.cache), + K(mi.swapcache), + K(mi.active), + K(mi.inactive), #ifdef CONFIG_HIGHMEM - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), -#endif - K(i.totalswap), - K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_SLAB_RECLAIMABLE)), - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_PAGETABLE)), + K(mi.si.totalhigh), + K(mi.si.freehigh), + K(mi.si.totalram-mi.si.totalhigh), + K(mi.si.freeram-mi.si.freehigh), +#endif + K(mi.si.totalswap), + K(mi.si.freeswap), + K(mi.pi.nr_file_dirty), + K(mi.pi.nr_writeback), + K(mi.pi.nr_anon_pages), + K(mi.pi.nr_file_mapped), + K(mi.pi.nr_slab_rec + + mi.pi.nr_slab_unrec), + K(mi.pi.nr_slab_rec), + K(mi.pi.nr_slab_unrec), + K(mi.pi.nr_pagetable), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif - K(global_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(global_page_state(NR_WRITEBACK_TEMP)), - K(allowed), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 + K(mi.pi.nr_unstable_nfs), + K(mi.pi.nr_bounce), + K(mi.pi.nr_writeback_temp), + K(mi.allowed), + K(mi.committed_space), + K(mi.vmalloc_total), + K(mi.vmalloc_used), + K(mi.vmalloc_largest), + K(quicklist_total_size()) ); len += hugetlb_report_meminfo(page + len); @@ -500,25 +551,21 @@ static const struct file_operations proc_vmalloc_operations = { #define arch_irq_stat() 0 #endif -static int show_stat(struct seq_file *p, void *v) +static void show_stat_ve0(struct seq_file *p) { int i; - unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; - struct timespec boottime; unsigned int *per_irq_sum; per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); if (!per_irq_sum) - return -ENOMEM; + return; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; guest = cputime64_zero; - getboottime(&boottime); - jif = boottime.tv_sec; for_each_possible_cpu(i) { int j; @@ -580,9 +627,85 @@ static int show_stat(struct seq_file *p, void *v) for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", per_irq_sum[i]); + kfree(per_irq_sum); + seq_printf(p, "\nswap %lu %lu\n", + vm_events(PSWPIN), vm_events(PSWPOUT)); +} + +#ifdef CONFIG_VE +static void show_stat_ve(struct seq_file *p, struct ve_struct *ve) +{ + int i; + u64 user, nice, system; + cycles_t idle, iowait; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + + user = nice = system = idle = iowait = 0; + for_each_cpu_mask(i, ve_cpus) { + user += VE_CPU_STATS(ve, i)->user; + nice += VE_CPU_STATS(ve, i)->nice; + system += VE_CPU_STATS(ve, i)->system; + idle += ve_sched_get_idle_time(ve, i); + iowait += ve_sched_get_iowait_time(ve, i); + } + + seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long 
long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + + for_each_cpu_mask(i, ve_cpus) { + user = VE_CPU_STATS(ve, i)->user; + nice = VE_CPU_STATS(ve, i)->nice; + system = VE_CPU_STATS(ve, i)->system; + idle = ve_sched_get_idle_time(ve, i); + iowait = ve_sched_get_iowait_time(ve, i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + } + seq_printf(p, "intr 0\nswap 0 0\n"); +} +#endif + +int show_stat(struct seq_file *p, void *v) +{ + extern unsigned long total_forks; + unsigned long seq, jif; + struct ve_struct *env; + unsigned long __nr_running, __nr_iowait; + + do { + seq = read_seqbegin(&xtime_lock); + jif = - wall_to_monotonic.tv_sec; + if (wall_to_monotonic.tv_nsec) + --jif; + } while (read_seqretry(&xtime_lock, seq)); + + env = get_exec_env(); + if (ve_is_super(env)) { + show_stat_ve0(p); + __nr_running = nr_running(); + __nr_iowait = nr_iowait(); + } +#ifdef CONFIG_VE + else { + show_stat_ve(p, env); + __nr_running = nr_running_ve(env); + __nr_iowait = nr_iowait_ve(env); + } +#endif seq_printf(p, - "\nctxt %llu\n" + "ctxt %llu\n" "btime %lu\n" "processes %lu\n" "procs_running %lu\n" @@ -590,10 +713,9 @@ static int show_stat(struct seq_file *p, void *v) nr_context_switches(), (unsigned long)jif, total_forks, - nr_running(), - nr_iowait()); + __nr_running, + __nr_iowait); - kfree(per_irq_sum); return 0; } @@ -680,7 +802,8 @@ static int cmdline_read_proc(char *page, char **start, off_t off, { int len; - len = sprintf(page, "%s\n", saved_command_line); + len = sprintf(page, "%s\n", + ve_is_super(get_exec_env()) ? 
saved_command_line : "quiet"); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -711,11 +834,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { if (count) { - char c; + int i, cnt; + char c[32]; - if (get_user(c, buf)) + cnt = min(count, sizeof(c)); + if (copy_from_user(c, buf, cnt)) return -EFAULT; - __handle_sysrq(c, NULL, 0); + + + for (i = 0; i < cnt && c[i] != '\n'; i++) + __handle_sysrq(c[i], NULL, 0); } return count; } @@ -863,38 +991,39 @@ void __init proc_misc_init(void) static struct { char *name; int (*read_proc)(char*,char**,off_t,int,int*,void*); + struct proc_dir_entry *parent; } *p, simple_ones[] = { - {"loadavg", loadavg_read_proc}, - {"uptime", uptime_read_proc}, - {"meminfo", meminfo_read_proc}, - {"version", version_read_proc}, + {"loadavg", loadavg_read_proc, &glob_proc_root}, + {"uptime", uptime_read_proc, &glob_proc_root}, + {"meminfo", meminfo_read_proc, &glob_proc_root}, + {"version", version_read_proc, &glob_proc_root}, #ifdef CONFIG_PROC_HARDWARE {"hardware", hardware_read_proc}, #endif #ifdef CONFIG_STRAM_PROC {"stram", stram_read_proc}, #endif - {"filesystems", filesystems_read_proc}, - {"cmdline", cmdline_read_proc}, + {"filesystems", filesystems_read_proc, &glob_proc_root}, + {"cmdline", cmdline_read_proc, &glob_proc_root}, {"execdomains", execdomains_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) - create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL); + create_proc_read_entry(p->name, 0, p->parent, p->read_proc, NULL); - proc_symlink("mounts", NULL, "self/mounts"); + proc_symlink("mounts", &glob_proc_root, "self/mounts"); /* And now for trickier ones */ #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif - proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create("locks", 0, &glob_proc_root, &proc_locks_operations); proc_create("devices", 0, NULL, &proc_devinfo_operations); - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK proc_create("partitions", 0, NULL, &proc_partitions_operations); #endif - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, &glob_proc_root, &proc_stat_operations); proc_create("interrupts", 0, NULL, &proc_interrupts_operations); #ifdef CONFIG_SLABINFO proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); @@ -907,13 +1036,13 @@ void __init proc_misc_init(void) #endif proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #ifdef CONFIG_BLOCK proc_create("diskstats", 0, NULL, &proc_diskstats_operations); #endif #ifdef CONFIG_MODULES - proc_create("modules", 0, NULL, &proc_modules_operations); + proc_create("modules", 0, &glob_proc_root, &proc_modules_operations); #endif #ifdef CONFIG_SCHEDSTATS proc_create("schedstat", 0, NULL, &proc_schedstat_operations); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7bc296f..8cc47f6 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -127,7 +127,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = 
proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(net->proc_net, NULL, dir, dentry); put_net(net); } return de; @@ -165,7 +165,8 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, ret = -EINVAL; net = get_proc_task_net(filp->f_path.dentry->d_inode); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + ret = proc_readdir_de(net->proc_net, NULL, + filp, dirent, filldir); put_net(net); } return ret; @@ -234,7 +235,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = { int __init proc_net_init(void) { - proc_symlink("net", NULL, "self/net"); + proc_symlink("net", &glob_proc_root, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f9a8b89..a44cbaa 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -399,7 +399,7 @@ static struct proc_dir_entry *proc_sys_root; int proc_sys_init(void) { - proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root = proc_mkdir("sys", &glob_proc_root); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index d153946..d139eed 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_file *m, void *v) dev_t from = MKDEV(p->major, p->minor_start); dev_t to = from + p->num; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + goto out; + if (&p->tty_drivers == tty_drivers.next) { /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); @@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_file *m, void *v) } if (from != to) show_tty_range(m, p, from, to - from); +out: return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 9511753..bc93788 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -43,6 +43,9 @@ static int proc_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; +#ifdef CONFIG_VE + struct vfsmount *proc_mnt = fs_type->owner_env->proc_mnt; +#endif if (proc_mnt) { /* Seed the root directory with a pid so it doesn't need @@ -96,11 +99,12 @@ static void proc_kill_sb(struct super_block *sb) put_pid_ns(ns); } -static struct file_system_type proc_fs_type = { +struct file_system_type proc_fs_type = { .name = "proc", .get_sb = proc_get_sb, .kill_sb = proc_kill_sb, }; +EXPORT_SYMBOL(proc_fs_type); void __init proc_root_init(void) { @@ -110,6 +114,11 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; + +#ifdef CONFIG_VE + get_ve0()->proc_root = &proc_root; +#endif + proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); err = PTR_ERR(proc_mnt); if (IS_ERR(proc_mnt)) { @@ -117,16 +126,22 @@ void __init proc_root_init(void) return; } +#ifdef CONFIG_VE + get_ve0()->proc_mnt = proc_mnt; +#endif + proc_misc_init(); proc_net_init(); #ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); + proc_mkdir("sysvipc", &glob_proc_root); #endif - proc_mkdir("fs", NULL); + proc_mkdir("fs", &glob_proc_root); + proc_mkdir("fs", NULL); /* care about proc_mkdir("fs/xxx", NULL); */ + proc_mkdir("driver", NULL); - proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ + proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */ #if 
defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_mkdir("openprom", NULL); @@ -211,6 +226,22 @@ struct proc_dir_entry proc_root = { .parent = &proc_root, }; +#ifdef CONFIG_VE +struct proc_dir_entry glob_proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .name = "/proc", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &glob_proc_root, +}; + +EXPORT_SYMBOL(glob_proc_root); +#endif + int pid_ns_prepare_proc(struct pid_namespace *ns) { struct vfsmount *mnt; diff --git a/fs/quota.c b/fs/quota.c index 7f4386e..374b682 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Check validity of generic quotactl commands */ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) @@ -81,11 +82,11 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current->euid != id) || (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; @@ -132,10 +133,10 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current->euid != id) || (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; } @@ -177,6 +178,8 @@ static void quota_sync_sb(struct super_block *sb, int type) continue; if (!sb_has_quota_enabled(sb, cnt)) continue; + if (!sb_dqopt(sb)->files[cnt]) + continue; mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); @@ -213,7 +216,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_qcop->quota_sync) + if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) quota_sync_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); @@ -344,6 +347,7 @@ static inline struct super_block *quotactl_block(const char __user *special) struct block_device *bdev; struct super_block *sb; char *tmp = getname(special); + int error; if (IS_ERR(tmp)) return ERR_CAST(tmp); @@ -351,6 +355,13 @@ static inline struct super_block *quotactl_block(const char __user *special) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); + + error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL); + if (error) { + bdput(bdev); + return ERR_PTR(error); + } + sb = get_super(bdev); bdput(bdev); if (!sb) @@ -362,6 +373,215 @@ static inline struct super_block *quotactl_block(const char __user *special) #endif } +#ifdef CONFIG_QUOTA_COMPAT + +#define QC_QUOTAON 0x0100 /* enable quotas */ +#define QC_QUOTAOFF 0x0200 /* disable quotas */ +/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 now have other parameters */ +#define QC_SYNC 0x0600 /* sync disk copy of a filesystem's quotas */ +#define QC_SETQLIM 0x0700 /* set limits */ +/* GETSTATS at 0x0800 is now longer...
*/ +#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... */ +#define QC_SETINFO 0x0A00 /* set info about quotas */ +#define QC_SETGRACE 0x0B00 /* set inode and block grace */ +#define QC_SETFLAGS 0x0C00 /* set flags for quota */ +#define QC_GETQUOTA 0x0D00 /* get limits and usage */ +#define QC_SETQUOTA 0x0E00 /* set limits and usage */ +#define QC_SETUSE 0x0F00 /* set usage */ +/* 0x1000 used by old RSQUASH */ +#define QC_GETSTATS 0x1100 /* get collected stats */ + +struct compat_dqblk { + unsigned int dqb_ihardlimit; + unsigned int dqb_isoftlimit; + unsigned int dqb_curinodes; + unsigned int dqb_bhardlimit; + unsigned int dqb_bsoftlimit; + qsize_t dqb_curspace; + __kernel_time_t dqb_btime; + __kernel_time_t dqb_itime; +}; + +struct compat_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + unsigned int dqi_flags; + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +struct compat_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; + __u32 version; +}; + +asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); +static long compat_quotactl(unsigned int cmds, unsigned int type, + const char __user *special, qid_t id, + void __user *addr) +{ + struct super_block *sb; + long ret; + + sb = NULL; + switch (cmds) { + case QC_QUOTAON: + return sys_quotactl(QCMD(Q_QUOTAON, type), + special, id, addr); + + case QC_QUOTAOFF: + return sys_quotactl(QCMD(Q_QUOTAOFF, type), + special, id, addr); + + case QC_SYNC: + return sys_quotactl(QCMD(Q_SYNC, type), + special, id, addr); + + case QC_GETQUOTA: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + break; + cdq.dqb_ihardlimit = idq.dqb_ihardlimit; + cdq.dqb_isoftlimit = idq.dqb_isoftlimit; + cdq.dqb_curinodes = idq.dqb_curinodes; + cdq.dqb_bhardlimit = idq.dqb_bhardlimit; + cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; + cdq.dqb_curspace = idq.dqb_curspace; + cdq.dqb_btime = idq.dqb_btime; + cdq.dqb_itime = idq.dqb_itime; + ret = 0; + if (copy_to_user(addr, &cdq, sizeof(cdq))) + ret = -EFAULT; + break; + } + + case QC_SETQUOTA: + case QC_SETUSE: + case QC_SETQLIM: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cdq, addr, sizeof(cdq))) + break; + idq.dqb_ihardlimit = cdq.dqb_ihardlimit; + idq.dqb_isoftlimit = cdq.dqb_isoftlimit; + idq.dqb_curinodes = cdq.dqb_curinodes; + idq.dqb_bhardlimit = cdq.dqb_bhardlimit; + idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; + idq.dqb_curspace = cdq.dqb_curspace; + idq.dqb_valid = 0; + if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) + idq.dqb_valid |= QIF_LIMITS; + if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) + idq.dqb_valid |= QIF_USAGE; + ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); + break; + } + + case QC_GETINFO: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_info(sb, type, &iinf); + if (ret) + break; 
+ cinf.dqi_bgrace = iinf.dqi_bgrace; + cinf.dqi_igrace = iinf.dqi_igrace; + cinf.dqi_flags = 0; + if (iinf.dqi_flags & DQF_INFO_DIRTY) + cinf.dqi_flags |= 0x0010; + cinf.dqi_blocks = 0; + cinf.dqi_free_blk = 0; + cinf.dqi_free_entry = 0; + ret = 0; + if (copy_to_user(addr, &cinf, sizeof(cinf))) + ret = -EFAULT; + break; + } + + case QC_SETINFO: + case QC_SETGRACE: + case QC_SETFLAGS: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETINFO, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cinf, addr, sizeof(cinf))) + break; + iinf.dqi_bgrace = cinf.dqi_bgrace; + iinf.dqi_igrace = cinf.dqi_igrace; + iinf.dqi_flags = cinf.dqi_flags; + iinf.dqi_valid = 0; + if (cmds == QC_SETINFO || cmds == QC_SETGRACE) + iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; + if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) + iinf.dqi_valid |= IIF_FLAGS; + ret = sb->s_qcop->set_info(sb, type, &iinf); + break; + } + + case QC_GETSTATS: { + struct compat_dqstats stat; + + memset(&stat, 0, sizeof(stat)); + stat.version = 6*10000+5*100+0; + ret = 0; + if (copy_to_user(addr, &stat, sizeof(stat))) + ret = -EFAULT; + break; + } + + default: + ret = -ENOSYS; + break; + } + if (sb && !IS_ERR(sb)) + drop_super(sb); + return ret; +} + +#endif + /* * This is the system call interface. This communicates with * the user-level programs. Currently this only supports diskquota @@ -377,6 +597,11 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; +#ifdef CONFIG_QUOTA_COMPAT + if (cmds >= 0x0100 && cmds < 0x3000) + return compat_quotactl(cmds, type, special, id, addr); +#endif + if (cmds != Q_SYNC || special) { sb = quotactl_block(special); if (IS_ERR(sb)) diff --git a/fs/read_write.c b/fs/read_write.c index 9ba495d..be9f68e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -21,6 +21,8 @@ #include #include +#include + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -334,6 +336,29 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } +static inline void bc_acct_write(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, write); + ub_percpu_add(ub, wchar, bytes); + } +} + +static inline void bc_acct_read(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, read); + ub_percpu_add(ub, rchar, bytes); + } +} + + asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; @@ -346,6 +371,8 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) ret = vfs_read(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -363,6 +390,8 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co ret = vfs_write(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -384,6 +413,8 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, if (file->f_mode & FMODE_PREAD) ret = vfs_read(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -405,6 +436,8 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, if (file->f_mode & 
FMODE_PWRITE) ret = vfs_write(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -650,6 +683,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) ret = vfs_readv(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } if (ret > 0) @@ -671,6 +706,8 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) ret = vfs_writev(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } if (ret > 0) diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c1add28..3ca5049 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -859,6 +859,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + inode = dentry->d_inode; + DQUOT_INIT(inode); + /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. * The quota structure is possibly deleted only on last iput => outside @@ -883,8 +886,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; } - inode = dentry->d_inode; - reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -947,6 +948,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; inode = dentry->d_inode; + DQUOT_INIT(inode); /* in this transaction we can be doing at max two balancings and update * two stat datas, we change quotas of the owner of the directory and of @@ -1254,6 +1256,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; + if (new_dentry_inode) + DQUOT_INIT(new_dentry_inode); // make sure, that oldname still exists and points to an object we // are going to rename diff --git a/fs/select.c b/fs/select.c index da0e882..e0eb1cd 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,8 @@ #include +#include + struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; @@ -332,7 +334,8 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ? 
+ GFP_KERNEL_UBC : GFP_KERNEL); if (!bits) goto out_nofds; } @@ -678,7 +681,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; - walk = walk->next = kmalloc(size, GFP_KERNEL); + walk = walk->next = kmalloc(size, GFP_KERNEL_UBC); if (!walk) { err = -ENOMEM; goto out_fds; @@ -710,7 +713,7 @@ out_fds: return err; } -static long do_restart_poll(struct restart_block *restart_block) +long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0; int nfds = restart_block->arg1; @@ -726,6 +729,7 @@ static long do_restart_poll(struct restart_block *restart_block) } return ret; } +EXPORT_SYMBOL_GPL(do_restart_poll); asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, long timeout_msecs) diff --git a/fs/seq_file.c b/fs/seq_file.c index bd20f7f..007b419 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -32,7 +32,7 @@ int seq_open(struct file *file, const struct seq_operations *op) struct seq_file *p = file->private_data; if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc(sizeof(*p), GFP_KERNEL_UBC); if (!p) return -ENOMEM; file->private_data = p; @@ -87,7 +87,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->version = file->f_version; /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; } @@ -128,7 +128,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Fill; m->op->stop(m, p); kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; m->count = 0; @@ -200,7 +200,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) return -ENOMEM; } @@ -239,7 +239,7 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -385,6 +385,8 @@ int seq_path(struct seq_file *m, struct path *path, char *esc) if (m->count < m->size) { char *s = m->buf + m->count; char *p = d_path(path, s, m->size - m->count); + if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) + return 0; if (!IS_ERR(p)) { s = mangle_path(s, p, esc); if (s) { @@ -482,7 +484,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC); int res = -ENOMEM; if (op) { @@ -526,7 +528,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_UBC); if (private == NULL) goto out; diff --git a/fs/simfs.c b/fs/simfs.c new file mode 100644 index 0000000..366a3ed --- /dev/null +++ b/fs/simfs.c @@ -0,0 +1,332 @@ +/* + * fs/simfs.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb + +static struct super_operations sim_super_ops; + +static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct super_block *sb; + struct inode *inode; + + inode = dentry->d_inode; + if (!inode->i_op->getattr) { + generic_fillattr(inode, stat); + if (!stat->blksize) { + unsigned blocks; + + sb = inode->i_sb; + blocks = (stat->size + sb->s_blocksize-1) >> + sb->s_blocksize_bits; + stat->blocks = (sb->s_blocksize / 512) * blocks; + stat->blksize = sb->s_blocksize; + } + } else { + int err; + + err = inode->i_op->getattr(mnt, dentry, stat); + if (err) + return err; + } + + sb = mnt->mnt_sb; + if (sb->s_op == &sim_super_ops) + stat->dev = sb->s_dev; + return 0; +} + +static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct dq_stat qstat; + struct virt_info_quota q; + long free_file, adj_file; + s64 blk, free_blk, adj_blk; + int bsize_bits; + + q.super = sb; + q.qstat = &qstat; + err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); + if (err != NOTIFY_OK) + return; + + bsize_bits = ffs(buf->f_bsize) - 1; + + if (qstat.bsoftlimit > qstat.bcurrent) + free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; + else + free_blk = 0; + /* + * In the regular case, we always set buf->f_bfree and buf->f_blocks to + * the values reported by quota. In case of real disk space shortage, + * we adjust the values. We want this adjustment to look as if the + * total disk space were reduced, not as if the usage were increased. + * -- SAW + */ + adj_blk = 0; + if (buf->f_bfree < free_blk) + adj_blk = free_blk - buf->f_bfree; + buf->f_bfree = free_blk - adj_blk; + + if (free_blk < buf->f_bavail) + buf->f_bavail = free_blk; + + blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; + buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; + + free_file = qstat.isoftlimit - qstat.icurrent; + if (free_file < 0) + free_file = 0; + if (buf->f_type == REISERFS_SUPER_MAGIC) + /* + * reiserfs doesn't initialize f_ffree and f_files values of + * kstatfs because it doesn't have an inode limit. 
+ */ + buf->f_ffree = free_file; + adj_file = 0; + if (buf->f_ffree < free_file) + adj_file = free_file - buf->f_ffree; + buf->f_ffree = free_file - adj_file; + buf->f_files = qstat.isoftlimit - adj_file; +} + +static int sim_statfs(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct super_block *lsb; + struct kstatfs statbuf; + + err = 0; + if (sb->s_op != &sim_super_ops) + return 0; + + memset(&statbuf, 0, sizeof(statbuf)); + lsb = SIMFS_GET_LOWER_FS_SB(sb); + + err = -ENOSYS; + if (lsb && lsb->s_op && lsb->s_op->statfs) + err = lsb->s_op->statfs(lsb->s_root, &statbuf); + if (err) + return err; + + quota_get_stat(sb, &statbuf); + + buf->f_files = statbuf.f_files; + buf->f_ffree = statbuf.f_ffree; + buf->f_blocks = statbuf.f_blocks; + buf->f_bfree = statbuf.f_bfree; + buf->f_bavail = statbuf.f_bavail; + return 0; +} + +static int sim_systemcall(struct vnotifier_block *me, unsigned long n, + void *d, int old_ret) +{ + int err; + + switch (n) { + case VIRTINFO_FAUDIT_STAT: { + struct faudit_stat_arg *arg; + + arg = (struct faudit_stat_arg *)d; + err = sim_getattr(arg->mnt, arg->dentry, arg->stat); + arg->err = err; + } + break; + case VIRTINFO_FAUDIT_STATFS: { + struct faudit_statfs_arg *arg; + + arg = (struct faudit_statfs_arg *)d; + err = sim_statfs(arg->sb, arg->stat); + arg->err = err; + } + break; + default: + return old_ret; + } + return (err ? NOTIFY_BAD : NOTIFY_OK); +} + +static struct inode *sim_quota_root(struct super_block *sb) +{ + return sb->s_root->d_inode; +} + +/* + * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() + * does lookup_bdev() and get_super() which are comparing sb->s_bdev. + * so this is a MUST if we want unmodified sys_quotactl + * to work correctly on /dev/simfs inside VE + */ +static int sim_init_blkdev(struct super_block *sb) +{ + static struct hd_struct fake_hd; + struct block_device *blkdev; + + blkdev = bdget(sb->s_dev); + if (blkdev == NULL) + return -ENOMEM; + + blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */ + sb->s_bdev = blkdev; + + return 0; +} + +static void sim_free_blkdev(struct super_block *sb) +{ + /* set bd_part back to NULL */ + sb->s_bdev->bd_part = NULL; + bdput(sb->s_bdev); +} + +static void sim_quota_init(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); +} + +static void sim_quota_free(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); +} + +static struct super_operations sim_super_ops = { + .get_quota_root = sim_quota_root, +}; + +static int sim_fill_super(struct super_block *s, void *data) +{ + int err; + struct nameidata *nd; + + err = set_anon_super(s, NULL); + if (err) + goto out; + + err = 0; + nd = (struct nameidata *)data; + s->s_fs_info = mntget(nd->path.mnt); + s->s_root = dget(nd->path.dentry); + s->s_op = &sim_super_ops; +out: + return err; +} + +static int sim_get_sb(struct file_system_type *type, int flags, + const char *dev_name, void *opt, struct vfsmount *mnt) +{ + int err; + struct nameidata nd; + struct super_block *sb; + + err = -EINVAL; + if (opt == NULL) + goto out; + + err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (err) + goto out; + + sb = sget(type, NULL, sim_fill_super, &nd); + err = PTR_ERR(sb); + if (IS_ERR(sb)) + goto out_path; + + err = sim_init_blkdev(sb); + if (err) + goto out_killsb; + + sim_quota_init(sb); + + path_put(&nd.path); + return 
simple_set_mnt(mnt, sb); + +out_killsb: + up_write(&sb->s_umount); + deactivate_super(sb); +out_path: + path_put(&nd.path); +out: + return err; +} + +static void sim_kill_sb(struct super_block *sb) +{ + dput(sb->s_root); + sb->s_root = NULL; + mntput((struct vfsmount *)(sb->s_fs_info)); + + sim_quota_free(sb); + sim_free_blkdev(sb); + + kill_anon_super(sb); +} + +static struct file_system_type sim_fs_type = { + .owner = THIS_MODULE, + .name = "simfs", + .get_sb = sim_get_sb, + .kill_sb = sim_kill_sb, + .fs_flags = FS_MANGLE_PROC, +}; + +static struct vnotifier_block sim_syscalls = { + .notifier_call = sim_systemcall, +}; + +static int __init init_simfs(void) +{ + int err; + + err = register_filesystem(&sim_fs_type); + if (err) + return err; + + virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); + return 0; +} + +static void __exit exit_simfs(void) +{ + virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); + unregister_filesystem(&sim_fs_type); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); +MODULE_LICENSE("GPL v2"); + +module_init(init_simfs); +module_exit(exit_simfs); diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c index e37fe4d..1992fc0 100644 --- a/fs/smbfs/sock.c +++ b/fs/smbfs/sock.c @@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *server) VERBOSE("closing socket %p\n", sock); sock->sk->sk_data_ready = server->data_ready; + sock->sk->sk_user_data = NULL; server->sock_file = NULL; fput(file); } diff --git a/fs/stat.c b/fs/stat.c index 7c46fbe..684baed 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; int retval; + struct faudit_stat_arg arg; retval = security_inode_getattr(mnt, dentry); if (retval) return retval; + arg.mnt = mnt; + arg.dentry = dentry; + arg.stat = stat; + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) + != NOTIFY_DONE) + return arg.err; + if (inode->i_op->getattr) return inode->i_op->getattr(mnt, dentry, stat); diff --git a/fs/super.c b/fs/super.c index e931ae9..60a7209 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,12 +38,15 @@ #include #include #include +#include #include #include "internal.h" LIST_HEAD(super_blocks); +EXPORT_SYMBOL_GPL(super_blocks); DEFINE_SPINLOCK(sb_lock); +EXPORT_SYMBOL_GPL(sb_lock); /** * alloc_super - create new superblock @@ -73,13 +76,15 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); + lockdep_set_class(&s->s_umount, + &type->proto->s_umount_key); /* * The locking rules for s_lock are up to the * filesystem. For example ext3fs has different * lock ordering than usbfs: */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); + lockdep_set_class(&s->s_lock, + &type->proto->s_lock_key); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); @@ -304,7 +309,7 @@ void generic_shutdown_super(struct super_block *sb) sop->put_super(sb); /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (invalidate_inodes_check(sb, 1)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. 
Have a nice day...\n", sb->s_id); @@ -533,17 +538,26 @@ rescan: spin_unlock(&sb_lock); return NULL; } +EXPORT_SYMBOL(user_get_super); asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) { + dev_t kdev; struct super_block *s; struct ustat tmp; struct kstatfs sbuf; - int err = -EINVAL; + int err; + + kdev = new_decode_dev(dev); + err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); + if (err) + goto out; + + err = -EINVAL; + s = user_get_super(kdev); + if (s == NULL) + goto out; - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) @@ -685,6 +699,13 @@ void emergency_remount(void) static struct idr unnamed_dev_idr; static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +/* for compatibility with coreutils still unaware of new minor sizes */ +int unnamed_dev_majors[] = { + 0, 144, 145, 146, 242, 243, 244, 245, + 246, 247, 248, 249, 250, 251, 252, 253 +}; +EXPORT_SYMBOL(unnamed_dev_majors); + int set_anon_super(struct super_block *s, void *data) { int dev; @@ -702,13 +723,13 @@ int set_anon_super(struct super_block *s, void *data) else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, dev); spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_dev = make_unnamed_dev(dev); return 0; } @@ -716,8 +737,9 @@ EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { - int slot = MINOR(sb->s_dev); + int slot; + slot = unnamed_dev_idx(sb->s_dev); generic_shutdown_super(sb); spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, slot); diff --git a/fs/sync.c b/fs/sync.c index 2967562..9b03c39 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -14,6 +14,8 @@ #include #include +#include + #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -38,7 +40,14 @@ static void do_sync(unsigned long wait) asmlinkage long sys_sync(void) { + struct user_beancounter *ub; + + ub = get_exec_ub(); + ub_percpu_inc(ub, sync); + do_sync(1); + + ub_percpu_inc(ub, sync_done); return 0; } @@ -80,6 +89,7 @@ long do_fsync(struct file *file, int datasync) int ret; int err; struct address_space *mapping = file->f_mapping; + struct user_beancounter *ub; if (!file->f_op || !file->f_op->fsync) { /* Why? 
We can still call filemap_fdatawrite */ @@ -87,6 +97,12 @@ long do_fsync(struct file *file, int datasync) goto out; } + ub = get_exec_ub(); + if (datasync) + ub_percpu_inc(ub, fdsync); + else + ub_percpu_inc(ub, fsync); + ret = filemap_fdatawrite(mapping); /* @@ -101,6 +117,11 @@ long do_fsync(struct file *file, int datasync) err = filemap_fdatawait(mapping); if (!ret) ret = err; + + if (datasync) + ub_percpu_inc(ub, fdsync_done); + else + ub_percpu_inc(ub, fsync_done); out: return ret; } @@ -252,12 +273,16 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, loff_t endbyte, unsigned int flags) { int ret; + struct user_beancounter *ub; if (!mapping) { ret = -EINVAL; - goto out; + goto out_noacct; } + ub = get_exec_ub(); + ub_percpu_inc(ub, frsync); + ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = wait_on_page_writeback_range(mapping, @@ -280,6 +305,8 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, endbyte >> PAGE_CACHE_SHIFT); } out: + ub_percpu_inc(ub, frsync_done); +out_noacct: return ret; } EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 006fc64..9aec999 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -177,6 +177,9 @@ static int open(struct inode * inode, struct file * file) struct bin_buffer *bb = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + /* binary file operations requires both @sd and its parent */ if (!sysfs_get_active_two(attr_sd)) return -ENODEV; @@ -238,6 +241,9 @@ const struct file_operations bin_fops = { int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); @@ -252,6 +258,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aedaeba..5c15f5d 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -508,6 +508,9 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd) struct inode *inode; struct dentry *dentry; + if (!ve_sysfs_alowed()) + return; + inode = ilookup(sysfs_sb, sd->s_ino); if (!inode) return; @@ -679,12 +682,15 @@ int sysfs_create_dir(struct kobject * kobj) struct sysfs_dirent *parent_sd, *sd; int error = 0; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj); if (kobj->parent) parent_sd = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); if (!error) @@ -785,6 +791,9 @@ void sysfs_remove_dir(struct kobject * kobj) { struct sysfs_dirent *sd = kobj->sd; + if (!ve_sysfs_alowed()) + return; + spin_lock(&sysfs_assoc_lock); kobj->sd = NULL; spin_unlock(&sysfs_assoc_lock); @@ -800,6 +809,9 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) const char *dup_name = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + mutex_lock(&sysfs_rename_mutex); error = 0; @@ -868,7 +880,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = new_parent_kobj->sd ? 
new_parent_kobj->sd : ve_sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c9e4e50..6389078 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -516,6 +516,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); @@ -614,6 +616,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->name); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index fe61194..628afe3 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -62,6 +62,8 @@ static int internal_create_group(struct kobject *kobj, int update, struct sysfs_dirent *sd; int error; + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || (!update && !kobj->sd)); /* Updates may happen before the object has been instantiated */ @@ -131,6 +133,9 @@ void sysfs_remove_group(struct kobject * kobj, struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; + if (!ve_sysfs_alowed()) + return; + if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index eb53c63..a09bfa5 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -20,8 +20,6 @@ #include #include "sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 14f0023..974bf82 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -22,8 +23,11 @@ /* Random magic number */ #define SYSFS_MAGIC 0x62656572 -static struct vfsmount *sysfs_mount; +#ifndef CONFIG_VE +struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; +#endif + struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -39,6 +43,13 @@ struct sysfs_dirent sysfs_root = { .s_ino = 1, }; +static void init_ve0_sysfs_root(void) +{ +#ifdef CONFIG_VE + get_ve0()->_sysfs_root = &sysfs_root; +#endif +} + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; @@ -52,7 +63,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sysfs_sb = sb; /* get root inode, initialize and unlock it */ - inode = sysfs_get_inode(&sysfs_root); + inode = sysfs_get_inode(ve_sysfs_root); if (!inode) { pr_debug("sysfs: could not get root inode\n"); return -ENOMEM; @@ -65,7 +76,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) iput(inode); return -ENOMEM; } - root->d_fsdata = &sysfs_root; + root->d_fsdata = ve_sysfs_root; sb->s_root = root; return 0; } @@ -76,16 +87,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type, return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } -static struct file_system_type sysfs_fs_type = { +struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(sysfs_fs_type); + int __init sysfs_init(void) { int err = -ENOMEM; + init_ve0_sysfs_root(); sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), 0, 0, NULL); diff --git 
a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index a3ba217..12ce2f5 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,10 +28,13 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_addrm_cxt acxt; int error; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!name); if (!kobj) - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; else parent_sd = kobj->sd; @@ -114,8 +117,11 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) { struct sysfs_dirent *parent_sd = NULL; + if(!ve_sysfs_alowed()) + return; + if (!kobj) - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; else parent_sd = kobj->sd; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a5db496..ff709ab 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,67 +8,17 @@ * This file is released under the GPLv2. */ -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - struct attribute *attr; - struct sysfs_open_dirent *open; -}; - -struct sysfs_elem_bin_attr { - struct bin_attribute *bin_attr; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; - struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; - const char *s_name; - - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - struct sysfs_elem_bin_attr s_bin_attr; - }; - - unsigned int s_flags; - ino_t s_ino; - umode_t s_mode; - struct iattr *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) - -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0200 +#ifndef CONFIG_VE +extern struct vfsmount *sysfs_mount; +extern struct super_block *sysfs_sb; +#define ve_sysfs_alowed() 1 +#else +#include +#include +#define sysfs_mount (get_exec_env()->sysfs_mnt) +#define sysfs_sb (get_exec_env()->sysfs_sb) +#define ve_sysfs_alowed() (sysfs_sb != NULL) +#endif static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { @@ -88,8 +38,12 @@ struct sysfs_addrm_cxt { /* * mount.c */ +#ifdef CONFIG_VE +#define ve_sysfs_root (get_exec_env()->_sysfs_root) +#else extern struct sysfs_dirent sysfs_root; -extern struct super_block *sysfs_sb; +#define ve_sysfs_root (&sysfs_root) +#endif extern struct kmem_cache *sysfs_dir_cachep; /* diff --git a/fs/vzdq_file.c b/fs/vzdq_file.c new file mode 100644 index 0000000..4d814d9 --- /dev/null +++ b/fs/vzdq_file.c @@ -0,0 +1,923 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo quota files as proc entry implementation. + * It is required for std quota tools to work correctly as they are expecting + * aquota.user and aquota.group files. 
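Editor's note (not part of the patch): the read path introduced in this file serves the emulated aquota.* contents in fixed 1 KB blocks, where block 0 carries the quota header, odd block numbers are index blocks (ISINDBLOCK) and even blocks hold dquot records. The sketch below, with a hypothetical helper name, only restates how read_proc_quotafile() further down splits a byte offset into a block number plus an in-block offset.

#include <linux/types.h>

#define DQBLOCK_SIZE 1024	/* same block size the patch defines below */

/* hypothetical helper, for illustration only */
static inline void dq_split_offset(off_t off, off_t *blk_num, off_t *blk_off)
{
	*blk_num = off / DQBLOCK_SIZE;	/* which 1 KB block of the emulated file */
	*blk_off = off % DQBLOCK_SIZE;	/* byte offset inside that block */
}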
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * File read operation + * + * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, + * perhaps) abuse vz_quota_sem. + * Taking a global semaphore for lengthy and user-controlled operations inside + * VPSs is not a good idea in general. + * In this case, the reasons for taking this semaphore are completely unclear, + * especially taking into account that the only function that has comments + * about the necessity to be called under this semaphore + * (create_proc_quotafile) is actually called OUTSIDE it. + * + * --------------------------------------------------------------------- */ + +#define DQBLOCK_SIZE 1024 +#define DQUOTBLKNUM 21U +#define DQTREE_DEPTH 4 +#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) +#define ISINDBLOCK(num) ((num)%2 != 0) +#define FIRST_DATABLK 2 /* first even number */ +#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) +#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) +#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ + & QUOTATREE_BMASK) + +#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) +#error xBITS and DQTREE_DEPTH does not correspond +#endif + +#define BLOCK_NOT_FOUND 1 + +/* data for quota file -- one per proc entry */ +struct quotatree_data { + struct list_head list; + struct vz_quota_master *qmblk; + int type; /* type of the tree */ +}; + +/* serialized by vz_quota_sem */ +static LIST_HEAD(qf_data_head); + +static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; +static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; +static const char aquota_user[] = "aquota.user"; +static const char aquota_group[] = "aquota.group"; + + +static inline loff_t get_depoff(int depth) +{ + loff_t res = 1; + while (depth) { + res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); + depth--; + } + return res; +} + +static inline loff_t get_blknum(loff_t num, int depth) +{ + loff_t res; + res = (num << 1) + get_depoff(depth); + return res; +} + +static int get_depth(loff_t num) +{ + int i; + for (i = 0; i < DQTREE_DEPTH; i++) { + if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 + || num < get_depoff(i + 1))) + return i; + } + return -1; +} + +static inline loff_t get_offset(loff_t num) +{ + loff_t res, tmp; + + tmp = get_depth(num); + if (tmp < 0) + return -1; + num -= get_depoff(tmp); + BUG_ON(num < 0); + res = num >> 1; + + return res; +} + +static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) +{ + /* return maximum available block num */ + return tree->levels[level].freenum; +} + +static inline loff_t get_block_num(struct quotatree_tree *tree) +{ + loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; + + quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); + max_quot = TREENUM_2_BLKNUM(quot_blk_num); + ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); + max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) + : get_blknum(ind_blk_num, 0); + + return (max_ind > max_quot) ? 
max_ind + 1 : max_quot + 1; +} + +/* Write quota file header */ +static int read_header(void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int type) +{ + struct v2_disk_dqheader *dqh; + struct v2_disk_dqinfo *dq_disk_info; + + dqh = buf; + dq_disk_info = buf + sizeof(struct v2_disk_dqheader); + + dqh->dqh_magic = vzquota_magics[type]; + dqh->dqh_version = vzquota_versions[type]; + + dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; + dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; + dq_disk_info->dqi_flags = 0; /* no flags */ + dq_disk_info->dqi_blocks = get_block_num(tree); + dq_disk_info->dqi_free_blk = 0; /* first block in the file */ + dq_disk_info->dqi_free_entry = FIRST_DATABLK; + + return 0; +} + +static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) +{ + int i, j, lev_num; + + lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; + for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { + struct quotatree_node *next, *parent; + + parent = p; + next = p; + for (j = lev_num; j >= 0; j--) { + if (!next->blocks[GETLEVINDX(i,j)]) { + buf[i] = 0; + goto bad_branch; + } + parent = next; + next = next->blocks[GETLEVINDX(i,j)]; + } + buf[i] = (depth == DQTREE_DEPTH - 1) ? + TREENUM_2_BLKNUM(parent->num) + : get_blknum(next->num, depth + 1); + + bad_branch: + ; + } + + return 0; +} + +/* + * Write index block to disk (or buffer) + * @buf has length 256*sizeof(u_int32_t) bytes + */ +static int read_index_block(int num, u_int32_t *buf, + struct quotatree_tree *tree) +{ + struct quotatree_node *p; + u_int32_t index; + loff_t off; + int depth, res; + + res = BLOCK_NOT_FOUND; + index = 0; + depth = get_depth(num); + off = get_offset(num); + if (depth < 0 || off < 0) + return -EINVAL; + + list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, + list) { + if (p->num >= off) + res = 0; + if (p->num != off) + continue; + get_block_child(depth, p, buf); + break; + } + + return res; +} + +static inline void convert_quot_format(struct v2_disk_dqblk *dq, + struct vz_quota_ugid *vzq) +{ + dq->dqb_id = vzq->qugid_id; + dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; + dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; + dq->dqb_curinodes = vzq->qugid_stat.icurrent; + dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; + dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; + dq->dqb_curspace = vzq->qugid_stat.bcurrent; + dq->dqb_btime = vzq->qugid_stat.btime; + dq->dqb_itime = vzq->qugid_stat.itime; +} + +static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) +{ + int res, i, entries = 0; + struct v2_disk_dqdbheader *dq_header; + struct quotatree_node *p; + struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); + + res = BLOCK_NOT_FOUND; + dq_header = buf; + memset(dq_header, 0, sizeof(*dq_header)); + + list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), + list) { + if (TREENUM_2_BLKNUM(p->num) >= num) + res = 0; + if (TREENUM_2_BLKNUM(p->num) != num) + continue; + + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (!p->blocks[i]) + continue; + convert_quot_format(blk + entries, + (struct vz_quota_ugid *)p->blocks[i]); + entries++; + res = 0; + } + break; + } + dq_header->dqdh_entries = entries; + + return res; +} + +static int read_block(int num, void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int magic) +{ + int res; + + memset(buf, 0, DQBLOCK_SIZE); + if (!num) + res = read_header(buf, tree, dq_ugid_info, magic); + else if (ISINDBLOCK(num)) + res = 
read_index_block(num, (u_int32_t*)buf, tree); + else + res = read_dquot(num, buf, tree); + + return res; +} + +/* + * FIXME: this function can handle quota files up to 2GB only. + */ +static int read_proc_quotafile(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + off_t blk_num, blk_off, buf_off; + char *tmp; + size_t buf_size; + struct quotatree_data *qtd; + struct quotatree_tree *tree; + struct dq_info *dqi; + int res; + + *start = NULL; + tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + qtd = data; + down(&vz_quota_sem); + down(&qtd->qmblk->dq_sem); + + res = 0; + tree = QUGID_TREE(qtd->qmblk, qtd->type); + if (!tree) { + *eof = 1; + goto out_dq; + } + + dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; + + buf_off = 0; + buf_size = count; + blk_num = off / DQBLOCK_SIZE; + blk_off = off % DQBLOCK_SIZE; + + while (buf_size > 0) { + off_t len; + + len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); + res = read_block(blk_num, tmp, tree, dqi, qtd->type); + if (res < 0) + goto out_err; + if (res == BLOCK_NOT_FOUND) { + *eof = 1; + break; + } + memcpy(page + buf_off, tmp + blk_off, len); + + blk_num++; + buf_size -= len; + blk_off = 0; + buf_off += len; + } + res = buf_off; + +out_err: + *start += count; +out_dq: + up(&qtd->qmblk->dq_sem); + up(&vz_quota_sem); + kfree(tmp); + + return res; +} + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID/aquota.* files + * + * FIXME: this code lacks serialization of read/readdir/lseek. + * However, this problem should be fixed after the mainstream issue of what + * appears to be non-atomic read and update of file position in sys_read. + * + * --------------------------------------------------------------------- */ + +static inline unsigned long vzdq_aquot_getino(dev_t dev) +{ + return 0xec000000UL + dev; +} + +static inline dev_t vzdq_aquot_getidev(struct inode *inode) +{ + return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; +} + +static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) +{ + PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; +} + +static ssize_t vzdq_aquotf_read(struct file *file, + char __user *buf, size_t size, loff_t *ppos) +{ + char *page; + size_t bufsize; + ssize_t l, l2, copied; + char *start; + struct inode *inode; + struct block_device *bdev; + struct super_block *sb; + struct quotatree_data data; + int eof, err; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (page == NULL) + goto out_err; + + err = -ENODEV; + inode = file->f_dentry->d_inode; + bdev = bdget(vzdq_aquot_getidev(inode)); + if (bdev == NULL) + goto out_err; + sb = get_super(bdev); + bdput(bdev); + if (sb == NULL) + goto out_err; + data.qmblk = vzquota_find_qmblk(sb); + data.type = PROC_I(inode)->fd - 1; + drop_super(sb); + if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) + goto out_err; + + copied = 0; + l = l2 = 0; + while (1) { + bufsize = min(size, (size_t)PAGE_SIZE); + if (bufsize <= 0) + break; + + l = read_proc_quotafile(page, &start, *ppos, bufsize, + &eof, &data); + if (l <= 0) + break; + + l2 = copy_to_user(buf, page, l); + copied += l - l2; + if (l2) + break; + + buf += l; + size -= l; + *ppos += (unsigned long)start; + l = l2 = 0; + } + + qmblk_put(data.qmblk); + free_page((unsigned long)page); + if (copied) + return copied; + else if (l2) /* last copy_to_user failed */ + return -EFAULT; + else /* read error or EOF */ + return l; + +out_err: + if (page != NULL) + free_page((unsigned 
long)page); + return err; +} + +static struct file_operations vzdq_aquotf_file_operations = { + .read = &vzdq_aquotf_read, +}; + +static struct inode_operations vzdq_aquotf_inode_operations = { +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID directory + * + * --------------------------------------------------------------------- */ + +static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) +{ + loff_t n; + int err; + + n = file->f_pos; + for (err = 0; !err; n++) { + /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */ + switch ((unsigned long)n) { + case 0: + err = (*filler)(data, ".", 1, n, + file->f_dentry->d_inode->i_ino, + DT_DIR); + break; + case 1: + err = (*filler)(data, "..", 2, n, + parent_ino(file->f_dentry), DT_DIR); + break; + case 2: + err = (*filler)(data, aquota_user, + sizeof(aquota_user)-1, n, + file->f_dentry->d_inode->i_ino + + USRQUOTA + 1, + DT_REG); + break; + case 3: + err = (*filler)(data, aquota_group, + sizeof(aquota_group)-1, n, + file->f_dentry->d_inode->i_ino + + GRPQUOTA + 1, + DT_REG); + break; + default: + goto out; + } + } +out: + file->f_pos = n; + return err; +} + +struct vzdq_aquotq_lookdata { + dev_t dev; + int type; + struct vz_quota_master *qmblk; +}; + +static int vzdq_aquotq_looktest(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + + d = data; + return inode->i_op == &vzdq_aquotf_inode_operations && + vzdq_aquot_getidev(inode) == d->dev && + PROC_I(inode)->fd == d->type + 1; +} + +static int vzdq_aquotq_lookset(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + struct super_block *sb; + struct quotatree_data qtd; + struct quotatree_tree *tree; + + d = data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 1; + inode->i_op = &vzdq_aquotf_inode_operations; + inode->i_fop = &vzdq_aquotf_file_operations; + PROC_I(inode)->fd = d->type + 1; + vzdq_aquot_setidev(inode, d->dev); + + /* Setting size */ + sb = user_get_super(d->dev); + if (sb == NULL) + return -ENODEV; + qtd.qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qtd.qmblk == NULL) + return -ESRCH; + if (qtd.qmblk == VZ_QUOTA_BAD) + return -EIO; + + qtd.type = PROC_I(inode)->fd - 1; + tree = QUGID_TREE(qtd.qmblk, qtd.type); + inode->i_size = get_block_num(tree) * 1024; + return 0; +} + +static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd) +{ + return 0; +} + +static struct dentry_operations vzdq_aquotq_dentry_operations = { + .d_revalidate = &vzdq_aquotq_revalidate, +}; + +static struct vz_quota_master *find_qmblk_by_dev(dev_t dev) +{ + struct super_block *sb; + struct vz_quota_master *qmblk; + + qmblk = NULL; + sb = user_get_super(dev); + if (sb != NULL) { + qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qmblk == VZ_QUOTA_BAD) + qmblk = NULL; + } + + return qmblk; +} + +static struct dentry *vzdq_aquotq_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + struct vzdq_aquotq_lookdata d; + int k; + + if (dentry->d_name.len == sizeof(aquota_user)-1) { + if (memcmp(dentry->d_name.name, aquota_user, + sizeof(aquota_user)-1)) + goto out; + k = USRQUOTA; + } else if (dentry->d_name.len == sizeof(aquota_group)-1) { + if (memcmp(dentry->d_name.name, aquota_group, + sizeof(aquota_group)-1)) + goto out; + 
k = GRPQUOTA; + } else + goto out; + d.dev = vzdq_aquot_getidev(dir); + d.type = k; + d.qmblk = find_qmblk_by_dev(d.dev); + if (d.qmblk == NULL) + goto out; + + inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, + vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); + if (inode == NULL) + goto out; + unlock_new_inode(inode); + dentry->d_op = &vzdq_aquotq_dentry_operations; + d_add(dentry, inode); + return NULL; + +out: + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotq_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotq_readdir, +}; + +static struct inode_operations vzdq_aquotq_inode_operations = { + .lookup = &vzdq_aquotq_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota directory + * + * --------------------------------------------------------------------- */ + +struct vzdq_aquot_de { + struct list_head list; + struct vfsmount *mnt; +}; + +static int vzdq_aquot_buildmntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vfsmount *rmnt, *mnt; + struct vzdq_aquot_de *p; + int err; + +#ifdef CONFIG_VE + rmnt = mntget(ve->root_path.mnt); +#else + read_lock(¤t->fs->lock); + rmnt = mntget(current->fs->rootmnt); + read_unlock(¤t->fs->lock); +#endif + mnt = rmnt; + spin_lock(&vfsmount_lock); + while (1) { + list_for_each_entry(p, head, list) { + if (p->mnt->mnt_sb == mnt->mnt_sb) + goto skip; + } + + err = -ENOMEM; + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (p == NULL) + goto out; + p->mnt = mntget(mnt); + list_add_tail(&p->list, head); + +skip: + err = 0; + if (list_empty(&mnt->mnt_mounts)) { + while (1) { + if (mnt == rmnt) + goto out; + if (mnt->mnt_child.next != + &mnt->mnt_parent->mnt_mounts) + break; + mnt = mnt->mnt_parent; + } + mnt = list_entry(mnt->mnt_child.next, + struct vfsmount, mnt_child); + } else + mnt = list_entry(mnt->mnt_mounts.next, + struct vfsmount, mnt_child); + } +out: + spin_unlock(&vfsmount_lock); + mntput(rmnt); + return err; +} + +static void vzdq_aquot_releasemntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vzdq_aquot_de *p; + + while (!list_empty(head)) { + p = list_entry(head->next, typeof(*p), list); + mntput(p->mnt); + list_del(&p->list); + kfree(p); + } +} + +static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) +{ + struct ve_struct *ve, *old_ve; + struct list_head mntlist; + struct vzdq_aquot_de *de; + struct super_block *sb; + struct vz_quota_master *qmblk; + loff_t i, n; + char buf[24]; + int l, err; + + i = 0; + n = file->f_pos; + ve = file->f_dentry->d_sb->s_type->owner_env; + old_ve = set_exec_env(ve); + + INIT_LIST_HEAD(&mntlist); +#ifdef CONFIG_VE + /* + * The only reason of disabling readdir for the host system is that + * this readdir can be slow and CPU consuming with large number of VPSs + * (or just mount points). 
+ */ + err = ve_is_super(ve); +#else + err = 0; +#endif + if (!err) { + err = vzdq_aquot_buildmntlist(ve, &mntlist); + if (err) + goto out_err; + } + + if (i >= n) { + if ((*filler)(data, ".", 1, i, + file->f_dentry->d_inode->i_ino, DT_DIR)) + goto out_fill; + } + i++; + + if (i >= n) { + if ((*filler)(data, "..", 2, i, + parent_ino(file->f_dentry), DT_DIR)) + goto out_fill; + } + i++; + + list_for_each_entry (de, &mntlist, list) { + sb = de->mnt->mnt_sb; + if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) + continue; + + qmblk = vzquota_find_qmblk(sb); + if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) + continue; + + qmblk_put(qmblk); + i++; + if (i <= n) + continue; + + l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); + if ((*filler)(data, buf, l, i - 1, + vzdq_aquot_getino(sb->s_dev), DT_DIR)) + break; + } + +out_fill: + err = 0; + file->f_pos = i; +out_err: + vzdq_aquot_releasemntlist(ve, &mntlist); + (void)set_exec_env(old_ve); + return err; +} + +static int vzdq_aquotd_looktest(struct inode *inode, void *data) +{ + return inode->i_op == &vzdq_aquotq_inode_operations && + vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; +} + +static int vzdq_aquotd_lookset(struct inode *inode, void *data) +{ + dev_t dev; + + dev = (dev_t)(unsigned long)data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(dev); + inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 2; + inode->i_op = &vzdq_aquotq_inode_operations; + inode->i_fop = &vzdq_aquotq_file_operations; + vzdq_aquot_setidev(inode, dev); + return 0; +} + +static struct dentry *vzdq_aquotd_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct ve_struct *ve, *old_ve; + const unsigned char *s; + int l; + dev_t dev; + struct inode *inode; + + ve = dir->i_sb->s_type->owner_env; + old_ve = set_exec_env(ve); +#ifdef CONFIG_VE + /* + * Lookup is much lighter than readdir, so it can be allowed for the + * host system. But it would be strange to be able to do lookup only + * without readdir... 
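Editor's note (not part of the patch): directory entries under /proc/vz/vzaquota are named with the eight-digit hex value printed from new_encode_dev(sb->s_dev) in vzdq_aquotd_readdir() above. The hypothetical helper below restates the name-to-device conversion that the lookup code performs next; it is illustrative only.

#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kdev_t.h>
#include <linux/errno.h>

/* hypothetical helper mirroring the parsing loop in vzdq_aquotd_lookup() */
static int vzdq_parse_dev_name(const unsigned char *s, int len, dev_t *out)
{
	u32 raw = 0;

	if (len <= 0)
		return -ENOENT;
	for (; len > 0; s++, len--) {
		if (!isxdigit(*s))
			return -ENOENT;
		if (raw & ~(~0U >> 4))		/* next shift would overflow */
			return -ENOENT;
		raw <<= 4;
		raw += isdigit(*s) ? *s - '0' : tolower(*s) - 'a' + 10;
	}
	*out = new_decode_dev(raw);	/* readdir printed new_encode_dev() */
	return 0;
}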
+ */ + if (ve_is_super(ve)) + goto out; +#endif + + dev = 0; + l = dentry->d_name.len; + if (l <= 0) + goto out; + for (s = dentry->d_name.name; l > 0; s++, l--) { + if (!isxdigit(*s)) + goto out; + if (dev & ~(~0UL >> 4)) + goto out; + dev <<= 4; + if (isdigit(*s)) + dev += *s - '0'; + else if (islower(*s)) + dev += *s - 'a' + 10; + else + dev += *s - 'A' + 10; + } + dev = new_decode_dev(dev); + + if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) + goto out; + + inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), + vzdq_aquotd_looktest, vzdq_aquotd_lookset, + (void *)(unsigned long)dev); + if (inode == NULL) + goto out; + unlock_new_inode(inode); + + d_add(dentry, inode); + (void)set_exec_env(old_ve); + return NULL; + +out: + (void)set_exec_env(old_ve); + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotd_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotd_readdir, +}; + +static struct inode_operations vzdq_aquotd_inode_operations = { + .lookup = &vzdq_aquotd_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * Initialization and deinitialization + * + * --------------------------------------------------------------------- */ +static int fake_data; +static struct ctl_table fake_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = ".fake", + .mode = 0600, + .proc_handler = proc_dointvec, + .data = &fake_data, + .maxlen = sizeof(int), + }, + { } +}; + +static struct ctl_path fake_path[] = { + { .ctl_name = CTL_FS, .procname = "fs", }, + { .ctl_name = FS_DQSTATS, .procname = "quota", }, + { } +}; + +/* + * FIXME: creation of proc entries here is unsafe with respect to module + * unloading. + */ +void vzaquota_init(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR, + glob_proc_vz_dir, &vzdq_aquotd_file_operations); + if (de != NULL) + de->proc_iops = &vzdq_aquotd_inode_operations; + else + printk("VZDQ: vz/vzaquota creation failed\n"); + + register_sysctl_glob_paths(fake_path, fake_table, 1); +} + +void vzaquota_fini(void) +{ + remove_proc_entry("vz/vzaquota", NULL); +} diff --git a/fs/vzdq_mgmt.c b/fs/vzdq_mgmt.c new file mode 100644 index 0000000..7dda882 --- /dev/null +++ b/fs/vzdq_mgmt.c @@ -0,0 +1,753 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Switching quota on. 
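Editor's note (not part of the patch): the sanity helpers that follow only enforce that each soft limit does not exceed its hard limit, and that a grace-period expiration time is not carried while the corresponding soft limit is unreached. A minimal illustration, using only struct dq_stat fields defined by the patch:

/* Illustration only: a limit set that vzquota_check_sane_limits() below
 * rejects with -EINVAL, because the block soft limit exceeds the hard limit. */
static struct dq_stat vzdq_bad_limits_example = {
	.bsoftlimit = 2048,
	.bhardlimit = 1024,	/* soft > hard => -EINVAL */
	.isoftlimit = 100,
	.ihardlimit = 200,
};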
+ * --------------------------------------------------------------------- */ + +/* + * check limits copied from user + */ +int vzquota_check_sane_limits(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* softlimit must be less then hardlimit */ + if (qstat->bsoftlimit > qstat->bhardlimit) + goto out; + + if (qstat->isoftlimit > qstat->ihardlimit) + goto out; + + err = 0; +out: + return err; +} + +/* + * check usage values copied from user + */ +int vzquota_check_sane_values(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* expiration time must not be set if softlimit was not exceeded */ + if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0) + goto out; + + if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0) + goto out; + + err = vzquota_check_sane_limits(qstat); +out: + return err; +} + +/* + * create new quota master block + * this function should: + * - copy limits and usage parameters from user buffer; + * - allock, initialize quota block and insert it to hash; + */ +static int vzquota_create(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + err = -EINVAL; + if (quota_id == 0) + goto out; + + if (vzquota_check_sane_values(&qstat.dq_stat)) + goto out; + err = 0; + qmblk = vzquota_alloc_master(quota_id, &qstat); + + if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ + err = PTR_ERR(qmblk); +out: + up(&vz_quota_sem); + + return err; +} + +/** + * vzquota_on - turn quota on + * + * This function should: + * - find and get refcnt of directory entry for quota root and corresponding + * mountpoint; + * - find corresponding quota block and mark it with given path; + * - check quota tree; + * - initialize quota for the tree root. + */ +static int vzquota_on(unsigned int quota_id, const char __user *quota_root, + char __user *buf) +{ + int err; + struct path path; + struct vz_quota_master *qmblk; + struct super_block *dqsb; + + dqsb = NULL; + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; + + err = user_path(quota_root, &path); + if (err) + goto out; + /* init path must be a directory */ + err = -ENOTDIR; + if (!S_ISDIR(path.dentry->d_inode->i_mode)) + goto out_path; + + qmblk->dq_root_path = path; + qmblk->dq_sb = path.dentry->d_inode->i_sb; + err = vzquota_get_super(qmblk->dq_sb); + if (err) + goto out_super; + + /* + * Serialization with quota initialization and operations is performed + * through generation check: generation is memorized before qmblk is + * found and compared under inode_qmblk_lock with assignment. + * + * Note that the dentry tree is shrunk only for high-level logical + * serialization, purely as a courtesy to the user: to have consistent + * quota statistics, files should be closed etc. on quota on. 
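Editor's note (not part of the patch): the control path continuing below drives a small per-quota-ID state machine: a block is VZDQ_STARTING after create, becomes VZDQ_WORKING on a successful quota on, VZDQ_STOPING after quota off, and may only be destroyed while not working. The helper below is hypothetical and merely summarizes the state checks made by vzquota_on(), vzquota_off() and vzquota_destroy().

/* hypothetical summary of the state checks enforced by the functions below */
static int vzdq_cmd_allowed(int dq_state, int cmd)
{
	switch (cmd) {
	case VZ_DQ_ON:			/* STARTING -> WORKING */
		return dq_state == VZDQ_STARTING;
	case VZ_DQ_OFF:			/* WORKING -> STOPING */
	case VZ_DQ_OFF_FORCED:
		return dq_state == VZDQ_WORKING;
	case VZ_DQ_DESTROY:		/* refused while still WORKING */
		return dq_state != VZDQ_WORKING;
	default:
		return 1;
	}
}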
+ */ + err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode, + qmblk, buf); + if (err) + goto out_init; + qmblk->dq_state = VZDQ_WORKING; + + up(&vz_quota_sem); + return 0; + +out_init: + dqsb = qmblk->dq_sb; +out_super: + /* clear for qmblk_put/quota_free_master */ + qmblk->dq_sb = NULL; + qmblk->dq_root_path.dentry = NULL; + qmblk->dq_root_path.mnt = NULL; +out_path: + path_put(&path); +out: + if (dqsb) + vzquota_put_super(dqsb); + up(&vz_quota_sem); + return err; +} + + +/* ---------------------------------------------------------------------- + * Switching quota off. + * --------------------------------------------------------------------- */ + +/* + * destroy quota block by ID + */ +static int vzquota_destroy(unsigned int quota_id) +{ + int err; + struct vz_quota_master *qmblk; + struct path root; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state == VZDQ_WORKING) + goto out; /* quota_off first */ + + list_del_init(&qmblk->dq_hash); + root = qmblk->dq_root_path; + qmblk->dq_root_path.dentry = NULL; + qmblk->dq_root_path.mnt = NULL; + + if (qmblk->dq_sb) + vzquota_put_super(qmblk->dq_sb); + up(&vz_quota_sem); + + qmblk_put(qmblk); + path_put(&root); + return 0; + +out: + up(&vz_quota_sem); + return err; +} + +/** + * vzquota_off - turn quota off + */ + +static int __vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk, + enum writeback_sync_modes sync_mode) +{ + struct writeback_control wbc; + LIST_HEAD(list); + struct vz_quota_ilink *qlnk; + struct inode *inode; + int err, ret; + + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = sync_mode; + + err = ret = 0; + while (!list_empty(lh)) { + if (need_resched()) { + inode_qmblk_unlock(qmblk->dq_sb); + schedule(); + inode_qmblk_lock(qmblk->dq_sb); + continue; + } + + qlnk = list_first_entry(lh, struct vz_quota_ilink, list); + list_move(&qlnk->list, &list); + + inode = igrab(QLNK_INODE(qlnk)); + if (!inode) + continue; + + inode_qmblk_unlock(qmblk->dq_sb); + + wbc.nr_to_write = LONG_MAX; + ret = sync_inode(inode, &wbc); + if (ret) + err = ret; + iput(inode); + + inode_qmblk_lock(qmblk->dq_sb); + } + + list_splice(&list, lh); + return err; +} + +static int vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk) +{ + (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); + return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); +} + +static int vzquota_sync_inodes(struct vz_quota_master *qmblk) +{ + int err; + LIST_HEAD(qlnk_list); + + list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); + err = vzquota_sync_list(&qlnk_list, qmblk); + if (!err && !list_empty(&qmblk->dq_ilink_list)) + err = -EBUSY; + list_splice(&qlnk_list, &qmblk->dq_ilink_list); + + return err; +} + +static int vzquota_off(unsigned int quota_id, char __user *buf, int force) +{ + int err, ret; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EALREADY; + if (qmblk->dq_state != VZDQ_WORKING) + goto out; + + inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ + ret = vzquota_sync_inodes(qmblk); + inode_qmblk_unlock(qmblk->dq_sb); + + err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force); + if (err) + goto out; + + err = ret; + /* vzquota_destroy will free resources */ + qmblk->dq_state = VZDQ_STOPING; +out: + up(&vz_quota_sem); + + return err; +} + + +/* 
---------------------------------------------------------------------- + * Other VZQUOTA ioctl's. + * --------------------------------------------------------------------- */ + +/* + * this function should: + * - set new limits/buffer under quota master block lock + * - if new softlimit less then usage, then set expiration time + * - no need to alloc ugid hash table - we'll do that on demand + */ +int vzquota_update_limit(struct dq_stat *_qstat, + struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + if (vzquota_check_sane_limits(qstat)) + goto out; + + err = 0; + + /* limits */ + _qstat->bsoftlimit = qstat->bsoftlimit; + _qstat->bhardlimit = qstat->bhardlimit; + /* + * If the soft limit is exceeded, administrator can override the moment + * when the grace period for limit exceeding ends. + * Specifying the moment may be useful if the soft limit is set to be + * lower than the current usage. In the latter case, if the grace + * period end isn't specified, the grace period will start from the + * moment of the first write operation. + * There is a race with the user level. Soft limit may be already + * exceeded before the limit change, and grace period end calculated by + * the kernel will be overriden. User level may check if the limit is + * already exceeded, but check and set calls are not atomic. + * This race isn't dangerous. Under normal cicrumstances, the + * difference between the grace period end calculated by the kernel and + * the user level should be not greater than as the difference between + * the moments of check and set calls, i.e. not bigger than the quota + * timer resolution - 1 sec. + */ + if (qstat->btime != (time_t)0 && + _qstat->bcurrent >= _qstat->bsoftlimit) + _qstat->btime = qstat->btime; + + _qstat->isoftlimit = qstat->isoftlimit; + _qstat->ihardlimit = qstat->ihardlimit; + if (qstat->itime != (time_t)0 && + _qstat->icurrent >= _qstat->isoftlimit) + _qstat->itime = qstat->itime; + +out: + return err; +} + +/* + * set new quota limits. + * this function should: + * copy new limits from user level + * - find quota block + * - set new limits and flags. + */ +static int vzquota_setlimit(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); /* for hash list protection */ + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + qmblk_data_write_lock(qmblk); + err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); + if (err == 0) + qmblk->dq_info = qstat.dq_info; + qmblk_data_write_unlock(qmblk); + +out: + up(&vz_quota_sem); + return err; +} + +/* + * get quota limits. 
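Editor's note (not part of the patch): vzquota_update_limit() above accepts a user-supplied grace-period end (btime/itime) only when the corresponding soft limit is already exceeded at the time of the call; otherwise the kernel keeps computing the grace period itself on the first over-soft-limit operation. A condensed restatement of that rule for the block resource, with a hypothetical helper name:

/* hypothetical condensation of the btime handling in vzquota_update_limit() */
static void vzdq_apply_block_grace(struct dq_stat *cur, const struct dq_stat *new)
{
	cur->bsoftlimit = new->bsoftlimit;
	cur->bhardlimit = new->bhardlimit;
	/* take the caller's expiration only if the soft limit is already hit */
	if (new->btime != (time_t)0 && cur->bcurrent >= cur->bsoftlimit)
		cur->btime = new->btime;
}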
+ * very simple - just return stat buffer to user + */ +static int vzquota_getstat(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + qmblk_data_read_lock(qmblk); + /* copy whole buffer under lock */ + memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); + memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); + qmblk_data_read_unlock(qmblk); + + if (!compat) + err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); + else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat); + dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info); + err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat)); +#endif + } + if (err) + err = -EFAULT; + +out: + up(&vz_quota_sem); + return err; +} + +/* + * This is a system call to turn per-VE disk quota on. + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat) +{ + int ret; + int force = 0; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_CREATE: + ret = vzquota_create(quota_id, qstat, compat); + break; + case VZ_DQ_DESTROY: + ret = vzquota_destroy(quota_id); + break; + case VZ_DQ_ON: + /* + * qstat is just a pointer to userspace buffer to + * store busy files path in case of vzquota_on fail + */ + ret = vzquota_on(quota_id, ve_root, (char *)qstat); + break; + case VZ_DQ_OFF_FORCED: + force = 1; + case VZ_DQ_OFF: + /* + * ve_root is just a pointer to userspace buffer to + * store busy files path in case of vzquota_off fail + */ + ret = vzquota_off(quota_id, (char *)ve_root, force); + break; + case VZ_DQ_SETLIMIT: + ret = vzquota_setlimit(quota_id, qstat, compat); + break; + case VZ_DQ_GETSTAT: + ret = vzquota_getstat(quota_id, qstat, compat); + break; + + default: + ret = -EINVAL; + goto out; + } + +out: + return ret; +} + + +/* ---------------------------------------------------------------------- + * Proc filesystem routines + * ---------------------------------------------------------------------*/ + +#if defined(CONFIG_PROC_FS) + +#define QUOTA_UINT_LEN 15 +#define QUOTA_TIME_LEN_FMT_UINT "%11u" +#define QUOTA_NUM_LEN_FMT_UINT "%15u" +#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" +#define QUOTA_TIME_LEN_FMT_STR "%11s" +#define QUOTA_NUM_LEN_FMT_STR "%15s" +#define QUOTA_PROC_MAX_LINE_LEN 2048 + +/* + * prints /proc/ve_dq header line + */ +static int print_proc_header(char * buffer) +{ + return sprintf(buffer, + "%-11s" + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + "\n", + "qid: path", + "usage", "softlimit", "hardlimit", "time", "expire"); +} + +/* + * prints proc master record id, dentry path + */ +static int print_proc_master_id(char * buffer, char * path_buf, + struct vz_quota_master * qp) +{ + char *path; + int over; + + path = NULL; + switch (qp->dq_state) { + case VZDQ_WORKING: + if (!path_buf) { + path = ""; + break; + } + path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + path = ""; + break; + } + /* do not print large path, truncate it */ + over = strlen(path) - + (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - + 
QUOTA_UINT_LEN); + if (over > 0) { + path += over - 3; + path[0] = path[1] = path[3] = '.'; + } + break; + case VZDQ_STARTING: + path = "-- started --"; + break; + case VZDQ_STOPING: + path = "-- stopped --"; + break; + } + + return sprintf(buffer, "%u: %s\n", qp->dq_id, path); +} + +/* + * prints struct vz_quota_stat data + */ +static int print_proc_stat(char * buffer, struct dq_stat *qs, + struct dq_info *qi) +{ + return sprintf(buffer, + "%11s" + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n" + "%11s" + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n", + "1k-blocks", + (unsigned long long)qs->bcurrent >> 10, + (unsigned long long)qs->bsoftlimit >> 10, + (unsigned long long)qs->bhardlimit >> 10, + (unsigned int)qs->btime, + (unsigned int)qi->bexpire, + "inodes", + qs->icurrent, + qs->isoftlimit, + qs->ihardlimit, + (unsigned int)qs->itime, + (unsigned int)qi->iexpire); +} + + +/* + * for /proc filesystem output + */ +static int vzquota_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len, i; + off_t printed = 0; + char *p = page; + struct vz_quota_master *qp; + struct vz_quota_ilink *ql2; + struct list_head *listp; + char *path_buf; + + path_buf = (char*)__get_free_page(GFP_KERNEL); + if (path_buf == NULL) + return -ENOMEM; + + len = print_proc_header(p); + printed += len; + if (off < printed) /* keep header in output */ { + *start = p + off; + p += len; + } + + down(&vz_quota_sem); + + /* traverse master hash table for all records */ + for (i = 0; i < vzquota_hash_size; i++) { + list_for_each(listp, &vzquota_hash_table[i]) { + qp = list_entry(listp, + struct vz_quota_master, dq_hash); + + /* Skip other VE's information if not root of VE0 */ + if ((!capable(CAP_SYS_ADMIN) || + !capable(CAP_SYS_RESOURCE))) { + ql2 = INODE_QLNK(current->fs->root.dentry->d_inode); + if (ql2 == NULL || qp != ql2->qmblk) + continue; + } + /* + * Now print the next record + */ + len = 0; + /* we print quotaid and path only in VE0 */ + if (capable(CAP_SYS_ADMIN)) + len += print_proc_master_id(p+len,path_buf, qp); + len += print_proc_stat(p+len, &qp->dq_stat, + &qp->dq_info); + printed += len; + /* skip unnecessary lines */ + if (printed <= off) + continue; + p += len; + /* provide start offset */ + if (*start == NULL) + *start = p + (off - printed); + /* have we printed all requested size? */ + if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || + (p - *start) >= count) + goto out; + } + } + + *eof = 1; /* checked all hash */ +out: + up(&vz_quota_sem); + + len = 0; + if (*start != NULL) { + len = (p - *start); + if (len > count) + len = count; + } + + if (path_buf) + free_page((unsigned long) path_buf); + + return len; +} + +/* + * Register procfs read callback + */ +int vzquota_proc_init(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL); + if (de == NULL) + return -EBUSY; + + de->read_proc = vzquota_read_proc; + de->data = NULL; + return 0; +} + +void vzquota_proc_release(void) +{ + /* Unregister procfs read callback */ + remove_proc_entry("vzquota", proc_vz_dir); +} + +#endif diff --git a/fs/vzdq_ops.c b/fs/vzdq_ops.c new file mode 100644 index 0000000..408bd28 --- /dev/null +++ b/fs/vzdq_ops.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
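Editor's note (not part of the patch): the allocation checks in this file follow one pattern: exceeding the hard limit denies the allocation, exceeding the soft limit starts a grace timer, and the allocation is denied once that timer expires. The hypothetical helper below is a stripped-down restatement of vzquota_check_space() further down; it omits the printk warnings, the ignore_hardlimit() escape and the "zero means unlimited" handling of the per-ugid variants.

/* hypothetical single-resource version of the checks implemented below */
static int vzdq_would_exceed(struct dq_stat *qs, struct dq_info *qi,
			     __u64 number, time_t now)
{
	if (qs->bcurrent + number > qs->bhardlimit)
		return NO_QUOTA;			/* over hard limit */
	if (qs->bcurrent + number > qs->bsoftlimit) {
		if (qs->btime == (time_t)0)
			qs->btime = now + qi->bexpire;	/* start grace period */
		else if (now >= qs->btime)
			return NO_QUOTA;		/* grace period expired */
	}
	return QUOTA_OK;
}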
+ */ + +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Quota superblock operations - helper functions. + * --------------------------------------------------------------------- */ + +static inline void vzquota_incr_inodes(struct dq_stat *dqstat, + unsigned long number) +{ + dqstat->icurrent += number; +} + +static inline void vzquota_incr_space(struct dq_stat *dqstat, + __u64 number) +{ + dqstat->bcurrent += number; +} + +static inline void vzquota_decr_inodes(struct dq_stat *dqstat, + unsigned long number) +{ + if (dqstat->icurrent > number) + dqstat->icurrent -= number; + else + dqstat->icurrent = 0; + if (dqstat->icurrent < dqstat->isoftlimit) + dqstat->itime = (time_t) 0; +} + +static inline void vzquota_decr_space(struct dq_stat *dqstat, + __u64 number) +{ + if (dqstat->bcurrent > number) + dqstat->bcurrent -= number; + else + dqstat->bcurrent = 0; + if (dqstat->bcurrent < dqstat->bsoftlimit) + dqstat->btime = (time_t) 0; +} + +/* + * better printk() message or use /proc/vzquotamsg interface + * similar to /proc/kmsg + */ +static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, + const char *fmt) +{ + if (dq_info->flags & flag) /* warning already printed for this + masterblock */ + return; + printk(fmt, dq_id); + dq_info->flags |= flag; +} + +/* + * ignore_hardlimit - + * + * Intended to allow superuser of VE0 to overwrite hardlimits. + * + * ignore_hardlimit() has a very bad feature: + * + * writepage() operation for writable mapping of a file with holes + * may trigger get_block() with wrong current and as a consequence, + * opens a possibility to overcommit hardlimits + */ +/* for the reason above, it is disabled now */ +static inline int ignore_hardlimit(struct dq_info *dqstat) +{ +#if 0 + return ve_is_super(get_exec_env()) && + capable(CAP_SYS_RESOURCE) && + (dqstat->options & VZ_QUOTA_OPT_RSQUASH); +#else + return 0; +#endif +} + +static int vzquota_check_inodes(struct dq_info *dq_info, + struct dq_stat *dqstat, + unsigned long number, int dq_id) +{ + if (number == 0) + return QUOTA_OK; + + if (dqstat->icurrent + number > dqstat->ihardlimit && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file hardlimit reached for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: file softlimit exceeded " + "for id=%d\n"); + dqstat->itime = CURRENT_TIME_SECONDS + + dq_info->iexpire; + } else if (CURRENT_TIME_SECONDS >= dqstat->itime && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +static int vzquota_check_space(struct dq_info *dq_info, + struct dq_stat *dqstat, + __u64 number, int dq_id, char prealloc) +{ + if (number == 0) + return QUOTA_OK; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (dqstat->bcurrent + number > dqstat->bhardlimit && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk hardlimit reached " + "for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: disk softlimit exceeded " + "for id=%d\n"); + dqstat->btime = CURRENT_TIME_SECONDS + + 
dq_info->bexpire; + } else { + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } + } else if (CURRENT_TIME_SECONDS >= dqstat->btime && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk quota " + "softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +#ifdef CONFIG_VZ_QUOTA_UGID +static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, unsigned long number) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->ihardlimit != 0 && + dqstat->icurrent + number > dqstat->ihardlimit) + return NO_QUOTA; + + if (dqstat->isoftlimit != 0 && + dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) + dqstat->itime = CURRENT_TIME_SECONDS + + dqinfo->iexpire; + else if (CURRENT_TIME_SECONDS >= dqstat->itime) + return NO_QUOTA; + } + + return QUOTA_OK; +} + +static int vzquota_check_ugid_space(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, __u64 number, char prealloc) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->bhardlimit != 0 && + dqstat->bcurrent + number > dqstat->bhardlimit) + return NO_QUOTA; + + if (dqstat->bsoftlimit != 0 && + dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) + dqstat->btime = CURRENT_TIME_SECONDS + + dqinfo->bexpire; + else + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } else if (CURRENT_TIME_SECONDS >= dqstat->btime) + return NO_QUOTA; + } + + return QUOTA_OK; +} +#endif + +/* ---------------------------------------------------------------------- + * Quota superblock operations + * --------------------------------------------------------------------- */ + +/* + * S_NOQUOTA note. + * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for + * - quota file (absent in our case) + * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like + * filesystem-specific new_inode, before the inode gets outside links. + * For the latter case, the only quota operation where care about S_NOQUOTA + * might be required is vzquota_drop, but there S_NOQUOTA has already been + * checked in DQUOT_DROP(). + * So, S_NOQUOTA may be ignored for now in the VZDQ code. + * + * The above note is not entirely correct. 
+ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from + * delete_inode if new_inode fails (for example, because of inode quota + * limits), so S_NOQUOTA check is needed in free_inode. + * This seems to be the dark corner of the current quota API. + */ + +/* + * Initialize quota operations for the specified inode. + */ +static int vzquota_initialize(struct inode *inode, int type) +{ + vzquota_inode_init_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Release quota for the specified inode. + */ +static int vzquota_drop(struct inode *inode) +{ + vzquota_inode_drop_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Allocate block callback. + * + * If (prealloc) disk quota exceeding warning is not printed. + * See Linux quota to know why. + * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_space(struct inode *inode, + qsize_t number, int prealloc) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id, prealloc); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_space(qmblk, qugid, + cnt, number, prealloc); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_space(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_space(&qmblk->dq_stat, number); + vzquota_data_unlock(inode, &data); + } + + inode_add_bytes(inode, number); + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock(inode, &data); + return NO_QUOTA; +} + +/* + * Allocate inodes callback. + * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid *qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_inodes(qmblk, qugid, + cnt, number); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_inodes(&qmblk->dq_stat, number); + vzquota_data_unlock((struct inode *)inode, &data); + } + + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock((struct inode *)inode, &data); + return NO_QUOTA; +} + +/* + * Free space callback. 
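Editor's note (not part of the patch): the decrement helpers used by the free callbacks (vzquota_decr_space / vzquota_decr_inodes, defined earlier in this file) clamp usage at zero, so a stray double free cannot wrap the counters, and clear the grace-period timestamp as soon as usage drops back under the soft limit. An illustrative use, with hypothetical values:

/* illustration only: demonstrates the clamp-and-reset semantics */
static void vzdq_decr_example(void)
{
	struct dq_stat qs = {
		.bcurrent   = 12288,	/* over the soft limit ... */
		.bsoftlimit = 8192,
		.btime      = 12345,	/* ... so a grace timer is running */
	};

	vzquota_decr_space(&qs, 16384);	/* free more than is accounted */
	/* now qs.bcurrent == 0 (clamped) and qs.btime == 0 (timer cleared) */
}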
+ */ +static int vzquota_free_space(struct inode *inode, qsize_t number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; /* isn't checked by the caller */ + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_space(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_space(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock(inode, &data); + } + inode_sub_bytes(inode, number); + might_sleep(); + return QUOTA_OK; +} + +/* + * Free inodes callback. + */ +static int vzquota_free_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_inodes(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_inodes(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock((struct inode *)inode, &data); + } + might_sleep(); + return QUOTA_OK; +} + +void vzquota_inode_off(struct inode * inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* The call is made through virtinfo, it can be an inode + * not controlled by vzquota. + */ + if (inode->i_sb->dq_op != &vz_quota_operations) + return; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return; + + if (qmblk == NULL) { + /* Tricky place. If qmblk == NULL, it means that this inode + * is not in area controlled by vzquota (except for rare + * case of already set S_NOQUOTA). But we have to set + * S_NOQUOTA in any case because vzquota can be turned + * on later, when this inode is invalid from viewpoint + * of vzquota. + * + * To be safe, we reacquire vzquota lock. 
+ */ + inode_qmblk_lock(inode->i_sb); + inode->i_flags |= S_NOQUOTA; + inode_qmblk_unlock(inode->i_sb); + return; + } else { + loff_t bytes = inode_get_bytes(inode); +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + inode->i_flags |= S_NOQUOTA; + + vzquota_decr_space(&qmblk->dq_stat, bytes); + vzquota_decr_inodes(&qmblk->dq_stat, 1); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_space(&qugid->qugid_stat, bytes); + vzquota_decr_inodes(&qugid->qugid_stat, 1); + } +#endif + + vzquota_data_unlock(inode, &data); + + vzquota_inode_drop_call(inode); + } +} + + +#ifdef CONFIG_VZ_QUOTA_UGID + +/* + * helper function for quota_transfer + * check that we can add inode to this quota_id + */ +static int vzquota_transfer_check(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + unsigned int type, __u64 size) +{ + if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK || + vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK) + return -1; + return 0; +} + +int vzquota_transfer_usage(struct inode *inode, + int mask, + struct vz_quota_ilink *qlnk) +{ + struct vz_quota_ugid *qugid_old; + __u64 space; + int i; + + space = inode_get_bytes(inode); + for (i = 0; i < MAXQUOTAS; i++) { + if (!(mask & (1 << i))) + continue; + /* + * Do not permit chown a file if its owner does not have + * ugid record. This might happen if we somehow exceeded + * the UID/GID (e.g. set uglimit less than number of users). + */ + if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD) + return -1; + if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space)) + return -1; + } + + for (i = 0; i < MAXQUOTAS; i++) { + if (!(mask & (1 << i))) + continue; + qugid_old = INODE_QLNK(inode)->qugid[i]; + vzquota_decr_space(&qugid_old->qugid_stat, space); + vzquota_decr_inodes(&qugid_old->qugid_stat, 1); + vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space); + vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1); + } + return 0; +} + +/* + * Transfer the inode between diffent user/group quotas. + */ +static int vzquota_transfer(struct inode *inode, struct iattr *iattr) +{ + return vzquota_inode_transfer_call(inode, iattr) ? + NO_QUOTA : QUOTA_OK; +} + +#else /* CONFIG_VZ_QUOTA_UGID */ + +static int vzquota_transfer(struct inode *inode, struct iattr *iattr) +{ + return QUOTA_OK; +} + +#endif + +/* + * Called under following semaphores: + * old_d->d_inode->i_sb->s_vfs_rename_sem + * old_d->d_inode->i_sem + * new_d->d_inode->i_sem + * [not verified --SAW] + */ +static int vzquota_rename(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + return vzquota_rename_check(inode, old_dir, new_dir) ? + NO_QUOTA : QUOTA_OK; +} + +/* + * Structure of superblock diskquota operations. + */ +struct dquot_operations vz_quota_operations = { + .initialize = vzquota_initialize, + .drop = vzquota_drop, + .alloc_space = vzquota_alloc_space, + .alloc_inode = vzquota_alloc_inode, + .free_space = vzquota_free_space, + .free_inode = vzquota_free_inode, + .transfer = vzquota_transfer, + .rename = vzquota_rename, +}; diff --git a/fs/vzdq_tree.c b/fs/vzdq_tree.c new file mode 100644 index 0000000..f4f2152 --- /dev/null +++ b/fs/vzdq_tree.c @@ -0,0 +1,286 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * This file contains Virtuozzo quota tree implementation + */ + +#include +#include +#include + +struct quotatree_tree *quotatree_alloc(void) +{ + int l; + struct quotatree_tree *tree; + + tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); + if (tree == NULL) + goto out; + + for (l = 0; l < QUOTATREE_DEPTH; l++) { + INIT_LIST_HEAD(&tree->levels[l].usedlh); + INIT_LIST_HEAD(&tree->levels[l].freelh); + tree->levels[l].freenum = 0; + } + tree->root = NULL; + tree->leaf_num = 0; +out: + return tree; +} + +static struct quotatree_node * +quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, + struct quotatree_find_state *st) +{ + void **block; + struct quotatree_node *parent; + int l, index; + + parent = NULL; + block = (void **)&tree->root; + l = 0; + while (l < level && *block != NULL) { + index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; + parent = *block; + block = parent->blocks + index; + l++; + } + if (st != NULL) { + st->block = block; + st->level = l; + } + + return parent; +} + +void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st) +{ + quotatree_follow(tree, id, QUOTATREE_DEPTH, st); + if (st->level == QUOTATREE_DEPTH) + return *st->block; + else + return NULL; +} + +void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) +{ + int i, count; + struct quotatree_node *p; + void *leaf; + + if (QTREE_LEAFNUM(tree) <= index) + return NULL; + + count = 0; + list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { + for (i = 0; i < QUOTATREE_BSIZE; i++) { + leaf = p->blocks[i]; + if (leaf == NULL) + continue; + if (count == index) + return leaf; + count++; + } + } + return NULL; +} + +/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) + * in the tree... 
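quotatree_follow above treats the quota id as a fixed-width radix path: each level extracts its own bit field with QUOTATREE_BSHIFT(l) and QUOTATREE_BMASK and uses it to index that level's block array. The stand-alone sketch below shows the decomposition with assumed parameters (a depth of 4 and 8 bits per level, i.e. 256-slot nodes); the real QUOTATREE_* constants live in the headers and may differ.

	#include <stdint.h>
	#include <stdio.h>

	#define DEPTH	4			/* assumed tree depth */
	#define BBITS	8			/* assumed bits per level */
	#define BMASK	((1u << BBITS) - 1)
	/* higher-order bits select the levels closer to the root */
	#define BSHIFT(l)	((DEPTH - (l) - 1) * BBITS)

	int main(void)
	{
		uint32_t id = 123456;		/* an arbitrary uid/gid */
		int l;

		for (l = 0; l < DEPTH; l++)
			printf("level %d -> slot %u\n",
			       l, (id >> BSHIFT(l)) & BMASK);
		return 0;
	}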
*/ +void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) +{ + int off; + struct quotatree_node *parent, *p; + struct list_head *lh; + + /* get parent refering correct quota tree node of the last level */ + parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); + if (!parent) + return NULL; + + off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ + lh = &parent->list; + do { + p = list_entry(lh, struct quotatree_node, list); + for ( ; off < QUOTATREE_BSIZE; off++) + if (p->blocks[off]) + return p->blocks[off]; + off = 0; + lh = lh->next; + } while (lh != &QTREE_LEAFLVL(tree)->usedlh); + + return NULL; +} + +int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st, void *data) +{ + struct quotatree_node *p; + int l, index; + + while (st->level < QUOTATREE_DEPTH) { + l = st->level; + if (!list_empty(&tree->levels[l].freelh)) { + p = list_entry(tree->levels[l].freelh.next, + struct quotatree_node, list); + list_del(&p->list); + } else { + p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL); + if (p == NULL) + return -ENOMEM; + /* save block number in the l-level + * it uses for quota file generation */ + p->num = tree->levels[l].freenum++; + } + list_add(&p->list, &tree->levels[l].usedlh); + memset(p->blocks, 0, sizeof(p->blocks)); + *st->block = p; + + index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; + st->block = p->blocks + index; + st->level++; + } + tree->leaf_num++; + *st->block = data; + + return 0; +} + +static struct quotatree_node * +quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, + int level) +{ + struct quotatree_node *parent; + struct quotatree_find_state st; + + parent = quotatree_follow(tree, id, level, &st); + if (st.level == QUOTATREE_DEPTH) + tree->leaf_num--; + *st.block = NULL; + return parent; +} + +void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) +{ + struct quotatree_node *p; + int level, i; + + p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); + for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { + for (i = 0; i < QUOTATREE_BSIZE; i++) + if (p->blocks[i] != NULL) + return; + list_move(&p->list, &tree->levels[level].freelh); + p = quotatree_remove_ptr(tree, id, level); + } +} + +#if 0 +static void quotatree_walk(struct quotatree_tree *tree, + struct quotatree_node *node_start, + quotaid_t id_start, + int level_start, int level_end, + int (*callback)(struct quotatree_tree *, + quotaid_t id, + int level, + void *ptr, + void *data), + void *data) +{ + struct quotatree_node *p; + int l, shift, index; + quotaid_t id; + struct quotatree_find_state st; + + p = node_start; + l = level_start; + shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; + id = id_start; + index = 0; + + /* + * Invariants: + * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; + * id & ((1 << shift) - 1) == 0 + * p is l-level node corresponding to id + */ + do { + if (!p) + break; + + if (l < level_end) { + for (; index < QUOTATREE_BSIZE; index++) + if (p->blocks[index] != NULL) + break; + if (index < QUOTATREE_BSIZE) { + /* descend */ + p = p->blocks[index]; + l++; + shift -= QUOTAID_BBITS; + id += (quotaid_t)index << shift; + index = 0; + continue; + } + } + + if ((*callback)(tree, id, l, p, data)) + break; + + /* ascend and to the next node */ + p = quotatree_follow(tree, id, l, &st); + + index = ((id >> shift) & QUOTATREE_BMASK) + 1; + l--; + shift += QUOTAID_BBITS; + id &= ~(((quotaid_t)1 << shift) - 1); + } while (l >= level_start); +} +#endif + +static void free_list(struct 
list_head *node_list) +{ + struct quotatree_node *p, *tmp; + + list_for_each_entry_safe(p, tmp, node_list, list) { + list_del(&p->list); + kfree(p); + } +} + +static inline void quotatree_free_nodes(struct quotatree_tree *tree) +{ + int i; + + for (i = 0; i < QUOTATREE_DEPTH; i++) { + free_list(&tree->levels[i].usedlh); + free_list(&tree->levels[i].freelh); + } +} + +static void quotatree_free_leafs(struct quotatree_tree *tree, + void (*dtor)(void *)) +{ + int i; + struct quotatree_node *p; + + list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (p->blocks[i] == NULL) + continue; + + dtor(p->blocks[i]); + } + } +} + +void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)) +{ + quotatree_free_leafs(tree, dtor); + quotatree_free_nodes(tree); + kfree(tree); +} diff --git a/fs/vzdq_ugid.c b/fs/vzdq_ugid.c new file mode 100644 index 0000000..1031149 --- /dev/null +++ b/fs/vzdq_ugid.c @@ -0,0 +1,1221 @@ +/* + * Copyright (C) 2002 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo UID/GID disk quota implementation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * XXX + * may be something is needed for sb->s_dquot->info[]? + */ + +#define USRQUOTA_MASK (1 << USRQUOTA) +#define GRPQUOTA_MASK (1 << GRPQUOTA) +#define QTYPE2MASK(type) (1 << (type)) + +static struct kmem_cache *vz_quota_ugid_cachep; + +/* guard to protect vz_quota_master from destroy in quota_on/off. Also protects + * list on the hash table */ +extern struct semaphore vz_quota_sem; + +inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) +{ + if (qugid != VZ_QUOTA_UGBAD) + atomic_inc(&qugid->qugid_count); + return qugid; +} + +/* we don't limit users with zero limits */ +static inline int vzquota_fake_stat(struct dq_stat *stat) +{ + return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && + stat->ihardlimit == 0 && stat->isoftlimit == 0; +} + +/* callback function for quotatree_free() */ +static inline void vzquota_free_qugid(void *ptr) +{ + kmem_cache_free(vz_quota_ugid_cachep, ptr); +} + +/* + * destroy ugid, if it have zero refcount, limits and usage + * must be called under qmblk->dq_sem + */ +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid) +{ + if (qugid == VZ_QUOTA_UGBAD) + return; + qmblk_data_read_lock(qmblk); + if (atomic_dec_and_test(&qugid->qugid_count) && + (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && + vzquota_fake_stat(&qugid->qugid_stat) && + qugid->qugid_stat.bcurrent == 0 && + qugid->qugid_stat.icurrent == 0) { + quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), + qugid->qugid_id); + qmblk->dq_ugid_count--; + vzquota_free_qugid(qugid); + } + qmblk_data_read_unlock(qmblk); +} + +/* + * Get ugid block by its index, like it would present in array. + * In reality, this is not array - this is leafs chain of the tree. + * NULL if index is out of range. + * qmblk semaphore is required to protect the tree. 
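vzquota_put_ugid above implements a "free only when fully idle" rule: dropping the last reference reclaims the per-uid/gid record only if it carries no limits and no usage (and the set is not fixed), because a record that still holds limits must survive even when nothing currently references it. A simplified user-space model of that rule, again with invented toy_* names:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* toy per-uid record: refcount plus usage and limits */
	struct toy_ugid {
		int refcount;
		uint64_t usage;
		uint64_t softlimit, hardlimit;
	};

	/* drop one reference; reap the record only if it is completely idle */
	static void toy_put(struct toy_ugid **slot)
	{
		struct toy_ugid *u = *slot;

		if (--u->refcount > 0)
			return;
		if (u->usage == 0 && u->softlimit == 0 && u->hardlimit == 0) {
			free(u);
			*slot = NULL;	/* record removed from the "tree" */
		}
		/* otherwise keep it: its limits or usage still carry state */
	}

	int main(void)
	{
		struct toy_ugid *u = calloc(1, sizeof(*u));

		u->refcount = 1;
		u->hardlimit = 100;	/* has a real limit, must survive */
		toy_put(&u);
		printf("kept: %s\n", u ? "yes" : "no");
		free(u);
		return 0;
	}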
+ */ +static inline struct vz_quota_ugid * +vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) +{ + return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); +} + +/* + * get next element from ugid "virtual array" + * ugid must be in current array and this array may not be changed between + * two accesses (quaranteed by "stopped" quota state and quota semaphore) + * qmblk semaphore is required to protect the tree + */ +static inline struct vz_quota_ugid * +vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) +{ + return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), + qugid->qugid_id); +} + +/* + * requires dq_sem + */ +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) +{ + struct vz_quota_ugid *qugid; + struct quotatree_tree *tree; + struct quotatree_find_state st; + + tree = QUGID_TREE(qmblk, type); + qugid = quotatree_find(tree, quota_id, &st); + if (qugid) + goto success; + + /* caller does not want alloc */ + if (flags & VZDQUG_FIND_DONT_ALLOC) + goto fail; + + if (flags & VZDQUG_FIND_FAKE) + goto doit; + + /* check limit */ + if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) + goto fail; + + /* see comment at VZDQUG_FIXED_SET define */ + if (qmblk->dq_flags & VZDQUG_FIXED_SET) + goto fail; + +doit: + /* alloc new structure */ + qugid = kmem_cache_alloc(vz_quota_ugid_cachep, + GFP_NOFS | __GFP_NOFAIL); + if (qugid == NULL) + goto fail; + + /* initialize new structure */ + qugid->qugid_id = quota_id; + memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); + qugid->qugid_type = type; + atomic_set(&qugid->qugid_count, 0); + + /* insert in tree */ + if (quotatree_insert(tree, quota_id, &st, qugid) < 0) + goto fail_insert; + qmblk->dq_ugid_count++; + +success: + vzquota_get_ugid(qugid); + return qugid; + +fail_insert: + vzquota_free_qugid(qugid); +fail: + return VZ_QUOTA_UGBAD; +} + +/* + * takes dq_sem, may schedule + */ +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) +{ + struct vz_quota_ugid *qugid; + + down(&qmblk->dq_sem); + qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); + up(&qmblk->dq_sem); + + return qugid; +} + +/* + * destroy all ugid records on given quota master + */ +void vzquota_kill_ugid(struct vz_quota_master *qmblk) +{ + BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || + (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); + + if (qmblk->dq_uid_tree != NULL) { + quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); + quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); + } +} + + +/* ---------------------------------------------------------------------- + * Management interface to ugid quota for (super)users. 
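__vzquota_find_ugid above is a lookup-or-create routine: quotatree_find records where the search stopped in a quotatree_find_state, so when the id is missing and allocation is allowed (no VZDQUG_FIND_DONT_ALLOC, count below dq_ugid_max or VZDQUG_FIND_FAKE), quotatree_insert can graft the new record at the remembered slot without walking the tree again. The sketch below shows the same find-state idea on a sorted singly linked list rather than the quota tree; all names are illustrative.

	#include <stdio.h>
	#include <stdlib.h>

	struct node {
		unsigned int id;
		struct node *next;
	};

	/* toy "find state": remembers where the search ended so a following
	 * insert does not have to repeat the walk */
	struct find_state {
		struct node **slot;	/* pointer to the pointer to patch */
	};

	static struct node *find(struct node **head, unsigned int id,
				 struct find_state *st)
	{
		struct node **pp = head;

		while (*pp && (*pp)->id < id)
			pp = &(*pp)->next;
		st->slot = pp;
		return (*pp && (*pp)->id == id) ? *pp : NULL;
	}

	static struct node *find_or_create(struct node **head, unsigned int id)
	{
		struct find_state st;
		struct node *n = find(head, id, &st);

		if (n)
			return n;		/* already present */
		n = calloc(1, sizeof(*n));	/* limit/flag checks would go here */
		if (!n)
			return NULL;
		n->id = id;
		n->next = *st.slot;		/* insert at the remembered spot */
		*st.slot = n;
		return n;
	}

	int main(void)
	{
		struct node *head = NULL;

		find_or_create(&head, 7);
		find_or_create(&head, 3);
		printf("first id: %u\n", head->id);	/* prints 3 */
		return 0;
	}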
+ * --------------------------------------------------------------------- */ + +static int vzquota_initialize2(struct inode *inode, int type) +{ + return QUOTA_OK; +} + +static int vzquota_drop2(struct inode *inode) +{ + return QUOTA_OK; +} + +static int vzquota_alloc_space2(struct inode *inode, + qsize_t number, int prealloc) +{ + inode_add_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_free_space2(struct inode *inode, qsize_t number) +{ + inode_sub_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_free_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) +{ + return QUOTA_OK; +} + +struct dquot_operations vz_quota_operations2 = { + .initialize = vzquota_initialize2, + .drop = vzquota_drop2, + .alloc_space = vzquota_alloc_space2, + .alloc_inode = vzquota_alloc_inode2, + .free_space = vzquota_free_space2, + .free_inode = vzquota_free_inode2, + .transfer = vzquota_transfer2, +}; + + +asmlinkage long sys_unlink(const char __user * pathname); +asmlinkage long sys_rename(const char __user * oldname, + const char __user * newname); +asmlinkage long sys_symlink(const char __user * oldname, + const char __user * newname); + +/* called under sb->s_umount semaphore */ +static int vz_restore_symlink(struct super_block *sb, char *path, int type) +{ + mm_segment_t oldfs; + char *newpath; + char dest[64]; + const char *names[] = { + [USRQUOTA] "aquota.user", + [GRPQUOTA] "aquota.group" + }; + int err; + + newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL); + if (newpath == NULL) + return -ENOMEM; + + strcpy(newpath, path); + strcat(newpath, ".new"); + + sprintf(dest, "/proc/vz/vzaquota/%08x/%s", + new_encode_dev(sb->s_dev), names[type]); + + /* + * Lockdep will learn unneeded dependency while unlink(2): + * ->s_umount => ->i_mutex/1 => ->i_mutex + * Reverse dependency is, + * open_namei() => ->i_mutex => lookup_hash() => __lookup_hash() + * => ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev() + * => user_get_super() => ->s_umount + * + * However, first set of ->i_mutex'es belong to /, second to /proc . + * Right fix is to get rid of vz_restore_symlink(), of course. + */ + up_read(&sb->s_umount); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_unlink(newpath); + if (err < 0 && err != -ENOENT) + goto out_restore; + err = sys_symlink(dest, newpath); + if (err < 0) + goto out_restore; + err = sys_rename(newpath, path); +out_restore: + set_fs(oldfs); + + down_read(&sb->s_umount); + /* umounted meanwhile? 
*/ + if (err == 0 && !sb->s_root) + err = -ENODEV; + + kfree(newpath); + return err; +} + +/* called under sb->s_umount semaphore */ +static int vz_quota_on(struct super_block *sb, int type, + int format_id, char *path, int remount) +{ + struct vz_quota_master *qmblk; + int mask, mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = vz_restore_symlink(sb, path, type); + if (err < 0) + goto out_put; + + down(&vz_quota_sem); + mask = 0; + mask2 = 0; + sb->dq_op = &vz_quota_operations2; + sb->s_qcop = &vz_quotactl_operations; + if (type == USRQUOTA) { + mask = DQUOT_USR_ENABLED; + mask2 = VZDQ_USRQUOTA; + } + if (type == GRPQUOTA) { + mask = DQUOT_GRP_ENABLED; + mask2 = VZDQ_GRPQUOTA; + } + err = -EBUSY; + if (qmblk->dq_flags & mask2) + goto out_sem; + + err = 0; + qmblk->dq_flags |= mask2; + sb->s_dquot.flags |= mask; + +out_sem: + up(&vz_quota_sem); +out_put: + qmblk_put(qmblk); +out: + return err; +} + +static int vz_quota_off(struct super_block *sb, int type, int remount) +{ + struct vz_quota_master *qmblk; + int mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + mask2 = 0; + if (type == USRQUOTA) + mask2 = VZDQ_USRQUOTA; + if (type == GRPQUOTA) + mask2 = VZDQ_GRPQUOTA; + err = -EINVAL; + if (!(qmblk->dq_flags & mask2)) + goto out; + + qmblk->dq_flags &= ~mask2; + err = 0; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_quota_sync(struct super_block *sb, int type) +{ + return 0; /* vz quota is always uptodate */ +} + +static int vz_get_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid *ugid; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); + if (ugid != VZ_QUOTA_UGBAD) { + qmblk_data_read_lock(qmblk); + di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10; + di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; + di->dqb_curspace = ugid->qugid_stat.bcurrent; + di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; + di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; + di->dqb_curinodes = ugid->qugid_stat.icurrent; + di->dqb_btime = ugid->qugid_stat.btime; + di->dqb_itime = ugid->qugid_stat.itime; + qmblk_data_read_unlock(qmblk); + di->dqb_valid = QIF_ALL; + vzquota_put_ugid(qmblk, ugid); + } else { + memset(di, 0, sizeof(*di)); + di->dqb_valid = QIF_ALL; + } + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqblk(struct vz_quota_master *qmblk, + int type, qid_t id, struct if_dqblk *di) +{ + struct vz_quota_ugid *ugid; + + ugid = vzquota_find_ugid(qmblk, id, type, 0); + if (ugid == VZ_QUOTA_UGBAD) + return -ESRCH; + + qmblk_data_write_lock(qmblk); + /* + * Subtle compatibility breakage. + * + * Some old non-vz kernel quota didn't start grace period + * if the new soft limit happens to be below the usage. + * Non-vz kernel quota in 2.4.20 starts the grace period + * (if it hasn't been started). 
+ * Current non-vz kernel performs even more complicated + * manipulations... + * + * Also, current non-vz kernels have inconsistency related to + * the grace time start. In regular operations the grace period + * is started if the usage is greater than the soft limit (and, + * strangely, is cancelled if the usage is less). + * However, set_dqblk starts the grace period if the usage is greater + * or equal to the soft limit. + * + * Here we try to mimic the behavior of the current non-vz kernel. + */ + if (di->dqb_valid & QIF_BLIMITS) { + ugid->qugid_stat.bhardlimit = + (__u64)di->dqb_bhardlimit << 10; + ugid->qugid_stat.bsoftlimit = + (__u64)di->dqb_bsoftlimit << 10; + if (di->dqb_bsoftlimit == 0 || + ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) + ugid->qugid_stat.btime = 0; + else if (!(di->dqb_valid & QIF_BTIME)) + ugid->qugid_stat.btime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].bexpire; + else + ugid->qugid_stat.btime = di->dqb_btime; + } + if (di->dqb_valid & QIF_ILIMITS) { + ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; + ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; + if (di->dqb_isoftlimit == 0 || + ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) + ugid->qugid_stat.itime = 0; + else if (!(di->dqb_valid & QIF_ITIME)) + ugid->qugid_stat.itime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].iexpire; + else + ugid->qugid_stat.itime = di->dqb_itime; + } + qmblk_data_write_unlock(qmblk); + vzquota_put_ugid(qmblk, ugid); + + return 0; +} + +static int vz_set_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqblk(qmblk, type, id, di); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_get_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; + ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; + ii->dqi_flags = 0; + ii->dqi_valid = IIF_ALL; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqinfo(struct vz_quota_master *qmblk, + int type, struct if_dqinfo *ii) +{ + if (ii->dqi_valid & IIF_FLAGS) + if (ii->dqi_flags & DQF_MASK) + return -EINVAL; + + if (ii->dqi_valid & IIF_BGRACE) + qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; + return 0; +} + +static int vz_set_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqinfo(qmblk, type, ii); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +#ifdef CONFIG_QUOTA_COMPAT + +#define Q_GETQUOTI_SIZE 1024 + +#define UGID2DQBLK(dst, src) \ + do { \ + (dst)->dqb_ihardlimit = 
(src)->qugid_stat.ihardlimit; \ + (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ + (dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \ + /* in 1K blocks */ \ + (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ + /* in 1K blocks */ \ + (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ + /* in bytes, 64 bit */ \ + (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \ + (dst)->dqb_btime = (src)->qugid_stat.btime; \ + (dst)->dqb_itime = (src)->qugid_stat.itime; \ + } while (0) + +static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, + struct v2_disk_dqblk __user *dqblk) +{ + struct vz_quota_master *qmblk; + struct v2_disk_dqblk *data, *kbuf; + struct vz_quota_ugid *ugid; + int count; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = -ENOMEM; + kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf)); + if (!kbuf) + goto out; + + down(&vz_quota_sem); + down(&qmblk->dq_sem); + for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; + ugid != NULL && count < Q_GETQUOTI_SIZE; + count++) + { + data = kbuf + count; + qmblk_data_read_lock(qmblk); + UGID2DQBLK(data, ugid); + qmblk_data_read_unlock(qmblk); + data->dqb_id = ugid->qugid_id; + + /* Find next entry */ + ugid = vzquota_get_next(qmblk, ugid); + BUG_ON(ugid != NULL && ugid->qugid_type != type); + } + up(&qmblk->dq_sem); + up(&vz_quota_sem); + + err = count; + if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) + err = -EFAULT; + + vfree(kbuf); +out: + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + + return err; +} + +#endif + +struct quotactl_ops vz_quotactl_operations = { + .quota_on = vz_quota_on, + .quota_off = vz_quota_off, + .quota_sync = vz_quota_sync, + .get_info = vz_get_dqinfo, + .set_info = vz_set_dqinfo, + .get_dqblk = vz_get_dqblk, + .set_dqblk = vz_set_dqblk, +#ifdef CONFIG_QUOTA_COMPAT + .get_quoti = vz_get_quoti, +#endif +}; + + +/* ---------------------------------------------------------------------- + * Management interface for host system admins. 
+ * --------------------------------------------------------------------- */ + +static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, + struct vz_quota_iface __user *u_ugid_buf, int compat) +{ + struct vz_quota_master *qmblk; + int ret; + + down(&vz_quota_sem); + + ret = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + ret = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept new ugids */ + + ret = 0; + /* start to add ugids */ + for (ret = 0; ret < ugid_size; ret++) { + struct vz_quota_iface ugid_buf; + struct vz_quota_ugid *ugid; + + if (!compat) { + if (copy_from_user(&ugid_buf, u_ugid_buf, + sizeof(ugid_buf))) + break; + u_ugid_buf++; /* next user buffer */ + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_iface oqif; + if (copy_from_user(&oqif, u_ugid_buf, + sizeof(oqif))) + break; + ugid_buf.qi_id = oqif.qi_id; + ugid_buf.qi_type = oqif.qi_type; + compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat); + u_ugid_buf = (struct vz_quota_iface __user *) + (((void *)u_ugid_buf) + sizeof(oqif)); +#endif + } + + if (ugid_buf.qi_type >= MAXQUOTAS) + break; /* bad quota type - this is the only check */ + + ugid = vzquota_find_ugid(qmblk, + ugid_buf.qi_id, ugid_buf.qi_type, 0); + if (ugid == VZ_QUOTA_UGBAD) { + qmblk->dq_flags |= VZDQUG_FIXED_SET; + break; /* limit reached */ + } + + /* update usage/limits + * we can copy the data without the lock, because the data + * cannot be modified in VZDQ_STARTING state */ + ugid->qugid_stat = ugid_buf.qi_stat; + + vzquota_put_ugid(qmblk, ugid); + } +out: + up(&vz_quota_sem); + + return ret; +} + +static int quota_ugid_setgrace(unsigned int quota_id, + struct dq_info __user u_dq_info[], int compat) +{ + struct vz_quota_master *qmblk; + struct dq_info dq_info[MAXQUOTAS]; + struct dq_info *target; + int err, type; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept changing options */ + + err = -EFAULT; + if (!compat) { + if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_dq_info odqi[MAXQUOTAS]; + if (copy_from_user(odqi, u_dq_info, sizeof(odqi))) + goto out; + for (type = 0; type < MAXQUOTAS; type++) + compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]); +#endif + } + + err = 0; + + /* update in qmblk */ + for (type = 0; type < MAXQUOTAS; type++) { + target = &qmblk->dq_ugid_info[type]; + target->bexpire = dq_info[type].bexpire; + target->iexpire = dq_info[type].iexpire; + } +out: + up(&vz_quota_sem); + + return err; +} + +static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, + struct vz_quota_iface *u_ugid_buf) +{ + int type, count; + struct vz_quota_ugid *ugid; + + if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + + QTREE_LEAFNUM(qmblk->dq_gid_tree) + <= index) + return 0; + + count = 0; + + type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? 
USRQUOTA : GRPQUOTA; + if (type == GRPQUOTA) + index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); + + /* loop through ugid and then qgid quota */ +repeat: + for (ugid = vzquota_get_byindex(qmblk, index, type); + ugid != NULL && count < size; + ugid = vzquota_get_next(qmblk, ugid), count++) + { + struct vz_quota_iface ugid_buf; + + /* form interface buffer and send in to user-level */ + qmblk_data_read_lock(qmblk); + memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, + sizeof(ugid_buf.qi_stat)); + qmblk_data_read_unlock(qmblk); + ugid_buf.qi_id = ugid->qugid_id; + ugid_buf.qi_type = ugid->qugid_type; + + memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf)); + u_ugid_buf++; /* next portion of user buffer */ + } + + if (type == USRQUOTA && count < size) { + type = GRPQUOTA; + index = 0; + goto repeat; + } + + return count; +} + +static int quota_ugid_getstat(unsigned int quota_id, + int index, int size, struct vz_quota_iface __user *u_ugid_buf, + int compat) +{ + struct vz_quota_master *qmblk; + struct vz_quota_iface *k_ugid_buf; + int err; + + if (index < 0 || size < 0) + return -EINVAL; + + if (size > INT_MAX / sizeof(struct vz_quota_iface)) + return -EINVAL; + + k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface)); + if (k_ugid_buf == NULL) + return -ENOMEM; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + down(&qmblk->dq_sem); + err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf); + up(&qmblk->dq_sem); + if (err < 0) + goto out; + + if (!compat) { + if (copy_to_user(u_ugid_buf, k_ugid_buf, + err * sizeof(struct vz_quota_iface))) + err = -EFAULT; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_iface oqif; + int i; + for (i = 0; i < err; i++) { + oqif.qi_id = k_ugid_buf[i].qi_id; + oqif.qi_type = k_ugid_buf[i].qi_type; + dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat, + &oqif.qi_stat); + if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif))) + err = -EFAULT; + u_ugid_buf = (struct vz_quota_iface __user *) + (((void *)u_ugid_buf) + sizeof(oqif)); + } +#endif + } + +out: + up(&vz_quota_sem); + vfree(k_ugid_buf); + return err; +} + +static int quota_ugid_getgrace(unsigned int quota_id, + struct dq_info __user u_dq_info[], int compat) +{ + struct vz_quota_master *qmblk; + struct dq_info dq_info[MAXQUOTAS]; + struct dq_info *target; + int err, type; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = 0; + /* update from qmblk */ + for (type = 0; type < MAXQUOTAS; type ++) { + target = &qmblk->dq_ugid_info[type]; + dq_info[type].bexpire = target->bexpire; + dq_info[type].iexpire = target->iexpire; + dq_info[type].flags = target->flags; + } + + if (!compat) { + if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) + err = -EFAULT; + } else { +#ifdef CONFIG_COMPAT + struct compat_dq_info odqi[MAXQUOTAS]; + for (type = 0; type < MAXQUOTAS; type ++) + dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]); + if (copy_to_user(u_dq_info, odqi, sizeof(odqi))) + err = -EFAULT; +#endif + } +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_getconfig(unsigned int quota_id, + struct vz_quota_ugid_stat __user *info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_stat kinfo; + int err; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = 0; + kinfo.limit = qmblk->dq_ugid_max; + kinfo.count = qmblk->dq_ugid_count; + kinfo.flags = qmblk->dq_flags; + + if 
(copy_to_user(info, &kinfo, sizeof(kinfo))) + err = -EFAULT; +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setconfig(unsigned int quota_id, + struct vz_quota_ugid_stat __user *info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_stat kinfo; + int err; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&kinfo, info, sizeof(kinfo))) + goto out; + + err = 0; + qmblk->dq_ugid_max = kinfo.limit; + if (qmblk->dq_state == VZDQ_STARTING) { + qmblk->dq_flags = kinfo.flags; + if (qmblk->dq_flags & VZDQUG_ON) + qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; + } + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setlimit(unsigned int quota_id, + struct vz_quota_ugid_setlimit __user *u_lim) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setlimit lim; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&lim, u_lim, sizeof(lim))) + goto out; + + err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setinfo(unsigned int quota_id, + struct vz_quota_ugid_setinfo __user *u_info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setinfo info; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&info, u_info, sizeof(info))) + goto out; + + err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); + +out: + up(&vz_quota_sem); + + return err; +} + +/* + * This is a system call to maintain UGID quotas + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat) +{ + int ret; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_UGID_GETSTAT: + ret = quota_ugid_getstat(quota_id, + ugid_index, ugid_size, + (struct vz_quota_iface __user *)addr, + compat); + break; + case VZ_DQ_UGID_ADDSTAT: + ret = quota_ugid_addstat(quota_id, ugid_size, + (struct vz_quota_iface __user *) addr, + compat); + break; + case VZ_DQ_UGID_GETGRACE: + ret = quota_ugid_getgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_SETGRACE: + ret = quota_ugid_setgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_GETCONFIG: + ret = quota_ugid_getconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETCONFIG: + ret = quota_ugid_setconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETLIMIT: + ret = quota_ugid_setlimit(quota_id, + (struct vz_quota_ugid_setlimit __user *) + addr); + break; + case VZ_DQ_UGID_SETINFO: + ret = quota_ugid_setinfo(quota_id, + (struct vz_quota_ugid_setinfo __user *) + addr); + break; + default: + ret = -EINVAL; + goto out; + } +out: + return ret; +} + +static void ugid_quota_on_sb(struct super_block *sb) +{ + struct super_block *real_sb; + struct vz_quota_master *qmblk; + + if (!sb->s_op->get_quota_root) + return; + + real_sb = sb->s_op->get_quota_root(sb)->i_sb; + if (real_sb->dq_op != &vz_quota_operations) + return; + + sb->dq_op = &vz_quota_operations2; + 
sb->s_qcop = &vz_quotactl_operations; + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + + qmblk = vzquota_find_qmblk(sb); + if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) + return; + down(&vz_quota_sem); + if (qmblk->dq_flags & VZDQ_USRQUOTA) + sb->s_dquot.flags |= DQUOT_USR_ENABLED; + if (qmblk->dq_flags & VZDQ_GRPQUOTA) + sb->s_dquot.flags |= DQUOT_GRP_ENABLED; + up(&vz_quota_sem); + qmblk_put(qmblk); +} + +static void ugid_quota_off_sb(struct super_block *sb) +{ + /* can't make quota off on mounted super block */ + BUG_ON(sb->s_root != NULL); +} + +static int ugid_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int old_ret) +{ + struct virt_info_quota *viq; + + viq = (struct virt_info_quota *)data; + + switch (n) { + case VIRTINFO_QUOTA_ON: + ugid_quota_on_sb(viq->super); + break; + case VIRTINFO_QUOTA_OFF: + ugid_quota_off_sb(viq->super); + break; + case VIRTINFO_QUOTA_GETSTAT: + break; + default: + return old_ret; + } + return NOTIFY_OK; +} + +static struct vnotifier_block ugid_notifier_block = { + .notifier_call = ugid_notifier_call, +}; + +/* ---------------------------------------------------------------------- + * Init/exit. + * --------------------------------------------------------------------- */ + +int vzquota_ugid_init(void) +{ + int err; + + vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", + sizeof(struct vz_quota_ugid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vz_quota_ugid_cachep == NULL) + goto err_slab; + + err = register_quota_format(&vz_quota_empty_v2_format); + if (err) + goto err_reg; + + virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); + return 0; + +err_reg: + kmem_cache_destroy(vz_quota_ugid_cachep); + return err; + +err_slab: + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + return -ENOMEM; +} + +void vzquota_ugid_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); + unregister_quota_format(&vz_quota_empty_v2_format); + + kmem_cache_destroy(vz_quota_ugid_cachep); +} diff --git a/fs/vzdquot.c b/fs/vzdquot.c new file mode 100644 index 0000000..a6605dd --- /dev/null +++ b/fs/vzdquot.c @@ -0,0 +1,1954 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains the core of Virtuozzo disk quota implementation: + * maintenance of VZDQ information in inodes, + * external interfaces, + * module entry. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * Locking + * + * ---------------------------------------------------------------------- */ + +/* + * Serializes on/off and all other do_vzquotactl operations. + * Protects qmblk hash. 
+ */ +struct semaphore vz_quota_sem; + +/* + * Data access locks + * inode_qmblk + * protects qmblk pointers in all inodes and qlnk content in general + * (but not qmblk content); + * also protects related qmblk invalidation procedures; + * can't be per-inode because of vzquota_dtree_qmblk complications + * and problems with serialization with quota_on, + * but can be per-superblock; + * qmblk_data + * protects qmblk fields (such as current usage) + * quota_data + * protects charge/uncharge operations, thus, implies + * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock + * (to protect ugid pointers). + * + * Lock order: + * inode_qmblk_lock -> dcache_lock + * inode_qmblk_lock -> qmblk_data + */ +static DEFINE_SPINLOCK(vzdq_qmblk_lock); + +inline void inode_qmblk_lock(struct super_block *sb) +{ + spin_lock(&vzdq_qmblk_lock); +} + +inline void inode_qmblk_unlock(struct super_block *sb) +{ + spin_unlock(&vzdq_qmblk_lock); +} + +inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) +{ + spin_lock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) +{ + spin_unlock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) +{ + spin_lock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) +{ + spin_unlock(&qmblk->dq_data_lock); +} + +struct quota_format_type vz_quota_empty_v2_format = { + .qf_fmt_id = QFMT_VFS_V0, + .qf_ops = NULL, + .qf_owner = THIS_MODULE, +}; + +/* ---------------------------------------------------------------------- + * + * Master hash table handling. + * + * SMP not safe, serialied by vz_quota_sem within quota syscalls + * + * --------------------------------------------------------------------- */ + +static struct kmem_cache *vzquota_cachep; + +/* + * Hash function. 
+ */ +#define QHASH_BITS 6 +#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) +#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) + +struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; +int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; + +static inline int vzquota_hash_func(unsigned int qid) +{ + return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); +} + +/** + * vzquota_alloc_master - alloc and instantiate master quota record + * + * Returns: + * pointer to newly created record if SUCCESS + * -ENOMEM if out of memory + * -EEXIST if record with given quota_id already exist + */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat) +{ + int err; + struct vz_quota_master *qmblk; + + err = -EEXIST; + if (vzquota_find_master(quota_id) != NULL) + goto out; + + err = -ENOMEM; + qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); + if (qmblk == NULL) + goto out; +#ifdef CONFIG_VZ_QUOTA_UGID + qmblk->dq_uid_tree = quotatree_alloc(); + if (!qmblk->dq_uid_tree) + goto out_free; + + qmblk->dq_gid_tree = quotatree_alloc(); + if (!qmblk->dq_gid_tree) + goto out_free_tree; +#endif + + qmblk->dq_state = VZDQ_STARTING; + init_MUTEX(&qmblk->dq_sem); + spin_lock_init(&qmblk->dq_data_lock); + + qmblk->dq_id = quota_id; + qmblk->dq_stat = qstat->dq_stat; + qmblk->dq_info = qstat->dq_info; + qmblk->dq_root_path.dentry = NULL; + qmblk->dq_root_path.mnt = NULL; + qmblk->dq_sb = NULL; + qmblk->dq_ugid_count = 0; + qmblk->dq_ugid_max = 0; + qmblk->dq_flags = 0; + memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + + atomic_set(&qmblk->dq_count, 1); + + /* insert in hash chain */ + list_add(&qmblk->dq_hash, + &vzquota_hash_table[vzquota_hash_func(quota_id)]); + + /* success */ + return qmblk; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_free_tree: + quotatree_free(qmblk->dq_uid_tree, NULL); +out_free: + kmem_cache_free(vzquota_cachep, qmblk); +#endif +out: + return ERR_PTR(err); +} + +static struct vz_quota_master *vzquota_alloc_fake(void) +{ + struct vz_quota_master *qmblk; + + qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); + if (qmblk == NULL) + return NULL; + memset(qmblk, 0, sizeof(*qmblk)); + qmblk->dq_state = VZDQ_STOPING; + qmblk->dq_flags = VZDQ_NOQUOT; + spin_lock_init(&qmblk->dq_data_lock); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + atomic_set(&qmblk->dq_count, 1); + return qmblk; +} + +/** + * vzquota_find_master - find master record with given id + * + * Returns qmblk without touching its refcounter. + * Called under vz_quota_sem. + */ +struct vz_quota_master *vzquota_find_master(unsigned int quota_id) +{ + int i; + struct vz_quota_master *qp; + + i = vzquota_hash_func(quota_id); + list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { + if (qp->dq_id == quota_id) + return qp; + } + return NULL; +} + +/** + * vzquota_free_master - release resources taken by qmblk, freeing memory + * + * qmblk is assumed to be already taken out from the hash. + * Should be called outside vz_quota_sem. + */ +void vzquota_free_master(struct vz_quota_master *qmblk) +{ +#ifdef CONFIG_VZ_QUOTA_UGID + vzquota_kill_ugid(qmblk); +#endif + BUG_ON(!list_empty(&qmblk->dq_ilink_list)); + kmem_cache_free(vzquota_cachep, qmblk); +} + + +/* ---------------------------------------------------------------------- + * + * Passing quota information through current + * + * Used in inode -> qmblk lookup at inode creation stage (since at that + * time there are no links between the inode being created and its parent + * directory). 
+ * + * --------------------------------------------------------------------- */ + +#define VZDQ_CUR_MAGIC 0x57d0fee2 + +static inline int vzquota_cur_qmblk_check(void) +{ + return current->magic == VZDQ_CUR_MAGIC; +} + +static inline struct inode *vzquota_cur_qmblk_fetch(void) +{ + return current->ino; +} + +static inline void vzquota_cur_qmblk_set(struct inode *data) +{ + struct task_struct *tsk; + + tsk = current; + tsk->magic = VZDQ_CUR_MAGIC; + tsk->ino = data; +} + +#if 0 +static inline void vzquota_cur_qmblk_reset(void) +{ + current->magic = 0; +} +#endif + + +/* ---------------------------------------------------------------------- + * + * Superblock quota operations + * + * --------------------------------------------------------------------- */ + +/* + * Kernel structure abuse. + * We use files[0] pointer as an int variable: + * reference counter of how many quota blocks uses this superblock. + * files[1] is used for generations structure which helps us to track + * when traversing of dentries is really required. + */ +#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master +#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ + &sb->s_dquot.dqio_mutex) + +#if defined(VZ_QUOTA_UNLOAD) + +#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count + +struct dquot_operations *orig_dq_op; +struct quotactl_ops *orig_dq_cop; + +/** + * quota_get_super - account for new a quoted tree under the superblock + * + * One superblock can have multiple directory subtrees with different VZ + * quotas. We keep a counter of such subtrees and set VZ quota operations or + * reset the default ones. + * + * Called under vz_quota_sem (from quota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ + if (sb->dq_op != &vz_quota_operations) { + down(&sb->s_dquot.dqonoff_sem); + if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) { + up(&sb->s_dquot.dqonoff_sem); + return -EEXIST; + } + if (orig_dq_op == NULL && sb->dq_op != NULL) + orig_dq_op = sb->dq_op; + sb->dq_op = &vz_quota_operations; + if (orig_dq_cop == NULL && sb->s_qcop != NULL) + orig_dq_cop = sb->s_qcop; + /* XXX this may race with sys_quotactl */ +#ifdef CONFIG_VZ_QUOTA_UGID + sb->s_qcop = &vz_quotactl_operations; +#else + sb->s_qcop = NULL; +#endif + do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + /* + * To get quotaops.h call us we need to mark superblock + * as having quota. These flags mark the moment when + * our dq_op start to be called. + * + * The ordering of dq_op and s_dquot.flags assignment + * needs to be enforced, but other CPUs do not do rmb() + * between s_dquot.flags and dq_op accesses. + */ + wmb(); synchronize_sched(); + sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; + __module_get(THIS_MODULE); + up(&sb->s_dquot.dqonoff_sem); + } + /* protected by vz_quota_sem */ + __VZ_QUOTA_SBREF(sb)++; + return 0; +} + +/** + * quota_put_super - release superblock when one quota tree goes away + * + * Called under vz_quota_sem. 
+ */ +void vzquota_put_super(struct super_block *sb) +{ + int count; + + count = --__VZ_QUOTA_SBREF(sb); + if (count == 0) { + down(&sb->s_dquot.dqonoff_sem); + sb->s_dquot.flags = 0; + wmb(); synchronize_sched(); + sema_init(&sb->s_dquot.dqio_sem, 1); + sb->s_qcop = orig_dq_cop; + sb->dq_op = orig_dq_op; + inode_qmblk_lock(sb); + quota_gen_put(SB_QGEN(sb)); + SB_QGEN(sb) = NULL; + /* release qlnk's without qmblk */ + remove_inode_quota_links_list(&non_vzquota_inodes_lh, + sb, NULL); + /* + * Races with quota initialization: + * after this inode_qmblk_unlock all inode's generations are + * invalidated, quota_inode_qmblk checks superblock operations. + */ + inode_qmblk_unlock(sb); + /* + * Module refcounting: in theory, this is the best place + * to call module_put(THIS_MODULE). + * In reality, it can't be done because we can't be sure that + * other CPUs do not enter our code segment through dq_op + * cached long time ago. Quotaops interface isn't supposed to + * go into modules currently (that is, into unloadable + * modules). By omitting module_put, our module isn't + * unloadable. + */ + up(&sb->s_dquot.dqonoff_sem); + } +} + +#else + +struct vzquota_new_sop { + struct super_operations new_op; + const struct super_operations *old_op; +}; + +/** + * vzquota_shutdown_super - callback on umount + */ +void vzquota_shutdown_super(struct super_block *sb) +{ + struct vz_quota_master *qmblk; + struct vzquota_new_sop *sop; + + qmblk = __VZ_QUOTA_NOQUOTA(sb); + __VZ_QUOTA_NOQUOTA(sb) = NULL; + if (qmblk != NULL) + qmblk_put(qmblk); + sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); + sb->s_op = sop->old_op; + kfree(sop); + if (sb->s_op->put_super != NULL) + (*sb->s_op->put_super)(sb); +} + +/** + * vzquota_get_super - account for new a quoted tree under the superblock + * + * One superblock can have multiple directory subtrees with different VZ + * quotas. + * + * Called under vz_quota_sem (from vzquota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ + struct vz_quota_master *qnew; + struct vzquota_new_sop *sop; + int err; + + mutex_lock(&sb->s_dquot.dqonoff_mutex); + err = -EEXIST; + if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && + sb->dq_op != &vz_quota_operations) + goto out_up; + + /* + * This allocation code should be under sb->dq_op check below, but + * it doesn't really matter... + */ + if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { + qnew = vzquota_alloc_fake(); + if (qnew == NULL) + goto out_up; + __VZ_QUOTA_NOQUOTA(sb) = qnew; + } + + if (sb->dq_op != &vz_quota_operations) { + sop = kmalloc(sizeof(*sop), GFP_KERNEL); + if (sop == NULL) { + vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); + __VZ_QUOTA_NOQUOTA(sb) = NULL; + goto out_up; + } + memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); + sop->new_op.put_super = &vzquota_shutdown_super; + sop->old_op = sb->s_op; + sb->s_op = &sop->new_op; + + sb->dq_op = &vz_quota_operations; +#ifdef CONFIG_VZ_QUOTA_UGID + sb->s_qcop = &vz_quotactl_operations; +#else + sb->s_qcop = NULL; +#endif + do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); + + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + /* these 2 list heads are checked in sync_dquots() */ + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = + &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = + &vz_quota_empty_v2_format; + + /* + * To get quotaops.h to call us we need to mark superblock + * as having quota. 
These flags mark the moment when + * our dq_op start to be called. + * + * The ordering of dq_op and s_dquot.flags assignment + * needs to be enforced, but other CPUs do not do rmb() + * between s_dquot.flags and dq_op accesses. + */ + wmb(); synchronize_sched(); + sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; + } + err = 0; + +out_up: + mutex_unlock(&sb->s_dquot.dqonoff_mutex); + return err; +} + +/** + * vzquota_put_super - one quota tree less on this superblock + * + * Called under vz_quota_sem. + */ +void vzquota_put_super(struct super_block *sb) +{ + /* + * Even if this put is the last one, + * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop + * won't be called and the remaining qmblk references won't be put. + */ +} + +#endif + + +/* ---------------------------------------------------------------------- + * + * Helpers for inode -> qmblk link maintenance + * + * --------------------------------------------------------------------- */ + +#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) +#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) +#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) +extern struct inode_operations vfs_empty_iops; + +static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) +{ + struct vz_quota_master *qmblk; + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk == VZ_QUOTA_BAD) + return 1; + if (qmblk == __VZ_QUOTA_EMPTY) + return 0; + if (qmblk->dq_flags & VZDQ_NOACT) + /* not actual (invalidated) qmblk */ + return 0; + return 1; +} + +static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) +{ + return qlnk->qmblk == __VZ_QUOTA_EMPTY; +} + +static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk, + unsigned char origin) +{ + qlnk->origin[0] = qlnk->origin[1]; + qlnk->origin[1] = origin; +} + +static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) +{ + qlnk->qmblk = __VZ_QUOTA_EMPTY; + set_qlnk_origin(qlnk, VZ_QUOTAO_SETE); +} + +void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) +{ + memset(qlnk, 0, sizeof(*qlnk)); + INIT_LIST_HEAD(&qlnk->list); + vzquota_qlnk_set_empty(qlnk); + set_qlnk_origin(qlnk, VZ_QUOTAO_INIT); +} + +void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) +{ + might_sleep(); + if (vzquota_qlnk_is_empty(qlnk)) + return; +#if defined(CONFIG_VZ_QUOTA_UGID) + if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *quid, *qgid; + qmblk = qlnk->qmblk; + quid = qlnk->qugid[USRQUOTA]; + qgid = qlnk->qugid[GRPQUOTA]; + if (quid != NULL || qgid != NULL) { + down(&qmblk->dq_sem); + if (qgid != NULL) + vzquota_put_ugid(qmblk, qgid); + if (quid != NULL) + vzquota_put_ugid(qmblk, quid); + up(&qmblk->dq_sem); + } + } +#endif + if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) + qmblk_put(qlnk->qmblk); + set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR); +} + +/** + * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents + * @qlt: temporary + * @qli: inode's + * + * Locking is provided by the caller (depending on the context). + * After swap, @qli is inserted into the corresponding dq_ilink_list, + * @qlt list is reinitialized. 
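vzquota_qlnk_destroy above may sleep (it can take dq_sem to drop ugid references), so it must never run under the inode_qmblk spinlock; the surrounding code therefore swaps the link's contents into a local placeholder while the lock is held and destroys the detached copy only after unlocking. A user-space model of that detach-then-release idiom, with invented names:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct link {
		char *payload;		/* stands in for qmblk/qugid references */
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct link shared = { NULL };

	/* releasing the payload models a "may sleep" operation */
	static void link_destroy(struct link *l)
	{
		free(l->payload);
		l->payload = NULL;
	}

	static void drop_shared_link(void)
	{
		struct link tmp;

		pthread_mutex_lock(&lock);
		tmp = shared;			/* detach the contents under the lock */
		shared.payload = NULL;		/* leave an empty link behind */
		pthread_mutex_unlock(&lock);

		link_destroy(&tmp);		/* sleeping work runs outside the lock */
	}

	int main(void)
	{
		shared.payload = strdup("quota link");
		drop_shared_link();
		printf("shared is %s\n", shared.payload ? "set" : "empty");
		return 0;
	}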
+ */ +static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, + struct vz_quota_ilink *qli) +{ + struct vz_quota_master *qb; + struct vz_quota_ugid *qu; + int i; + + qb = qlt->qmblk; + qlt->qmblk = qli->qmblk; + qli->qmblk = qb; + list_del_init(&qli->list); + if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) + list_add(&qli->list, &qb->dq_ilink_list); + INIT_LIST_HEAD(&qlt->list); + set_qlnk_origin(qli, VZ_QUOTAO_SWAP); + + for (i = 0; i < MAXQUOTAS; i++) { + qu = qlt->qugid[i]; + qlt->qugid[i] = qli->qugid[i]; + qli->qugid[i] = qu; + } +} + +/** + * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + */ +static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, + struct inode *inode) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + if (qlnk->qmblk == VZ_QUOTA_BAD) { + vzquota_qlnk_set_empty(qlnk); + set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK); + return 0; + } + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + return 1; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content + * + * Similar to vzquota_qlnk_reinit_locked, called under different locks. + */ +static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + /* may be optimized if qlnk->qugid all NULLs */ + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); + return 1; +} +#endif + +/** + * vzquota_qlnk_fill - fill vz_quota_ilink content + * @qlnk: vz_quota_ilink to fill + * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) + * @qmblk: qmblk to which this @qlnk will belong + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + * @qlnk is expected to be empty. + */ +static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (qmblk != VZ_QUOTA_BAD) + qmblk_get(qmblk); + qlnk->qmblk = qmblk; + +#if defined(CONFIG_VZ_QUOTA_UGID) + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + (qmblk->dq_flags & VZDQUG_ON)) { + struct vz_quota_ugid *quid, *qgid; + + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + + down(&qmblk->dq_sem); + quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); + qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); + up(&qmblk->dq_sem); + + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + qlnk->qugid[USRQUOTA] = quid; + qlnk->qugid[GRPQUOTA] = qgid; + return 1; + } +#endif + + return 0; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid + * + * This function is a helper for vzquota_transfer, and differs from + * vzquota_qlnk_fill only by locking. 
+ */ +static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct iattr *iattr, + int mask, + struct vz_quota_master *qmblk) +{ + qmblk_get(qmblk); + qlnk->qmblk = qmblk; + + if (mask) { + struct vz_quota_ugid *quid, *qgid; + + quid = qgid = NULL; /* to make gcc happy */ + if (!(mask & (1 << USRQUOTA))) + quid = vzquota_get_ugid(INODE_QLNK(inode)-> + qugid[USRQUOTA]); + if (!(mask & (1 << GRPQUOTA))) + qgid = vzquota_get_ugid(INODE_QLNK(inode)-> + qugid[GRPQUOTA]); + + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + + down(&qmblk->dq_sem); + if (mask & (1 << USRQUOTA)) + quid = __vzquota_find_ugid(qmblk, iattr->ia_uid, + USRQUOTA, 0); + if (mask & (1 << GRPQUOTA)) + qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid, + GRPQUOTA, 0); + up(&qmblk->dq_sem); + + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); + qlnk->qugid[USRQUOTA] = quid; + qlnk->qugid[GRPQUOTA] = qgid; + return 1; + } + + return 0; +} +#endif + +/** + * __vzquota_inode_init - make sure inode's qlnk is initialized + * + * May be called if qlnk is already initialized, detects this situation itself. + * Called under inode_qmblk_lock. + */ +static void __vzquota_inode_init(struct inode *inode, unsigned char origin) +{ + if (inode->i_dquot[USRQUOTA] == NODQUOT) { + vzquota_qlnk_init(INODE_QLNK(inode)); + inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT; + } + set_qlnk_origin(INODE_QLNK(inode), origin); +} + +/** + * vzquota_inode_drop - destroy VZ quota information in the inode + * + * Inode must not be externally accessible or dirty. + */ +static void vzquota_inode_drop(struct inode *inode) +{ + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL); + inode->i_dquot[USRQUOTA] = NODQUOT; + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); +} + +/** + * vzquota_inode_qmblk_set - initialize inode's qlnk + * @inode: inode to be initialized + * @qmblk: quota master block to which this inode should belong (may be BAD) + * @qlnk: placeholder to store data to resolve locking issues + * + * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise. + * Called under dcache_lock and inode_qmblk locks. + * @qlnk will be destroyed in the caller chain. + * + * It is not mandatory to restart parent checks since quota on/off currently + * shrinks dentry tree and checks that there are not outside references. + * But if at some time that shink is removed, restarts will be required. + * Additionally, the restarts prevent inconsistencies if the dentry tree + * changes (inode is moved). This is not a big deal, but anyway... 
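+ * A NULL @qmblk is treated as a caller bug: it is reported with a stack + * dump and downgraded to VZ_QUOTA_BAD, so the inode still gets linked + * somewhere.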
+ */ +static int vzquota_inode_qmblk_set(struct inode *inode, + struct vz_quota_master *qmblk, + struct vz_quota_ilink *qlnk) +{ + if (qmblk == NULL) { + printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, " + "dev %s, inode %lu, fs %s\n", + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "current %d (%s), VE %d\n", + current->pid, current->comm, + VEID(get_exec_env())); + dump_stack(); + qmblk = VZ_QUOTA_BAD; + } + while (1) { + if (vzquota_qlnk_is_empty(qlnk) && + vzquota_qlnk_fill(qlnk, inode, qmblk)) + return 1; + if (qlnk->qmblk == qmblk) + break; + if (vzquota_qlnk_reinit_locked(qlnk, inode)) + return 1; + } + vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * vzquota_inode_qmblk (inode -> qmblk lookup) parts + * + * --------------------------------------------------------------------- */ + +static int vzquota_dparents_check_attach(struct inode *inode) +{ + if (!list_empty(&inode->i_dentry)) + return 0; + printk(KERN_ERR "VZDQ: no parent for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + return -1; +} + +static struct inode *vzquota_dparents_check_actual(struct inode *inode) +{ + struct dentry *de; + + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + /* first access to parent, make sure its qlnk initialized */ + __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); + if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) + return de->d_parent->d_inode; + } + return NULL; +} + +static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) +{ + struct dentry *de; + struct vz_quota_master *qmblk; + + qmblk = NULL; + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + if (qmblk == NULL) { + qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; + continue; + } + if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { + printk(KERN_WARNING "VZDQ: multiple quotas for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + break; + } + } + if (qmblk == NULL) { + printk(KERN_WARNING "VZDQ: not attached to tree, " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + } + return qmblk; +} + +static void vzquota_dbranch_actualize(struct inode *inode, + struct inode *refinode) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + +start: + if (inode == inode->i_sb->s_root->d_inode) { + /* filesystem root */ + atomic_inc(&inode->i_count); + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); + goto out; + } + + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + inode = pinode; + goto start; + } + } + + atomic_inc(&inode->i_count); + while (1) { + if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ + break; + /* + * Need to check parents again if we have slept inside + * vzquota_inode_qmblk_set() in the loop. 
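+ * (Sleeping happens, for example, when vzquota_qlnk_fill() drops the + * locks to look up ugid entries.)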
+ * If the state of parents is different, just return and repeat + * the actualizing process again from the inode passed to + * vzquota_inode_qmblk_recalc(). + */ + if (!vzquota_dparents_check_attach(inode)) { + if (vzquota_dparents_check_actual(inode) != NULL) + break; + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT); + break; + } + } + +out: + spin_unlock(&dcache_lock); + inode_qmblk_unlock(refinode->i_sb); + vzquota_qlnk_destroy(&qlnk); + iput(inode); + inode_qmblk_lock(refinode->i_sb); + spin_lock(&dcache_lock); +} + +static void vzquota_dtree_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + + if (inode == inode->i_sb->s_root->d_inode) { + /* filesystem root */ + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); + return; + } + +start: + if (VZ_QUOTA_IS_ACTUAL(inode)) + return; + /* + * Here qmblk is (re-)initialized for all ancestors. + * This is not a very efficient procedure, but it guarantees that + * the quota tree is consistent (that is, the inode doesn't have two + * ancestors with different qmblk). + */ + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + vzquota_dbranch_actualize(pinode, inode); + goto start; + } + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE); +} + +static void vzquota_det_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *parent; + struct vz_quota_master *qmblk; + char *msg; + int cnt; + time_t timeout; + + cnt = 0; + parent = NULL; +start: + /* + * qmblk of detached inodes shouldn't be considered as not actual. + * They are not in any dentry tree, so quota on/off shouldn't affect + * them. + */ + if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) + return; + + timeout = 3; + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + /* + * Scenario: + * open + * unlink + * quotaon + * generic_delete_inode + * + * This is the first time vzquota sees inode. inode is outside of + * vzquota area of interest, otherwise quotaon would have got -EBUSY + * due to shrink_dcache_parent(). + * inode is almost completely destroyed, so don't intervene. + * + * dev@: + * However, there is a small race here... + * dput() first removes itself from all the lists, + * so shrink_dcache_parent() can succeed while dentry_iput is not + * done yet. 
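+ * To keep that window from triggering false alarms, the fail path below + * stays silent and simply goes to 'set' while the per-superblock quota + * timestamp (__VZ_QUOTA_TSTAMP) is still younger than the timeout.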
+ */ + if (inode->i_state & I_FREEING) + goto set; + + msg = "detached inode not in creation"; + if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) + goto fail; + qmblk = VZ_QUOTA_BAD; + msg = "unexpected creation context"; + if (!vzquota_cur_qmblk_check()) + goto fail; + timeout = 0; + parent = vzquota_cur_qmblk_fetch(); + msg = "uninitialized parent"; + if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) + goto fail; + msg = "parent not in tree"; + if (list_empty(&parent->i_dentry)) + goto fail; + msg = "parent has 0 refcount"; + if (!atomic_read(&parent->i_count)) + goto fail; + msg = "parent has different sb"; + if (parent->i_sb != inode->i_sb) + goto fail; + if (!VZ_QUOTA_IS_ACTUAL(parent)) { + vzquota_dbranch_actualize(parent, inode); + goto start; + } + + qmblk = INODE_QLNK(parent)->qmblk; +set: + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET); + return; + +fail: + { + struct timeval tv, tvo; + do_gettimeofday(&tv); + memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); + tv.tv_sec -= tvo.tv_sec; + if (tv.tv_usec < tvo.tv_usec) { + tv.tv_sec--; + tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; + } else + tv.tv_usec -= tvo.tv_usec; + if (tv.tv_sec < timeout) + goto set; + printk(KERN_ERR "VZDQ: %s, orig {%u, %u}," + " dev %s, inode %lu, fs %s\n", + msg, + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count)); + printk(KERN_ERR "i_mode %o, ", inode->i_mode); + printk(KERN_ERR "i_state %lx, ", inode->i_state); + printk(KERN_ERR "i_flags %x\n", inode->i_flags); + printk(KERN_ERR "i_op %p, vfs_empty_iops %p, " + "i_fop %p, i_mapping %p\n", + inode->i_op, &vfs_empty_iops, + inode->i_fop, inode->i_mapping); + if (!cnt++) { + printk(KERN_ERR "current %d (%s), VE %d," + " time %ld.%06ld\n", + current->pid, current->comm, + VEID(get_exec_env()), + tv.tv_sec, (long)tv.tv_usec); + dump_stack(); + } + if (parent != NULL) + printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", + inode->i_ino, parent->i_ino); + } + goto set; +} + +static void vzquota_inode_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_dtree_qmblk_recalc(inode, qlnk); + else + vzquota_det_qmblk_recalc(inode, qlnk); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_qmblk - obtain inode's qmblk + * + * Returns qmblk with refcounter taken, %NULL if not under + * VZ quota or %VZ_QUOTA_BAD. + * + * FIXME: This function should be removed when vzquota_find_qmblk / + * get_quota_root / vzquota_dstat code is cleaned up. 
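+ * The reference returned here must be dropped with qmblk_put() by the + * caller (see vzquota_dstat() for a typical example).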
+ */ +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + might_sleep(); + + if (inode->i_sb->dq_op != &vz_quota_operations) + return NULL; +#if defined(VZ_QUOTA_UNLOAD) +#error Make sure qmblk does not disappear +#endif + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) + qmblk_get(qmblk); + else + qmblk = NULL; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); + return qmblk; +} + +/** + * vzquota_find_qmblk - helper to emulate quota on virtual filesystems + * + * This function finds a quota master block corresponding to the root of + * a virtual filesystem. + * Returns a quota master block with reference taken, or %NULL if not under + * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation + * operations will fail). + * + * Note: this function uses vzquota_inode_qmblk(). + * The latter is a rather confusing function: it returns qmblk that used to be + * on the inode some time ago (without guarantee that it still has any + * relations to the inode). So, vzquota_find_qmblk() leaves it up to the + * caller to think whether the inode could have changed its qmblk and what to + * do in that case. + * Currently, the callers appear to not care :( + */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) +{ + struct inode *qrinode; + struct vz_quota_master *qmblk; + + qmblk = NULL; + qrinode = NULL; + if (sb->s_op->get_quota_root != NULL) + qrinode = sb->s_op->get_quota_root(sb); + if (qrinode != NULL) + qmblk = vzquota_inode_qmblk(qrinode); + return qmblk; +} + +/* ---------------------------------------------------------------------- + * + * Calls from quota operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_inode_init_call - call from DQUOT_INIT + */ +void vzquota_inode_init_call(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* initializes inode's quota inside */ + qmblk = vzquota_inode_data(inode, &data); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + vzquota_data_unlock(inode, &data); + + /* + * The check is needed for repeated new_inode() calls from a single + * ext3 call like create or mkdir in case of -ENOSPC. + */ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_cur_qmblk_set(inode); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_drop_call - call from DQUOT_DROP + */ +void vzquota_inode_drop_call(struct inode *inode) +{ + vzquota_inode_drop(inode); +} + +/** + * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs + * @inode: the inode + * @data: storage space + * + * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. + * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: + * qmblk in inode's qlnk is the same as returned, + * ugid pointers inside inode's qlnk are valid, + * some locks are taken (and should be released by vzquota_data_unlock). + * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. 
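+ * + * A rough usage sketch (this is how vzquota_inode_init_call() above uses it): + * qmblk = vzquota_inode_data(inode, &data); + * if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + * vzquota_data_unlock(inode, &data);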
+ */ +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *data) +{ + struct vz_quota_master *qmblk; + + might_sleep(); + + vzquota_qlnk_init(&data->qlnk); + inode_qmblk_lock(inode->i_sb); + if (unlikely(inode->i_flags & S_NOQUOTA)) { + inode_qmblk_unlock(inode->i_sb); + return NULL; + } + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &data->qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { + /* + * Note that in the current implementation, + * inode_qmblk_lock can theoretically be dropped here. + * This place is serialized with quota_off because + * quota_off fails when there are extra dentry + * references and syncs inodes before removing quota + * information from them. + * However, quota usage information should stop being + * updated immediately after vzquota_off. + */ + qmblk_data_write_lock(qmblk); + } else { + inode_qmblk_unlock(inode->i_sb); + qmblk = NULL; + } + } else { + inode_qmblk_unlock(inode->i_sb); + } + return qmblk; +} + +void vzquota_data_unlock(struct inode *inode, + struct vz_quota_datast *data) +{ + qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&data->qlnk); +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_inode_transfer_call - call from vzquota_transfer + */ +int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + struct vz_quota_ilink qlnew; + int mask; + int ret; + + might_sleep(); + vzquota_qlnk_init(&qlnew); +start: + qmblk = vzquota_inode_data(inode, &data); + ret = NO_QUOTA; + if (qmblk == VZ_QUOTA_BAD) + goto out_destr; + ret = QUOTA_OK; + if (qmblk == NULL) + goto out_destr; + qmblk_get(qmblk); + + ret = QUOTA_OK; + if (!(qmblk->dq_flags & VZDQUG_ON)) + /* no ugid quotas */ + goto out_unlock; + + mask = 0; + if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) + mask |= 1 << USRQUOTA; + if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) + mask |= 1 << GRPQUOTA; + while (1) { + if (vzquota_qlnk_is_empty(&qlnew) && + vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) + break; + if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && + qlnew.qmblk == qmblk) + goto finish; + if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) + break; + } + + /* prepare for restart */ + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); + goto start; + +finish: + /* all references obtained successfully */ + ret = vzquota_transfer_usage(inode, mask, &qlnew); + if (!ret) { + vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS); + } +out_unlock: + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); +out_destr: + vzquota_qlnk_destroy(&qlnew); + return ret; +} +#endif + +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk1, qlnk2, qlnk3; + int c, ret; + + if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) + return -1; + + might_sleep(); + + vzquota_qlnk_init(&qlnk1); + vzquota_qlnk_init(&qlnk2); + vzquota_qlnk_init(&qlnk3); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); + __vzquota_inode_init(new_dir, 
VZ_QUOTAO_INICAL); + + do { + c = 0; + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) { + vzquota_inode_qmblk_recalc(inode, &qlnk1); + c++; + } + if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || + !VZ_QUOTA_IS_ACTUAL(new_dir)) { + vzquota_inode_qmblk_recalc(new_dir, &qlnk2); + c++; + } + } while (c); + + ret = 0; + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != INODE_QLNK(new_dir)->qmblk) { + ret = -1; + while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) || + !VZ_QUOTA_IS_ACTUAL(old_dir)) + vzquota_inode_qmblk_recalc(old_dir, &qlnk3); + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + qmblk->dq_root_path.dentry->d_inode == inode && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, + inode->i_sb) && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, + inode->i_sb)) + /* quota root rename is allowed */ + ret = 0; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk3); + vzquota_qlnk_destroy(&qlnk2); + vzquota_qlnk_destroy(&qlnk1); + return ret; +} + +/* + * Scan parent subdirs and find busy dentries names/path + * @parent: parent dentry + * @buf: buffer to store path. + */ +static void vzdquota_read_busy_dentries(struct path *parent, + char *buf, int buflen) +{ + struct dentry *this_parent = parent->dentry; + struct list_head *next; + char *res, *end, *start; + struct path root, path; + int len; + + if (!buf || buflen <= 0) + return; + + path.mnt = parent->mnt; + /* From d_path() ... */ + read_lock(&current->fs->lock); + path_get(&current->fs->root); + root = current->fs->root; + read_unlock(&current->fs->lock); + + spin_lock(&dcache_lock); + + end = buf + buflen; + start = buf; +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry; + int subdirs; + + dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + subdirs = !list_empty(&dentry->d_subdirs); + + if (atomic_read(&dentry->d_count) && !subdirs) { + if (!buflen) + goto out; + /* + * Note: __d_path will store filename at the + * end of buf. + */ + path.dentry = dentry; + res = __d_path(&path, &root, buf, buflen); + /* Exit if name is too long */ + if (IS_ERR(res)) + goto out; + + /* + * Move the string obtained by __d_path, + * behind the last dentry path in buf. + */ + len = end - res; + BUG_ON(len <= 0); + + memmove(buf, res, len); + + /* Trick: replace \0 by \n */ + if (buf != start) + *(char *)(buf - 1) = '\n'; + + buf += len; + buflen -= len; + } + + /* + * Descend a level if the d_subdirs list is non-empty. + */ + if (subdirs) { + this_parent = dentry; + goto repeat; + } + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent->dentry) { + next = this_parent->d_u.d_child.next; + this_parent = this_parent->d_parent; + goto resume; + } +out: + /* From d_path() ... */ + spin_unlock(&dcache_lock); + path_put(&root); +} + +/* ---------------------------------------------------------------------- + * + * qmblk-related parts of on/off operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_check_dtree - check dentry tree if quota on/off is allowed + * + * This function doesn't allow quota to be turned on/off if some dentries in + * the tree have external references.
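+ * (External references here are d_count references beyond what the tree + * structure itself accounts for, such as an open file or a process + * working directory inside the tree.)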
+ * In addition to technical reasons, it enforces user-space correctness: + * current usage (taken from or reported to the user space) can be meaningful + * and accurate only if the tree is not being modified. + * Side effect: additional vfsmount structures referencing the tree (bind + * mounts of tree nodes to some other places) are not allowed at on/off time. + * + * Store busy dentries path to the buf (if passed) in case of vzquota_off + * ioctl fail. + */ +int vzquota_check_dtree(struct vz_quota_master *qmblk, int off, + char *buf, int buflen) +{ + struct dentry *dentry; + int err, count; + + err = -EBUSY; + dentry = qmblk->dq_root_path.dentry; + + if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) + goto unhashed; + + /* attempt to shrink */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + inode_qmblk_unlock(dentry->d_sb); + shrink_dcache_parent(dentry); + inode_qmblk_lock(dentry->d_sb); + spin_lock(&dcache_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + vzdquota_read_busy_dentries(&qmblk->dq_root_path, + buf, buflen); + spin_lock(&dcache_lock); + goto out; + } + + count = 1; + if (dentry == dentry->d_sb->s_root) + count += 2; /* sb and mnt refs */ + if (atomic_read(&dentry->d_count) < count) { + printk(KERN_ERR "%s: too small count %d vs %d.\n", + __FUNCTION__, + atomic_read(&dentry->d_count), count); + goto out; + } + if (atomic_read(&dentry->d_count) > count) + goto out; + } + + err = 0; +out: + return err; + +unhashed: + /* + * Quota root is removed. + * Allow to turn quota off, but not on. + */ + if (off) + err = 0; + goto out; +} + +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *ubuf) +{ + struct vz_quota_ilink qlnk; + struct vz_quota_master *qold, *qnew; + int err; + char *buf; + + buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL; + + might_sleep(); + + qold = NULL; + qnew = vzquota_alloc_fake(); + if (qnew == NULL) { + free_page((unsigned long)buf); + return -ENOMEM; + } + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + spin_lock(&dcache_lock); + while (1) { + err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE); + if (err) + break; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) + break; + } + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON); + spin_unlock(&dcache_lock); + + if (!err) { + qold = __VZ_QUOTA_NOQUOTA(sb); + qold->dq_flags |= VZDQ_NOACT; + __VZ_QUOTA_NOQUOTA(sb) = qnew; + } + + inode_qmblk_unlock(sb); + vzquota_qlnk_destroy(&qlnk); + if (qold != NULL) + qmblk_put(qold); + + if (buf) { + if (copy_to_user(ubuf, buf, PAGE_SIZE)) + ; + free_page((unsigned long)buf); + } + return err; +} + +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *ubuf, int force) +{ + int ret; + char *buf; + + buf = (ubuf != NULL) ? 
(char *)__get_free_page(GFP_KERNEL) : NULL; + + ret = 0; + inode_qmblk_lock(sb); + + spin_lock(&dcache_lock); + if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force) + ret = -EBUSY; + spin_unlock(&dcache_lock); + + if (!ret) + qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; + inode_qmblk_unlock(sb); + + if (buf) { + if (copy_to_user(ubuf, buf, PAGE_SIZE)) + ; + free_page((unsigned long)buf); + } + return ret; +} + + +/* ---------------------------------------------------------------------- + * + * External interfaces + * + * ---------------------------------------------------------------------*/ + +static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_QUOTA_NEW_CTL: { + struct vzctl_quotactl qb; + + err = -EFAULT; + if (copy_from_user(&qb, (void __user *)arg, sizeof(qb))) + break; + err = do_vzquotactl(qb.cmd, qb.quota_id, + qb.qstat, qb.ve_root, 0); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_QUOTA_UGID_CTL: { + struct vzctl_quotaugidctl qub; + + err = -EFAULT; + if (copy_from_user(&qub, (void __user *)arg, sizeof(qub))) + break; + err = do_vzquotaugidctl(qub.cmd, qub.quota_id, + qub.ugid_index, qub.ugid_size, qub.addr, 0); + break; + } +#endif + default: + err = -ENOTTY; + } + return err; +} + +#ifdef CONFIG_COMPAT +static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_COMPAT_QUOTA_CTL: { + struct compat_vzctl_quotactl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = do_vzquotactl(cs.cmd, cs.quota_id, + compat_ptr(cs.qstat), + compat_ptr(cs.ve_root), 1); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_COMPAT_QUOTA_UGID_CTL: { + struct compat_vzctl_quotaugidctl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index, + cs.ugid_size, compat_ptr(cs.addr), 1); + break; + } +#endif + default: + err = -ENOIOCTLCMD; + } + return err; +} +#endif + +static struct vzioctlinfo vzdqcalls = { + .type = VZDQCTLTYPE, + .ioctl = vzquota_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzquota_ioctl, +#endif + .owner = THIS_MODULE, +}; + +/** + * vzquota_dstat - get quota usage info for virtual superblock + */ +static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) +{ + struct vz_quota_master *qmblk; + + qmblk = vzquota_find_qmblk(super); + if (qmblk == NULL) + return -ENOENT; + if (qmblk == VZ_QUOTA_BAD) { + memset(qstat, 0, sizeof(*qstat)); + return 0; + } + + qmblk_data_read_lock(qmblk); + memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); + qmblk_data_read_unlock(qmblk); + qmblk_put(qmblk); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * Init/exit helpers + * + * ---------------------------------------------------------------------*/ + +static int vzquota_cache_init(void) +{ + int i; + + vzquota_cachep = kmem_cache_create("vz_quota_master", + sizeof(struct vz_quota_master), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vzquota_cachep == NULL) { + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + goto nomem2; + } + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + INIT_LIST_HEAD(&vzquota_hash_table[i]); + + return 0; + +nomem2: + return -ENOMEM; +} + +static void vzquota_cache_release(void) +{ + int i; + + /* sanity check */ + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + if (!list_empty(&vzquota_hash_table[i])) + BUG(); + + /* 
release caches */ + kmem_cache_destroy(vzquota_cachep); + vzquota_cachep = NULL; +} + +static int quota_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int err) +{ + struct virt_info_quota *viq; + struct super_block *sb; + + viq = (struct virt_info_quota *)data; + switch (n) { + case VIRTINFO_QUOTA_ON: + err = NOTIFY_BAD; + if (!try_module_get(THIS_MODULE)) + break; + sb = viq->super; + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_OFF: + module_put(THIS_MODULE); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_GETSTAT: + err = NOTIFY_BAD; + if (vzquota_dstat(viq->super, viq->qstat)) + break; + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_DISABLE: + err = NOTIFY_OK; + vzquota_inode_off((struct inode *)data); + break; + } + return err; +} + +struct vnotifier_block quota_notifier_block = { + .notifier_call = quota_notifier_call, + .priority = INT_MAX, +}; + +/* ---------------------------------------------------------------------- + * + * Init/exit procedures + * + * ---------------------------------------------------------------------*/ + +static int __init vzquota_init(void) +{ + int err; + + if ((err = vzquota_cache_init()) != 0) + goto out_cache; + + if ((err = vzquota_proc_init()) != 0) + goto out_proc; + +#ifdef CONFIG_VZ_QUOTA_UGID + if ((err = vzquota_ugid_init()) != 0) + goto out_ugid; +#endif + + init_MUTEX(&vz_quota_sem); + vzioctl_register(&vzdqcalls); + virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block); +#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) + vzaquota_init(); +#endif + + return 0; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_ugid: + vzquota_proc_release(); +#endif +out_proc: + vzquota_cache_release(); +out_cache: + return err; +} + +#if defined(VZ_QUOTA_UNLOAD) +static void __exit vzquota_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block); + vzioctl_unregister(&vzdqcalls); +#ifdef CONFIG_VZ_QUOTA_UGID +#ifdef CONFIG_PROC_FS + vzaquota_fini(); +#endif + vzquota_ugid_release(); +#endif + vzquota_proc_release(); + vzquota_cache_release(); +} +#endif + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Disk Quota"); +MODULE_LICENSE("GPL v2"); + +module_init(vzquota_init) +#if defined(VZ_QUOTA_UNLOAD) +module_exit(vzquota_release) +#endif diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h index 7be4733..a68dc33 100644 --- a/include/asm-x86/elf.h +++ b/include/asm-x86/elf.h @@ -279,7 +279,7 @@ struct task_struct; #define ARCH_DLINFO_IA32(vdso_enabled) \ do { \ - if (vdso_enabled) { \ + if (vdso_enabled && sysctl_at_vsyscall) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -324,9 +324,11 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int executable_stack, + unsigned long map_address); -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +extern int syscall32_setup_pages(struct linux_binprm *, int exstack, + unsigned long map_address); #define compat_arch_setup_additional_pages syscall32_setup_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); diff --git a/include/asm-x86/mman.h b/include/asm-x86/mman.h index 90bc410..e370cc3 100644 --- a/include/asm-x86/mman.h +++ b/include/asm-x86/mman.h @@ -13,6 +13,7 @@
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_EXECPRIO 0x40000 /* soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 21f8d02..34c101d 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -29,6 +29,10 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); +typedef int (*nmi_callback_t)(struct pt_regs *regs, int cpu); +void set_nmi_ipi_callback(nmi_callback_t callback); +void unset_nmi_ipi_callback(void); + extern void setup_apic_nmi_watchdog(void *); extern void stop_apic_nmi_watchdog(void *); extern void disable_timer_nmi_watchdog(void); diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index d63ea43..e80d924 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -68,7 +68,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #if PAGETABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -98,7 +98,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 4df3e2f..42aaa5b 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -896,8 +896,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET 0xc0000000 #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ IA32_PAGE_OFFSET : TASK_SIZE64) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index da0a675..e9f7812 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -91,6 +91,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ +#define TIF_RESUME 29 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -112,6 +113,7 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +#define _TIF_RESUME (1< +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. 
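+ * For example, with 8-byte longs it evaluates to (1UL << 63) - 1, + * i.e. 0x7fffffffffffffff, which is LONG_MAX on such an architecture.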
+ */ +#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) + + +/* + * Resource management structures + * Serialization issues: + * beancounter list management is protected via ub_hash_lock + * task pointers are set only for current task and only once + * refcount is managed atomically + * value and limit comparison and change are protected by per-ub spinlock + */ + +struct page_beancounter; +struct task_beancounter; +struct sock_beancounter; + +struct page_private { + unsigned long ubp_unused_privvmpages; + unsigned long ubp_tmpfs_respages; + unsigned long ubp_swap_pages; + unsigned long long ubp_held_pages; +}; + +struct sock_private { + unsigned long ubp_rmem_thres; + unsigned long ubp_wmem_pressure; + unsigned long ubp_maxadvmss; + unsigned long ubp_rmem_pressure; + int ubp_tw_count; +#define UB_RMEM_EXPAND 0 +#define UB_RMEM_KEEP 1 +#define UB_RMEM_SHRINK 2 + struct list_head ubp_other_socks; + struct list_head ubp_tcp_socks; + atomic_t ubp_orphan_count; +}; + +struct ub_percpu_struct { + unsigned long unmap; + unsigned long swapin; +#ifdef CONFIG_BC_IO_ACCOUNTING + unsigned long long bytes_wrote; + unsigned long long bytes_read; + unsigned long long bytes_cancelled; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + long pages_charged; + long vmalloc_charged; + long pbcs; +#endif + unsigned long sync; + unsigned long sync_done; + + unsigned long fsync; + unsigned long fsync_done; + + unsigned long fdsync; + unsigned long fdsync_done; + + unsigned long frsync; + unsigned long frsync_done; + + unsigned long write; + unsigned long read; + unsigned long long wchar; + unsigned long long rchar; +}; + +struct user_beancounter +{ + unsigned long ub_magic; + atomic_t ub_refcount; + struct list_head ub_list; + struct hlist_node ub_hash; + + union { + struct rcu_head rcu; + struct execute_work cleanup; + }; + + spinlock_t ub_lock; + uid_t ub_uid; + + struct ub_rate_info ub_limit_rl; + int ub_oom_noproc; + + struct page_private ppriv; +#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages +#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages +#define ub_swap_pages ppriv.ubp_swap_pages +#define ub_held_pages ppriv.ubp_held_pages + struct sock_private spriv; +#define ub_rmem_thres spriv.ubp_rmem_thres +#define ub_maxadvmss spriv.ubp_maxadvmss +#define ub_rmem_pressure spriv.ubp_rmem_pressure +#define ub_wmem_pressure spriv.ubp_wmem_pressure +#define ub_tcp_sk_list spriv.ubp_tcp_socks +#define ub_other_sk_list spriv.ubp_other_socks +#define ub_orphan_count spriv.ubp_orphan_count +#define ub_tw_count spriv.ubp_tw_count + struct ub_iopriv iopriv; + + struct user_beancounter *parent; + void *private_data; + unsigned long ub_aflags; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + + /* resources statistic and settings */ + struct ubparm ub_parms[UB_RESOURCES]; + /* resources statistic for last interval */ + struct ubparm ub_store[UB_RESOURCES]; + + struct ub_percpu_struct *ub_percpu; +#ifdef CONFIG_BC_IO_ACCOUNTING + /* these are protected with pb_lock */ + unsigned long long bytes_wrote; + unsigned long long bytes_dirtied; + unsigned long long bytes_dirty_missed; + unsigned long io_pb_held; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + struct list_head ub_cclist; +#endif +}; + +enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; + +#define UB_AFLAG_NOTIF_PAGEIN 0 + +static inline +struct user_beancounter *top_beancounter(struct user_beancounter *ub) +{ + while (ub->parent != NULL) + ub = ub->parent; + return ub; +} + +static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) 
+{ + return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; +} + +static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) +{ + return (ub->ub_parms[resource].held > + ((ub->ub_parms[resource].barrier) >> 1)); +} + +static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3); +} + +static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024; +} + +#ifndef CONFIG_BEANCOUNTERS + +#define ub_percpu_add(ub, f, v) do { } while (0) +#define ub_percpu_sub(ub, f, v) do { } while (0) +#define ub_percpu_inc(ub, f) do { } while (0) +#define ub_percpu_dec(ub, f) do { } while (0) + +#define mm_ub(mm) (NULL) + +extern inline struct user_beancounter *get_beancounter_byuid + (uid_t uid, int create) { return NULL; } +extern inline struct user_beancounter *get_beancounter + (struct user_beancounter *ub) { return NULL; } +extern inline void put_beancounter(struct user_beancounter *ub) { } + +static inline void ub_init_late(void) { }; +static inline void ub_init_early(void) { }; + +static inline int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, + enum ub_severity strict) { return 0; } +static inline void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) { } + +#else /* CONFIG_BEANCOUNTERS */ + +#define ub_percpu_add(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) + +#define ub_percpu_sub(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1) + +#define mm_ub(mm) ((mm)->mm_ub) +/* + * Charge/uncharge operations + */ + +extern int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict); + +extern void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val); + +extern void put_beancounter_safe(struct user_beancounter *ub); +extern void __put_beancounter(struct user_beancounter *ub); + +extern void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held); + +extern const char *ub_rnames[]; +/* + * Put a beancounter reference + */ + +static inline void put_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return; + + /* FIXME - optimize not to disable interrupts and make call */ + __put_beancounter(ub); +} + +/* fast put, refcount can't reach zero */ +static inline void __put_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_sub(n, &ub->ub_refcount); +} + +static inline void put_beancounter_batch(struct user_beancounter *ub, int n) +{ + if (n > 1) + __put_beancounter_batch(ub, n - 1); + __put_beancounter(ub); +} + +/* + * Create a new beancounter reference + */ +extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); + +static inline +struct user_beancounter *get_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return NULL; + + atomic_inc(&ub->ub_refcount); + return ub; +} + +static inline +struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub) +{ + return atomic_inc_not_zero(&ub->ub_refcount) 
? ub : NULL; +} + +static inline void get_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_add(n, &ub->ub_refcount); +} + +extern struct user_beancounter *get_subbeancounter_byid( + struct user_beancounter *, + int id, int create); + +extern void ub_init_late(void); +extern void ub_init_early(void); + +extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size); + +/* + * Resource charging + * Change user's account and compare against limits + */ + +static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) +{ + if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) + ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; + if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) + ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; +} + +int charge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val, enum ub_severity strict); +void uncharge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val); +void __charge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); +void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); + +static inline void charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __charge_beancounter_notop(ub, resource, val); +} + +static inline void uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __uncharge_beancounter_notop(ub, resource, val); +} + +#endif /* CONFIG_BEANCOUNTERS */ + +#ifndef CONFIG_BC_RSS_ACCOUNTING +static inline void ub_ini_pbc(void) { } +#else +extern void ub_init_pbc(void); +#endif +#endif /* __KERNEL__ */ +#endif /* _LINUX_BEANCOUNTER_H */ diff --git a/include/bc/dcache.h b/include/bc/dcache.h new file mode 100644 index 0000000..5ebefff --- /dev/null +++ b/include/bc/dcache.h @@ -0,0 +1,47 @@ +/* + * include/bc/dcache.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DCACHE_H_ +#define __BC_DCACHE_H_ + +#include + +/* + * UB_DCACHESIZE accounting + */ + +struct dentry_beancounter +{ + /* + * d_inuse = + * + + * + * + * d_inuse == -1 means that dentry is unused + * state change -1 => 0 causes charge + * state change 0 => -1 causes uncharge + */ + atomic_t d_inuse; + /* charged size, including name length if name is not inline */ + unsigned long d_ubsize; + struct user_beancounter *d_ub; +}; + +#ifdef CONFIG_BEANCOUNTERS +#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) +#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) +#define INUSE_INIT 0 + +extern int ub_dentry_on; +#else +#define ub_dget_testone(d) (0) +#define ub_dput_testzero(d) (0) +#endif +#endif diff --git a/include/bc/dcache_op.h b/include/bc/dcache_op.h new file mode 100644 index 0000000..23306e9 --- /dev/null +++ b/include/bc/dcache_op.h @@ -0,0 +1,102 @@ +/* + * include/bc/dcache_op.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __BC_DCACHE_OP_H_ +#define __BC_DCACHE_OP_H_ + +struct dentry; + +#ifdef CONFIG_BEANCOUNTERS + +#include +#include +#include + +extern int ub_dentry_alloc_barrier; +extern spinlock_t dcache_lock; + +static inline int ub_dentry_alloc(struct dentry *d) +{ + extern int __ub_dentry_alloc(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_alloc(d); +} + +static inline void ub_dentry_alloc_start(void) +{ + extern void __ub_dentry_alloc_start(void); + + if (ub_dentry_alloc_barrier) + __ub_dentry_alloc_start(); +} + +static inline void ub_dentry_alloc_end(void) +{ + extern void __ub_dentry_alloc_end(void); + + if (current->task_bc.dentry_alloc) + __ub_dentry_alloc_end(); +} + +static inline int ub_dentry_charge(struct dentry *d) +{ + extern int __ub_dentry_charge(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_charge(d); +} + +static inline void ub_dentry_charge_nofail(struct dentry *d) +{ + extern void __ub_dentry_charge_nofail(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_charge_nofail(d); +} + +static inline void ub_dentry_uncharge_locked(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_uncharge(d); +} + +static inline void ub_dentry_uncharge(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + spin_lock(&dcache_lock); + __ub_dentry_uncharge(d); + spin_unlock(&dcache_lock); +} + +void uncharge_dcache(struct user_beancounter *ub, unsigned long size); +#else /* CONFIG_BEANCOUNTERS */ + +static inline int ub_dentry_alloc(struct dentry *d) { return 0; } +static inline void ub_dentry_alloc_start(void) { } +static inline void ub_dentry_alloc_end(void) { } +static inline int ub_dentry_charge(struct dentry *d) { return 0; } +static inline void ub_dentry_charge_nofail(struct dentry *d) { } +static inline void ub_dentry_uncharge_locked(struct dentry *d) { } +static inline void ub_dentry_uncharge(struct dentry *d) { } +static inline void uncharge_dcache(struct user_beancounter *ub, unsigned long size) { } + +#endif /* CONFIG_BEANCOUNTERS */ + +#endif /* __dcache_op.h_ */ diff --git a/include/bc/debug.h b/include/bc/debug.h new file mode 100644 index 0000000..7b1feb6 --- /dev/null +++ b/include/bc/debug.h @@ -0,0 +1,109 @@ +/* + * include/bc/debug.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DEBUG_H_ +#define __BC_DEBUG_H_ + +/* + * general debugging + */ + +#define UBD_ALLOC 0x1 +#define UBD_CHARGE 0x2 +#define UBD_LIMIT 0x4 +#define UBD_TRACE 0x8 + +/* + * ub_net debugging + */ + +#define UBD_NET_SOCKET 0x10 +#define UBD_NET_SLEEP 0x20 +#define UBD_NET_SEND 0x40 +#define UBD_NET_RECV 0x80 + +/* + * Main routines + */ + +#define UB_DEBUG (0) +#define DEBUG_RESOURCE (0ULL) + +#define ub_dbg_cond(__cond, __str, args...) \ + do { \ + if ((__cond) != 0) \ + printk(__str, ##args); \ + } while(0) + +#define ub_debug(__section, __str, args...) \ + ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) + +#define ub_debug_resource(__resource, __str, args...) 
\ + ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ + (DEBUG_RESOURCE & (1 << (__resource))), \ + __str, ##args) + +#if UB_DEBUG & UBD_TRACE +#define ub_debug_trace(__cond, __b, __r) \ + do { \ + static struct ub_rate_info ri = { __b, __r }; \ + if ((__cond) != 0 && ub_ratelimit(&ri)) \ + dump_stack(); \ + } while(0) +#else +#define ub_debug_trace(__cond, __burst, __rate) +#endif + +#ifdef CONFIG_BC_DEBUG_KMEM +#include + +struct user_beancounter; +struct ub_cache_counter { + struct list_head ulist; + struct ub_cache_counter *next; + struct user_beancounter *ub; + struct kmem_cache *cachep; + unsigned long counter; +}; + +extern spinlock_t cc_lock; +extern void init_cache_counters(void); +extern void ub_free_counters(struct user_beancounter *); +extern void ub_kmemcache_free(struct kmem_cache *cachep); + +struct vm_struct; +#define inc_vmalloc_charged(vm, flags) do { \ + if (flags & __GFP_UBC) \ + ub_percpu_add(get_exec_ub(), vmalloc_charged, \ + vm->nr_pages); \ + } while (0) +#define dec_vmalloc_charged(vm) do { \ + struct user_beancounter *ub; \ + ub = page_ub(vm->pages[0]); \ + if (ub != NULL) \ + ub_percpu_sub(ub, vmalloc_charged, \ + vm->nr_pages); \ + } while (0) + +#define inc_pbc_count(ub) ub_percpu_inc(ub, pbcs) +#define dec_pbc_count(ub) ub_percpu_dec(ub, pbcs) +#else +#define init_cache_counters() do { } while (0) +#define inc_vmalloc_charged(vm, f) do { } while (0) +#define dec_vmalloc_charged(vm) do { } while (0) + +#define inc_pbc_count(ub) do { } while (0) +#define dec_pbc_count(ub) do { } while (0) + +#define ub_free_counters(ub) do { } while (0) +#define ub_kmemcache_free(cachep) do { } while (0) +#endif + +#endif diff --git a/include/bc/decl.h b/include/bc/decl.h new file mode 100644 index 0000000..6dd4cb9 --- /dev/null +++ b/include/bc/decl.h @@ -0,0 +1,41 @@ +/* + * include/bc/decl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DECL_H_ +#define __BC_DECL_H_ + +#ifdef __KERNEL__ + +/* + * Naming convension: + * ub__ + */ + +#ifdef CONFIG_BEANCOUNTERS + +#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; +#define UB_DECLARE_VOID_FUNC(decl) extern void decl; + +#else /* CONFIG_BEANCOUNTERS */ + +#define UB_DECLARE_FUNC(ret_type, decl) \ + static inline ret_type decl \ + { \ + return (ret_type)0; \ + } +#define UB_DECLARE_VOID_FUNC(decl) \ + static inline void decl \ + { \ + } + +#endif /* CONFIG_BEANCOUNTERS */ +#endif + +#endif diff --git a/include/bc/hash.h b/include/bc/hash.h new file mode 100644 index 0000000..b2afb69 --- /dev/null +++ b/include/bc/hash.h @@ -0,0 +1,36 @@ +/* + * include/bc/hash.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _LINUX_UBHASH_H +#define _LINUX_UBHASH_H + +#ifdef __KERNEL__ + +#define UB_HASH_SIZE 256 + +extern struct hlist_head ub_hash[]; +extern spinlock_t ub_hash_lock; +extern struct list_head ub_list_head; + +#ifdef CONFIG_BEANCOUNTERS + +/* + * Iterate over beancounters + * @__ubp - beancounter ptr + * Can use break :) + */ +#define for_each_beancounter(__ubp) \ + list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) \ + +#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __KERNEL__ */ +#endif /* _LINUX_UBHASH_H */ diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h new file mode 100644 index 0000000..d84bf5a --- /dev/null +++ b/include/bc/io_acct.h @@ -0,0 +1,113 @@ +/* + * include/bc/io_acct.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Pavel Emelianov + * + */ + +#ifndef __UB_IO_ACCT_H_ +#define __UB_IO_ACCT_H_ + +#ifdef CONFIG_BC_IO_ACCOUNTING +#include +#include + +#define page_iopb(page) ({ \ + struct page_beancounter *pb; \ + pb = page_pbc(page); \ + rmb(); \ + pb; \ + }) + +/* + * IO ub is required in task context only, so if exec_ub is set + * to NULL this means that uses doesn't need to charge some + * resources. nevertheless IO activity must be accounted, so we + * account it to current's task beancounter. + */ + +static inline struct user_beancounter *get_io_ub(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (unlikely(ub == NULL)) + ub = get_task_ub(current); + + return top_beancounter(ub); +} + +extern struct page_beancounter **page_pblist(struct page *); + +extern void ub_io_save_context(struct page *, size_t); +extern void ub_io_release_context(struct page *pg, size_t size); + +#define PAGE_IO_MARK (0x1UL) + +static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) +{ + if (!((unsigned long)pb & PAGE_IO_MARK)) + return NULL; + + return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK); +} + +static inline void ub_io_account_read(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_read, bytes); +} + +static inline void ub_io_account_write(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_wrote, bytes); +} + +static inline void ub_io_account_dirty(struct page *page, size_t bytes) +{ + ub_io_save_context(page, bytes); +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_cancelled, bytes); +} + +void ub_init_io(struct kmem_cache *); +#else /* BC_IO_ACCOUNTING */ +#define page_iopb(page) (NULL) +#define page_pblist(page) (&page_pbc(page)) + +static inline void ub_io_release_context(struct page *pg, size_t bytes) +{ +} + +static inline void ub_io_account_dirty(struct page *p, size_t bytes) +{ +} + +static inline void ub_io_account_read(size_t bytes) +{ +} + +static inline void ub_io_account_write(size_t bytes) +{ +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ +} + +static inline void ub_init_io(struct kmem_cache *pb_cachep) { }; +#endif + +#ifdef CONFIG_BC_DEBUG_IO +extern void ub_io_release_debug(struct page *pg); +#else +#define ub_io_release_debug(pg) do { } while (0) +#endif +#endif diff --git a/include/bc/io_prio.h b/include/bc/io_prio.h new file mode 100644 index 0000000..8c1d1e3 --- /dev/null +++ b/include/bc/io_prio.h @@ -0,0 +1,82 @@ +/* + * include/bc/io_prio.h + * + * Copyright (C) 2007 SWsoft + * All rights reserved. 
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Vasily Tarasov + * + */ + +#ifndef _UB_IO_PRIO_H +#define _UB_IO_PRIO_H + +#include +#include +#include + +#define UB_IOPRIO_MIN 0 +#define UB_IOPRIO_MAX IOPRIO_BE_NR +#define UB_IOPRIO_BASE 4 + +struct ub_iopriv { + struct list_head cfq_bc_head; + rwlock_t cfq_bc_list_lock; + + unsigned int ioprio; +}; + +struct cfq_data; +struct cfq_queue; + +#ifdef CONFIG_BC_IO_SCHED +extern void bc_init_ioprio(struct ub_iopriv *); +extern void bc_fini_ioprio(struct ub_iopriv *); +extern struct cfq_bc_data * bc_find_cfq_bc(struct ub_iopriv *, + struct cfq_data *); +extern struct cfq_bc_data * bc_findcreate_cfq_bc(struct ub_iopriv *, + struct cfq_data *, gfp_t gfp_mask); +extern void bc_cfq_exit_queue(struct cfq_data *); +extern int bc_expired(struct cfq_data *); +extern void bc_schedule_active(struct cfq_data *); +extern void bc_inc_rqnum(struct cfq_queue *); +extern void bc_dec_rqnum(struct cfq_queue *); +extern unsigned long bc_set_ioprio(int, int); +extern struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd); +extern struct user_beancounter *bc_io_switch_context(struct page *); +extern void bc_io_restore_context(struct user_beancounter *); +#else +#include +static inline void bc_init_ioprio(struct ub_iopriv *iopriv) { ; } +static inline void bc_fini_ioprio(struct ub_iopriv *iopriv) { ; } +static inline struct cfq_bc_data * +bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t mask) +{ + return &cfqd->cfq_bc; +} +static inline void bc_cfq_exit_queue(struct cfq_data *cfqd) { ; } +static inline int bc_expired(struct cfq_data *cfqd) { return 0; } +static inline void bc_schedule_active(struct cfq_data *cfqd) +{ + cfqd->active_cfq_bc = &cfqd->cfq_bc; +} +static inline void bc_inc_rqnum(struct cfq_queue *cfqq) { ; } +static inline void bc_dec_rqnum(struct cfq_queue *cfqq) { ; } +static inline unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + return -EINVAL; +} +static inline struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd) +{ + return &cfqd->cfq_bc; +} +static inline struct user_beancounter * +bc_io_switch_context(struct page *page) { return NULL; } +static inline void bc_io_restore_context(struct user_beancounter *ub) { ; } +#endif /* CONFIG_BC_IO_SCHED */ +#endif /* _UB_IO_PRIO_H */ diff --git a/include/bc/kmem.h b/include/bc/kmem.h new file mode 100644 index 0000000..c0ea26a --- /dev/null +++ b/include/bc/kmem.h @@ -0,0 +1,69 @@ +/* + * include/bc/kmem.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __UB_SLAB_H_ +#define __UB_SLAB_H_ + +#include +#include + +/* + * UB_KMEMSIZE accounting + */ + +#ifdef CONFIG_BC_DEBUG_ITEMS +#define CHARGE_ORDER(__o) (1 << (__o)) +#define CHARGE_SIZE(__s) 1 +#else +#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) +#define CHARGE_SIZE(__s) (__s) +#endif + +#ifdef CONFIG_BEANCOUNTERS +#define page_ub(__page) ((__page)->bc.page_ub) +#else +#define page_ub(__page) NULL +#endif + +struct mm_struct; +struct page; +struct kmem_cache; + +UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) +UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) + +UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, enum ub_severity strict)) +UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size)) + +UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask)) +UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) +UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep, + void *objp, gfp_t flags)) +UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj)) + +#ifdef CONFIG_BEANCOUNTERS +static inline int should_charge(struct kmem_cache *cachep, gfp_t flags) +{ + if (!(cachep->flags & SLAB_UBC)) + return 0; + if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) + return 0; + return 1; +} + +#define should_uncharge(cachep) should_charge(cachep, __GFP_UBC) +#else +#define should_charge(cache, f) 0 +#define should_uncharge(cache) 0 +#endif + +#endif /* __UB_SLAB_H_ */ diff --git a/include/bc/misc.h b/include/bc/misc.h new file mode 100644 index 0000000..84082b2 --- /dev/null +++ b/include/bc/misc.h @@ -0,0 +1,55 @@ +/* + * include/bc/misc.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_MISC_H_ +#define __BC_MISC_H_ + +#include + +struct tty_struct; +struct file; +struct file_lock; +struct sigqueue; + +UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) +UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) +UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) +UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) +UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, + struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) +UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, + struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task)) +UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) +UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) + +#ifdef CONFIG_BEANCOUNTERS +#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) +#define unset_flock_charged(fl) do { \ + WARN_ON((fl)->fl_charged == 0); \ + (fl)->fl_charged = 0; \ + } while (0) +#define set_mm_ub(mm, tsk) do { \ + (mm)->mm_ub = get_beancounter(tsk != current ? 
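should_charge() in bc/kmem.h above encodes a small decision table: only caches marked SLAB_UBC are accounted at all, and caches additionally marked SLAB_NO_CHARGE are charged only when the allocation itself passes __GFP_UBC. A self-contained sketch of that table with stand-in flag bits (the real bit values come from the patched slab headers):

#include <stdio.h>

/* stand-in flag values; the real SLAB_UBC/SLAB_NO_CHARGE/__GFP_UBC bits live elsewhere */
#define SLAB_UBC        0x1u
#define SLAB_NO_CHARGE  0x2u
#define __GFP_UBC       0x4u

/* mirrors should_charge() from include/bc/kmem.h */
static int should_charge(unsigned cache_flags, unsigned gfp_flags)
{
    if (!(cache_flags & SLAB_UBC))
        return 0;                       /* cache not accounted at all */
    if ((cache_flags & SLAB_NO_CHARGE) && !(gfp_flags & __GFP_UBC))
        return 0;                       /* opt-in cache, caller did not ask for charging */
    return 1;
}

int main(void)
{
    printf("%d\n", should_charge(0, __GFP_UBC));                        /* 0: plain cache */
    printf("%d\n", should_charge(SLAB_UBC, 0));                         /* 1: always-charged cache */
    printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, 0));        /* 0: opt-in, not requested */
    printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, __GFP_UBC));/* 1: opt-in, requested */
    return 0;
}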
\ + tsk->task_bc.task_ub : get_exec_ub()); \ + } while (0) +#define put_mm_ub(mm) do { \ + put_beancounter((mm)->mm_ub); \ + (mm)->mm_ub = NULL; \ + } while (0) +#else +#define set_flock_charged(fl) do { } while (0) +#define unset_flock_charged(fl) do { } while (0) +#define set_mm_ub(mm, tsk) do { } while (0) +#define put_mm_ub(mm) do { } while (0) +#endif +#endif diff --git a/include/bc/net.h b/include/bc/net.h new file mode 100644 index 0000000..7c4c894 --- /dev/null +++ b/include/bc/net.h @@ -0,0 +1,213 @@ +/* + * include/bc/net.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_NET_H_ +#define __BC_NET_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include +#include + +#define bid2sid(__bufid) \ + ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) + +#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ + ~(SMP_CACHE_BYTES-1))) +#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) + +static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask) +{ +#ifdef CONFIG_BEANCOUNTERS + memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); +#endif + return 0; +} + +static inline void ub_skb_free_bc(struct sk_buff *skb) +{ +} + +#define IS_TCP_SOCK(__family, __type) \ + (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) + +/* number of sockets */ +UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) +UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) +UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) + +/* management of queue for send space */ +UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) + +/* send space */ +UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) + +UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) + +/* receive space */ +UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) + +/* skb destructor */ +UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) + +static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); +} + +static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, + unsigned long size)) + +static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, + unsigned long size, 
unsigned long ressize)) + +static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, + unsigned long ressize) +{ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); +} + +static inline void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); +} + +static inline void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); +} + +static inline int ub_tcpsndbuf_charge(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_HARD); +} + +static inline int ub_tcpsndbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_FORCE); +} + +static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT); +} + +static inline int ub_tcprcvbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE); +} + +/* Charge size */ +static inline unsigned long skb_charge_datalen(unsigned long chargesize) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned long slabsize; + + chargesize -= sizeof(struct sk_buff); + slabsize = 64; + do { + slabsize <<= 1; + } while (slabsize <= chargesize); + + slabsize >>= 1; + return (slabsize - sizeof(struct skb_shared_info)) & + ~(SMP_CACHE_BYTES-1); +#else + return 0; +#endif +} + +static inline unsigned long skb_charge_size_gen(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int slabsize; + + size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); + slabsize = 32; /* min size is 64 because of skb_shared_info */ + do { + slabsize <<= 1; + } while (slabsize < size); + + return slabsize + sizeof(struct sk_buff); +#else + return 0; +#endif + +} + +static inline unsigned long skb_charge_size_const(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int ret; + if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) + ret = 64 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) + ret = 128 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) + ret = 256 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) + ret = 512 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) + ret = 1024 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) + ret = 2048 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) + ret = 4096 + sizeof(struct sk_buff); + else + ret = skb_charge_size_gen(size); + return ret; +#else + return 0; +#endif +} + + +#define skb_charge_size(__size) \ + (__builtin_constant_p(__size) ? 
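skb_charge_size_gen() just above computes what an skb really costs: the data area is cache-line aligned, struct skb_shared_info is added, the sum is rounded up to the next power-of-two slab object (at least 64 bytes), and sizeof(struct sk_buff) is added on top. A self-contained sketch with assumed structure sizes (the real sizeof values depend on the kernel configuration):

#include <stdio.h>

/* illustrative sizes only; the real values depend on the kernel build */
#define SMP_CACHE_BYTES        64
#define SKB_DATA_ALIGN(x)      (((x) + (SMP_CACHE_BYTES - 1)) & ~(SMP_CACHE_BYTES - 1))
#define SIZEOF_SK_BUFF         240
#define SIZEOF_SKB_SHARED_INFO 320

/* mirrors skb_charge_size_gen(): round the data area up to the next
 * power-of-two slab object (>= 64 bytes) and add the sk_buff header */
static unsigned long skb_charge_size_gen(unsigned long size)
{
    unsigned int slabsize = 32;

    size = SKB_DATA_ALIGN(size) + SIZEOF_SKB_SHARED_INFO;
    do {
        slabsize <<= 1;
    } while (slabsize < size);

    return slabsize + SIZEOF_SK_BUFF;
}

int main(void)
{
    /* a 1500-byte frame: 1536 + 320 = 1856 -> 2048-byte slab -> 2048 + 240 */
    printf("%lu\n", skb_charge_size_gen(1500));
    return 0;
}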
\ + skb_charge_size_const(__size) : \ + skb_charge_size_gen(__size)) + +UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) +UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, + struct sock *sk, unsigned long size, int res)) + +#endif diff --git a/include/bc/oom_kill.h b/include/bc/oom_kill.h new file mode 100644 index 0000000..c07608f --- /dev/null +++ b/include/bc/oom_kill.h @@ -0,0 +1,26 @@ +#include +#include + +UB_DECLARE_FUNC(int, ub_oom_lock(void)) +UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) +UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_unlock(void)) +UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk)) +UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub, + struct task_struct *tsk)) + +#ifdef CONFIG_BEANCOUNTERS +extern int oom_generation; +extern int oom_kill_counter; +#define ub_oom_start() do { \ + current->task_bc.oom_generation = oom_generation; \ + } while (0) +#define ub_oom_task_killed(p) do { \ + oom_kill_counter++; \ + wake_up_process(p); \ + } while (0) +#else +#define ub_oom_start() do { } while (0) +#define ub_oom_task_killed(p) do { } while (0) +#endif diff --git a/include/bc/proc.h b/include/bc/proc.h new file mode 100644 index 0000000..f244523 --- /dev/null +++ b/include/bc/proc.h @@ -0,0 +1,40 @@ +/* + * include/bc/proc.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PROC_H_ +#define __UB_PROC_H_ + +#include + +struct bc_proc_entry { + char *name; + union { + int (*show)(struct seq_file *, void *); + struct file_operations *fops; + } u; + struct bc_proc_entry *next; + int cookie; +}; + +struct user_beancounter; + +void bc_register_proc_entry(struct bc_proc_entry *); +void bc_register_proc_root_entry(struct bc_proc_entry *); + +static inline struct user_beancounter *seq_beancounter(struct seq_file *f) +{ + return (struct user_beancounter *)(f->private); +} + +extern const char *bc_proc_lu_fmt; +extern const char *bc_proc_lu_lfmt; +extern const char *bc_proc_llu_fmt; +extern const char *bc_proc_lu_lu_fmt; +#endif diff --git a/include/bc/rss_pages.h b/include/bc/rss_pages.h new file mode 100644 index 0000000..b195961 --- /dev/null +++ b/include/bc/rss_pages.h @@ -0,0 +1,57 @@ +/* + * include/bc/rss_pages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __RSS_PAGES_H_ +#define __RSS_PAGES_H_ + +/* + * Page_beancounters + */ + +struct page; +struct user_beancounter; + +#define PB_MAGIC 0x62700001UL + +struct page_beancounter { + unsigned long pb_magic; + struct page *page; + struct user_beancounter *ub; + union { + struct page_beancounter *next_hash; + struct page_beancounter *page_pb_list; + }; + union { + unsigned refcount; + unsigned io_debug; + }; + union { + struct list_head page_list; + struct list_head io_list; + }; +}; + +#define PB_REFCOUNT_BITS 24 +#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) +#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) +#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) +#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) +#define PB_COUNT_INC(c) ((c)++) +#define PB_COUNT_DEC(c) ((c)--) +#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) + +#define page_pbc(__page) ((__page)->bc.page_pb) + +extern spinlock_t pb_lock; + +struct address_space; +extern int is_shmem_mapping(struct address_space *); + +#endif diff --git a/include/bc/sock.h b/include/bc/sock.h new file mode 100644 index 0000000..b314c9b --- /dev/null +++ b/include/bc/sock.h @@ -0,0 +1,47 @@ +/* + * include/bc/sock.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_SOCK_H_ +#define __BC_SOCK_H_ + +#include + +struct sock; +struct sk_buff; + +struct skb_beancounter { + struct user_beancounter *ub; + unsigned long charged:27, resource:5; +}; + +struct sock_beancounter { + struct user_beancounter *ub; + /* + * poll_reserv accounts space already charged for future sends. + * It is required to make poll agree with sendmsg. + * Additionally, it makes real charges (with taking bc spinlock) + * in the send path rarer, speeding networking up. + * For TCP (only): changes are protected by socket lock (not bc!) + * For all proto: may be read without serialization in poll. + */ + unsigned long poll_reserv; + unsigned long forw_space; + /* fields below are protected by bc spinlock */ + unsigned long ub_waitspc; /* space waiting for */ + unsigned long ub_wcharged; + struct list_head ub_sock_list; +}; + +#define sock_bc(__sk) (&(__sk)->sk_bc) +#define skb_bc(__skb) (&(__skb)->skb_bc) +#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) +#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) + +#endif diff --git a/include/bc/sock_orphan.h b/include/bc/sock_orphan.h new file mode 100644 index 0000000..038d52b --- /dev/null +++ b/include/bc/sock_orphan.h @@ -0,0 +1,106 @@ +/* + * include/bc/sock_orphan.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
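The PB_* macros in bc/rss_pages.h above pack two values into the single refcount word of struct page_beancounter: the low 24 bits carry the reference count, and the upper bits carry a shift value (apparently the fractional page weight used by the RSS accounting; compare UB_PAGE_WEIGHT_SHIFT in bc/vmpages.h below). A small sketch of the packing arithmetic, reusing the macros verbatim:

#include <assert.h>
#include <stdio.h>

#define PB_REFCOUNT_BITS 24
#define PB_SHIFT_GET(c)  ((c) >> PB_REFCOUNT_BITS)
#define PB_SHIFT_INC(c)  ((c) += (1 << PB_REFCOUNT_BITS))
#define PB_COUNT_GET(c)  ((c) & ((1 << PB_REFCOUNT_BITS) - 1))
#define PB_COUNT_INC(c)  ((c)++)
#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c))

int main(void)
{
    unsigned refcount = PB_REFCOUNT_MAKE(2, 5);   /* shift 2, count 5 */

    assert(PB_SHIFT_GET(refcount) == 2);
    assert(PB_COUNT_GET(refcount) == 5);

    PB_COUNT_INC(refcount);                       /* one more reference */
    PB_SHIFT_INC(refcount);                       /* bump the shift part */

    printf("shift=%u count=%u\n",
           PB_SHIFT_GET(refcount), PB_COUNT_GET(refcount));
    return 0;
}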
+ * + */ + +#ifndef __BC_SOCK_ORPHAN_H_ +#define __BC_SOCK_ORPHAN_H_ + +#include + +#include "bc/beancounter.h" +#include "bc/net.h" + + +static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return &sock_bc(sk)->ub->ub_orphan_count; +#endif + return sk->sk_prot->orphan_count; +} + +static inline void ub_inc_orphan_count(struct sock *sk) +{ + atomic_inc(__ub_get_orphan_count_ptr(sk)); +} + +static inline void ub_dec_orphan_count(struct sock *sk) +{ + atomic_dec(__ub_get_orphan_count_ptr(sk)); +} + +static inline int ub_get_orphan_count(struct sock *sk) +{ + return atomic_read(__ub_get_orphan_count_ptr(sk)); +} + +extern int __ub_too_many_orphans(struct sock *sk, int count); +static inline int ub_too_many_orphans(struct sock *sk, int count) +{ +#ifdef CONFIG_BEANCOUNTERS + if (__ub_too_many_orphans(sk, count)) + return 1; +#endif + return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); +} + +#include + +struct inet_timewait_sock; + +static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + + ub = slab_ub(tw); + if (ub != NULL) + ub->ub_tw_count += incdec; +#endif +} + +static inline int __ub_timewait_check(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long mem_max, mem; + int tw_count; + + ub = sock_bc(sk)->ub; + if (ub == NULL) + return 1; + + tw_count = ub->ub_tw_count; + mem_max = sysctl_tcp_max_tw_kmem_fraction * + ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1); + mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab); + mem *= tw_count; + return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max; +#else + return 1; +#endif +} + +#define ub_timewait_inc(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, 1); \ + } while (0) + +#define ub_timewait_dec(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, -1); \ + } while (0) + +#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \ + __ub_timewait_check(sk)) + +#endif diff --git a/include/bc/statd.h b/include/bc/statd.h new file mode 100644 index 0000000..9dafc5e --- /dev/null +++ b/include/bc/statd.h @@ -0,0 +1,70 @@ +/* + * include/bc/statd.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
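__ub_timewait_check() above caps TIME-WAIT sockets per beancounter in two ways at once: the raw socket count must stay below sysctl_tcp_max_tw_buckets_ub, and the slab memory those sockets occupy must stay below sysctl_tcp_max_tw_kmem_fraction multiplied by (kmemsize limit / 1024 + 1). A worked example of the arithmetic with made-up numbers (none of these values are defaults; they only illustrate the formula):

#include <stdio.h>

int main(void)
{
    unsigned long kmemsize_limit = 256UL << 20;  /* UB_KMEMSIZE limit: 256 MB */
    unsigned long tw_kmem_fraction = 384;        /* sysctl_tcp_max_tw_kmem_fraction */
    unsigned long max_tw_buckets = 16384;        /* sysctl_tcp_max_tw_buckets_ub */
    unsigned long twsk_objuse = 128;             /* slab object size of one timewait sock */
    unsigned long tw_count = 10000;              /* TIME-WAIT sockets currently held */

    /* same condition as __ub_timewait_check(): both caps must hold */
    unsigned long mem_max = tw_kmem_fraction * ((kmemsize_limit >> 10) + 1);
    unsigned long mem = twsk_objuse * tw_count;
    int ok = tw_count < max_tw_buckets && mem < mem_max;

    printf("mem=%lu mem_max=%lu -> %s\n", mem, mem_max,
           ok ? "another TIME-WAIT socket fits" : "limit reached");
    return 0;
}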
+ * + */ + +#ifndef __BC_STATD_H_ +#define __BC_STATD_H_ + +/* sys_ubstat commands list */ +#define UBSTAT_READ_ONE 0x010000 +#define UBSTAT_READ_ALL 0x020000 +#define UBSTAT_READ_FULL 0x030000 +#define UBSTAT_UBLIST 0x040000 +#define UBSTAT_UBPARMNUM 0x050000 +#define UBSTAT_GETTIME 0x060000 + +#define UBSTAT_CMD(func) ((func) & 0xF0000) +#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) + +#define TIME_MAX_SEC (LONG_MAX / HZ) +#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) + +typedef unsigned long ubstattime_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstattime_t cur_time; +} ubgettime_t; + +typedef struct { + long maxinterval; + int signum; +} ubnotifrq_t; + +typedef struct { + unsigned long maxheld; + unsigned long failcnt; +} ubstatparm_t; + +typedef struct { + unsigned long barrier; + unsigned long limit; + unsigned long held; + unsigned long maxheld; + unsigned long minheld; + unsigned long failcnt; + unsigned long __unused1; + unsigned long __unused2; +} ubstatparmf_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[0]; +} ubstatfull_t; + +#ifdef __KERNEL__ +struct ub_stat_notify { + struct list_head list; + struct task_struct *task; + int signum; +}; +#endif +#endif diff --git a/include/bc/task.h b/include/bc/task.h new file mode 100644 index 0000000..f5a2915 --- /dev/null +++ b/include/bc/task.h @@ -0,0 +1,69 @@ +/* + * include/bc/task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_TASK_H_ +#define __BC_TASK_H_ + +struct user_beancounter; + + +#ifdef CONFIG_BEANCOUNTERS +struct task_beancounter { + struct user_beancounter *exec_ub; + struct user_beancounter *saved_ub; + struct user_beancounter *task_ub; + struct user_beancounter *fork_sub; + unsigned long file_precharged, file_quant, file_count; + unsigned long kmem_precharged; + char dentry_alloc, pgfault_handle; + void *task_fnode, *task_freserv; + unsigned long oom_generation; + unsigned long task_data[4]; + unsigned long pgfault_allot; +}; + +#define get_task_ub(__task) ((__task)->task_bc.task_ub) + +extern struct user_beancounter ub0; +#define get_ub0() (&ub0) + +#define ub_save_context(t) do { \ + t->task_bc.saved_ub = t->task_bc.exec_ub; \ + t->task_bc.exec_ub = get_ub0(); \ + } while (0) +#define ub_restore_context(t) do { \ + t->task_bc.exec_ub = t->task_bc.saved_ub; \ + } while (0) + +#define get_exec_ub() (current->task_bc.exec_ub) +#define set_exec_ub(__newub) \ +({ \ + struct user_beancounter *old; \ + struct task_beancounter *tbc; \ + \ + tbc = ¤t->task_bc; \ + old = tbc->exec_ub; \ + tbc->exec_ub = __newub; \ + old; \ +}) + +void ub_init_task_bc(struct task_beancounter *); + +#else /* CONFIG_BEANCOUNTERS */ + +#define get_ub0() (NULL) +#define get_exec_ub() (NULL) +#define get_task_ub(task) (NULL) +#define set_exec_ub(__ub) (NULL) +#define ub_save_context(t) do { } while (0) +#define ub_restore_context(t) do { } while (0) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __task.h_ */ diff --git a/include/bc/tcp.h b/include/bc/tcp.h new file mode 100644 index 0000000..d2bf748 --- /dev/null +++ b/include/bc/tcp.h @@ -0,0 +1,76 @@ +/* + * include/bc/tcp.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
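The sys_ubstat command constants above pack the operation into bits 16-19 of the func argument and a parameter index into the low 16 bits, which is what UBSTAT_CMD() and UBSTAT_PARMID() pull apart. A tiny sketch of the encoding (the parameter index 3 is only an illustration; real indices are presumably the UB_* resource numbers):

#include <stdio.h>

/* command/parameter encoding of the sys_ubstat 'func' argument,
 * copied from include/bc/statd.h */
#define UBSTAT_READ_ONE   0x010000
#define UBSTAT_READ_FULL  0x030000
#define UBSTAT_CMD(func)    ((func) & 0xF0000)
#define UBSTAT_PARMID(func) ((func) & 0x0FFFF)

int main(void)
{
    /* "read full statistics for parameter 3" */
    unsigned func = UBSTAT_READ_FULL | 3;

    printf("cmd=0x%x parm=%u\n", UBSTAT_CMD(func), UBSTAT_PARMID(func));
    /* prints: cmd=0x30000 parm=3 */
    return 0;
}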
+ * + */ + +#ifndef __BC_TCP_H_ +#define __BC_TCP_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include + +static inline void ub_tcp_update_maxadvmss(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + if (!sock_has_ubc(sk)) + return; + if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) + return; + + sock_bc(sk)->ub->ub_maxadvmss = + skb_charge_size(MAX_HEADER + sizeof(struct iphdr) + + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); +#endif +} + +static inline int ub_tcp_rmem_allows_expand(struct sock *sk) +{ + if (tcp_memory_pressure) + return 0; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + + ub = sock_bc(sk)->ub; + if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) + return 1; + if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) + return 0; + return sk->sk_rcvbuf <= ub->ub_rmem_thres; + } +#endif + return 1; +} + +static inline int ub_tcp_memory_pressure(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; +#endif + return 0; +} + +static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; +#endif + return 0; +} + +#endif diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h new file mode 100644 index 0000000..09642e3 --- /dev/null +++ b/include/bc/vmpages.h @@ -0,0 +1,152 @@ +/* + * include/bc/vmpages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PAGES_H_ +#define __UB_PAGES_H_ + +#include +#include +#include + +/* + * Check whether vma has private or copy-on-write mapping. + * Should match checks in ub_protected_charge(). + */ +#define VM_UB_PRIVATE(__flags, __file) \ + ( ((__flags) & VM_WRITE) ? 
\ + (__file) == NULL || !((__flags) & VM_SHARED) : \ + 0 \ + ) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, + unsigned long size, + unsigned long newflags, + struct vm_area_struct *vma)) + +UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) +UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) + +UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, + long sz)) + +UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file, + int strict)) +UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file)) + +struct shmem_inode_info; +UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size)) +#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) + +#ifdef CONFIG_BEANCOUNTERS +#define shmi_ub_set(shi, ub) do { \ + (shi)->shmi_ub = get_beancounter(ub); \ + } while (0) +#define shmi_ub_put(shi) do { \ + put_beancounter((shi)->shmi_ub); \ + (shi)->shmi_ub = NULL; \ + } while (0) +#else +#define shmi_ub_set(shi, ub) do { } while (0) +#define shmi_ub_put(shi) do { } while (0) +#endif + +UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, + unsigned long size)) + +UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end)) +#define pages_in_vma(vma) (pages_in_vma_range(vma, \ + vma->vm_start, vma->vm_end)) + +#define UB_PAGE_WEIGHT_SHIFT 24 +#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) + +struct page_beancounter; +#define PBC_COPY_SAME ((struct page_beancounter *) 1) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +extern void __ub_update_physpages(struct user_beancounter *ub); +extern void __ub_update_oomguarpages(struct user_beancounter *ub); +extern void __ub_update_privvm(struct user_beancounter *ub); + +#ifdef CONFIG_BC_RSS_ACCOUNTING +#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) +PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) +PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) 
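VM_UB_PRIVATE() above decides whether a mapping counts as private (copy-on-write) memory for accounting purposes: it must be writable, and it must be either anonymous or mapped without VM_SHARED. A self-contained sketch of the same predicate with stand-in flag bits:

#include <stdio.h>

/* stand-in vm_flags bits; real values come from the mm headers */
#define VM_WRITE  0x2u
#define VM_SHARED 0x8u

/* mirrors VM_UB_PRIVATE() from include/bc/vmpages.h: charged as private/COW
 * memory only if writable and either anonymous or not shared */
static int vm_ub_private(unsigned flags, const void *file)
{
    return (flags & VM_WRITE) ? (file == NULL || !(flags & VM_SHARED)) : 0;
}

int main(void)
{
    const void *some_file = "file";

    printf("%d\n", vm_ub_private(VM_WRITE, NULL));                  /* 1: anonymous, writable */
    printf("%d\n", vm_ub_private(VM_WRITE, some_file));             /* 1: private file mapping */
    printf("%d\n", vm_ub_private(VM_WRITE | VM_SHARED, some_file)); /* 0: shared writable mapping */
    printf("%d\n", vm_ub_private(0, NULL));                         /* 0: read-only mapping */
    return 0;
}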
+PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, + struct mm_struct *mm)) + +PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) +#endif + +#ifdef CONFIG_BC_SWAP_ACCOUNTING +#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +struct swap_info_struct; +SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) +SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, + struct user_beancounter *ub)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) diff --git a/include/linux/aio.h b/include/linux/aio.h index 09b276c..bd4b515 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -224,4 +224,8 @@ static inline struct kiocb *list_kiocb(struct list_head *h) extern unsigned long aio_nr; extern unsigned long aio_max_nr; +void wait_for_all_aios(struct kioctx *ctx); +extern struct kmem_cache *kioctx_cachep; +extern void aio_kick_handler(struct work_struct *); + #endif /* __LINUX__AIO_H */ diff --git a/include/linux/capability.h b/include/linux/capability.h index 9d1fe30..8a90af2 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -186,12 +186,9 @@ typedef struct kernel_cap_struct { #define CAP_NET_BROADCAST 11 -/* Allow interface configuration */ /* Allow administration of IP firewall, masquerading and accounting */ /* Allow setting debug option on sockets */ /* Allow modification of routing tables */ -/* Allow setting arbitrary process / process group ownership on - sockets */ /* Allow binding to any address for transparent proxying */ /* Allow setting TOS (type of service) */ /* Allow setting promiscuous mode */ @@ -221,6 +218,7 @@ typedef struct kernel_cap_struct { #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ +/* Allow O_DIRECT access */ /* Allow sending USB messages to any device via /proc/bus/usb */ #define CAP_SYS_RAWIO 17 @@ -239,24 +237,19 @@ typedef struct kernel_cap_struct { /* Allow configuration of the secure attention key */ /* Allow administration of the random device */ -/* Allow examination and configuration of disk quotas */ /* Allow configuring the kernel's syslog (printk behaviour) */ /* Allow setting the domainname */ /* Allow setting the hostname */ /* Allow calling bdflush() */ -/* Allow mount() and umount(), setting up new smb connection */ +/* Allow setting up new smb connection */ /* Allow some autofs root ioctls */ /* Allow nfsservctl */ /* Allow VM86_REQUEST_IRQ */ /* Allow to read/write pci config on alpha */ /* Allow irix_prctl on mips (setstacksize) */ /* Allow flushing all cache on m68k (sys_cacheflush) */ -/* Allow removing semaphores */ -/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores - and shared memory */ /* Allow locking/unlocking of shared memory segment */ /* Allow turning swap on/off */ -/* Allow forged pids on socket credentials passing */ /* Allow setting 
readahead and flushing buffers on block devices */ /* Allow setting geometry in floppy driver */ /* Allow turning DMA on/off in xd driver */ @@ -329,6 +322,50 @@ typedef struct kernel_cap_struct { #define CAP_SETFCAP 31 +#ifdef __KERNEL__ +/* + * Important note: VZ capabilities do intersect with CAP_AUDIT + * this is due to compatibility reasons. Nothing bad. + * Both VZ and Audit/SELinux caps are disabled in VPSs. + */ + +/* Allow access to all information. In the other case some structures will be + hiding to ensure different Virtual Environment non-interaction on the same + node */ +#define CAP_SETVEID 29 + +#define CAP_VE_ADMIN 30 + +#ifdef CONFIG_VE + +/* Replacement for CAP_NET_ADMIN: + delegated rights to the Virtual environment of its network administration. + For now the following rights have been delegated: + + Allow setting arbitrary process / process group ownership on sockets + Allow interface configuration + */ +#define CAP_VE_NET_ADMIN CAP_VE_ADMIN + +/* Replacement for CAP_SYS_ADMIN: + delegated rights to the Virtual environment of its administration. + For now the following rights have been delegated: + */ +/* Allow mount/umount/remount */ +/* Allow examination and configuration of disk quotas */ +/* Allow removing semaphores */ +/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ +/* Allow locking/unlocking of shared memory segment */ +/* Allow forged pids on socket credentials passing */ + +#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN +#else +#define CAP_VE_NET_ADMIN CAP_NET_ADMIN +#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN +#endif +#endif + /* Override MAC access. The base kernel enforces no MAC policy. An LSM may enforce a MAC policy, and if it does and it chooses @@ -390,7 +427,16 @@ typedef struct kernel_cap_struct { #define CAP_INIT_INH_SET CAP_EMPTY_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) +#ifndef CONFIG_VE # define cap_set_full(c) do { (c) = __cap_full_set; } while (0) +#else +# define cap_set_full(c) do { \ + if (ve_is_super(get_exec_env())) \ + (c) = __cap_full_set; \ + else \ + (c) = get_exec_env()->ve_cap_bset;\ + } while (0) +#endif # define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) @@ -503,6 +549,10 @@ extern const kernel_cap_t __cap_init_eff_set; kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); +#include + +extern spinlock_t task_capability_lock; + /** * has_capability - Determine if a task has a superior capability available * @t: The task in question diff --git a/include/linux/cfq-iosched.h b/include/linux/cfq-iosched.h new file mode 100644 index 0000000..4e2afed --- /dev/null +++ b/include/linux/cfq-iosched.h @@ -0,0 +1,149 @@ +#ifndef _LINUX_CFQ_IOSCHED_H +#define _LINUX_CFQ_IOSCHED_H + +#include +#include +#include + +extern struct kmem_cache *cfq_pool; + +#define CFQ_PRIO_LISTS IOPRIO_BE_NR + +/* + * Most of our rbtree usage is for sorting with min extraction, so + * if we cache the leftmost node we don't have to walk down the tree + * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should + * move this into the elevator for the rq sorting as well. 
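The comment here, and the cfq_rb_root structure declared just below it, describe a cached-leftmost tree: alongside the rb_root the scheduler keeps a pointer to the smallest node so that picking the next queue does not require walking the left spine. A toy user-space sketch of that caching idea, using a plain unbalanced search tree rather than the kernel rbtree:

#include <stdio.h>

/* Toy search tree that caches its leftmost node, mirroring the idea
 * behind struct cfq_rb_root: extracting the minimum should be O(1). */
struct node { long key; struct node *l, *r; };
struct root { struct node *tree; struct node *leftmost; };

static void insert(struct root *root, struct node *n)
{
    struct node **p = &root->tree;
    int is_leftmost = 1;                 /* stays 1 while we only descend left */

    n->l = n->r = NULL;
    while (*p) {
        if (n->key < (*p)->key) {
            p = &(*p)->l;
        } else {
            p = &(*p)->r;
            is_leftmost = 0;             /* went right once: cannot be the minimum */
        }
    }
    *p = n;
    if (is_leftmost)
        root->leftmost = n;              /* cache refreshed during the insert itself */
}

int main(void)
{
    struct root root = { NULL, NULL };
    struct node a = { 30 }, b = { 10 }, c = { 20 };

    insert(&root, &a);
    insert(&root, &b);
    insert(&root, &c);
    printf("min = %ld\n", root.leftmost->key);   /* 10, without any tree walk */
    return 0;
}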
+ */ +struct cfq_rb_root { + struct rb_root rb; + struct rb_node *left; +}; +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } + +/* + * Per (Device, UBC) queue data + */ +struct cfq_bc_data { + /* for ub.iopriv->cfq_bc_head */ + struct list_head cfq_bc_list; + /* for cfqd->act_cfq_bc_head */ + struct list_head act_cfq_bc_list; + + struct cfq_data *cfqd; + struct ub_iopriv *ub_iopriv; + + /* + * rr list of queues with requests and the count of them + */ + struct cfq_rb_root service_tree; + + int cur_prio; + int cur_end_prio; + + unsigned long rqnum; + unsigned long on_dispatch; + + /* + * async queue for each priority case + */ + struct cfq_queue *async_cfqq[2][CFQ_PRIO_LISTS]; + struct cfq_queue *async_idle_cfqq; +}; + +/* + * Per block device queue structure + */ +struct cfq_data { + struct request_queue *queue; + +#ifndef CONFIG_BC_IO_SCHED + struct cfq_bc_data cfq_bc; +#endif + unsigned int busy_queues; + + int rq_in_driver; + int sync_flight; + int hw_tag; + + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + + sector_t last_position; + unsigned long last_end_request; + + /* + * tunables, see top of file + */ + unsigned int cfq_quantum; + unsigned int cfq_fifo_expire[2]; + unsigned int cfq_back_penalty; + unsigned int cfq_back_max; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + + struct list_head cic_list; + + /* list of ub that have requests */ + struct list_head act_cfq_bc_head; + /* ub that owns a timeslice at the moment */ + struct cfq_bc_data *active_cfq_bc; + unsigned int cfq_ub_slice; + unsigned long slice_end; + int virt_mode; + int write_virt_mode; +}; + +/* + * Per process-grouping structure + */ +struct cfq_queue { + /* reference count */ + atomic_t ref; + /* various state flags, see below */ + unsigned int flags; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* service_tree member */ + struct rb_node rb_node; + /* service_tree key */ + unsigned long rb_key; + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* requests queued in sort_list */ + int queued[2]; + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + unsigned long slice_end; + long slice_resid; + + /* pending metadata requests */ + int meta_pending; + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + pid_t pid; + struct cfq_bc_data *cfq_bc; +}; + +static void inline cfq_init_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + cfq_bc->service_tree = CFQ_RB_ROOT; +} +#endif /* _LINUX_CFQ_IOSCHED_H */ diff --git a/include/linux/compat.h b/include/linux/compat.h index cf8d11c..3c778e2 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -238,6 +238,7 @@ extern int put_compat_itimerspec(struct compat_itimerspec __user *dst, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); +extern int ve_compat_printk(int dst, const char *fmt, ...); extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, diff --git a/include/linux/cpt_image.h 
b/include/linux/cpt_image.h new file mode 100644 index 0000000..6a39f32 --- /dev/null +++ b/include/linux/cpt_image.h @@ -0,0 +1,1763 @@ +/* + * + * include/linux/cpt_image.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __CPT_IMAGE_H_ +#define __CPT_IMAGE_H_ 1 + +#define CPT_NULL (~0ULL) +#define CPT_NOINDEX (~0U) + +/* + * Image file layout. + * + * - major header + * - sections[] + * + * Each section is: + * - section header + * - array of objects + * + * All data records are arch independent, 64 bit aligned. + */ + +enum _cpt_object_type +{ + CPT_OBJ_TASK = 0, + CPT_OBJ_MM, + CPT_OBJ_FS, + CPT_OBJ_FILES, + CPT_OBJ_FILE, + CPT_OBJ_SIGHAND_STRUCT, + CPT_OBJ_SIGNAL_STRUCT, + CPT_OBJ_TTY, + CPT_OBJ_SOCKET, + CPT_OBJ_SYSVSEM_UNDO, + CPT_OBJ_NAMESPACE, + CPT_OBJ_SYSV_SHM, + CPT_OBJ_INODE, + CPT_OBJ_UBC, + CPT_OBJ_SLM_SGREG, + CPT_OBJ_SLM_REGOBJ, + CPT_OBJ_SLM_MM, + CPT_OBJ_MAX, + /* The objects above are stored in memory while checkpointing */ + + CPT_OBJ_VMA = 1024, + CPT_OBJ_FILEDESC, + CPT_OBJ_SIGHANDLER, + CPT_OBJ_SIGINFO, + CPT_OBJ_LASTSIGINFO, + CPT_OBJ_SYSV_SEM, + CPT_OBJ_SKB, + CPT_OBJ_FLOCK, + CPT_OBJ_OPENREQ, + CPT_OBJ_VFSMOUNT, + CPT_OBJ_TRAILER, + CPT_OBJ_SYSVSEM_UNDO_REC, + CPT_OBJ_NET_DEVICE, + CPT_OBJ_NET_IFADDR, + CPT_OBJ_NET_ROUTE, + CPT_OBJ_NET_CONNTRACK, + CPT_OBJ_NET_CONNTRACK_EXPECT, + CPT_OBJ_AIO_CONTEXT, + CPT_OBJ_VEINFO, + CPT_OBJ_EPOLL, + CPT_OBJ_EPOLL_FILE, + CPT_OBJ_SKFILTER, + CPT_OBJ_SIGALTSTACK, + CPT_OBJ_SOCK_MCADDR, + CPT_OBJ_BIND_MNT, + CPT_OBJ_SYSVMSG, + CPT_OBJ_SYSVMSG_MSG, + + CPT_OBJ_X86_REGS = 4096, + CPT_OBJ_X86_64_REGS, + CPT_OBJ_PAGES, + CPT_OBJ_COPYPAGES, + CPT_OBJ_REMAPPAGES, + CPT_OBJ_LAZYPAGES, + CPT_OBJ_NAME, + CPT_OBJ_BITS, + CPT_OBJ_REF, + CPT_OBJ_ITERPAGES, + CPT_OBJ_ITERYOUNGPAGES, + CPT_OBJ_VSYSCALL, + CPT_OBJ_IA64_REGS, + CPT_OBJ_INOTIFY, + CPT_OBJ_INOTIFY_WATCH, + CPT_OBJ_INOTIFY_EVENT, + CPT_OBJ_TASK_AUX, + CPT_OBJ_NET_TUNTAP, + CPT_OBJ_NET_HWADDR, + CPT_OBJ_NET_VETH, + CPT_OBJ_NET_STATS, +}; + +#define CPT_ALIGN(n) (((n)+7)&~7) + +struct cpt_major_hdr +{ + __u8 cpt_signature[4]; /* Magic number */ + __u16 cpt_hdrlen; /* Length of this header */ + __u16 cpt_image_version; /* Format of this file */ +#define CPT_VERSION_MINOR(a) ((a) & 0xf) +#define CPT_VERSION_8 0 +#define CPT_VERSION_9 0x100 +#define CPT_VERSION_9_1 0x101 +#define CPT_VERSION_9_2 0x102 +#define CPT_VERSION_16 0x200 +#define CPT_VERSION_18 0x300 +#define CPT_VERSION_18_1 0x301 +#define CPT_VERSION_20 0x400 +#define CPT_VERSION_24 0x500 +#define CPT_VERSION_26 0x600 +#define CPT_VERSION_27 0x700 + __u16 cpt_os_arch; /* Architecture */ +#define CPT_OS_ARCH_I386 0 +#define CPT_OS_ARCH_EMT64 1 +#define CPT_OS_ARCH_IA64 2 + __u16 __cpt_pad1; + __u32 cpt_ve_features; /* VE features */ + __u32 cpt_ve_features2; /* VE features */ + __u16 cpt_pagesize; /* Page size used by OS */ + __u16 cpt_hz; /* HZ used by OS */ + __u64 cpt_start_jiffies64; /* Jiffies */ + __u32 cpt_start_sec; /* Seconds */ + __u32 cpt_start_nsec; /* Nanoseconds */ + __u32 cpt_cpu_caps[4]; /* CPU capabilities */ + __u32 cpt_kernel_config[4]; /* Kernel config */ + __u64 cpt_iptables_mask; /* Used netfilter modules */ +} __attribute__ ((aligned (8))); + +#define CPT_SIGNATURE0 0x79 +#define CPT_SIGNATURE1 0x1c +#define CPT_SIGNATURE2 0x01 +#define CPT_SIGNATURE3 0x63 + +/* CPU capabilities */ +#define CPT_CPU_X86_CMOV 0 +#define CPT_CPU_X86_FXSR 1 +#define CPT_CPU_X86_SSE 2 +#define CPT_CPU_X86_SSE2 3 +#define 
CPT_CPU_X86_MMX 4 +#define CPT_CPU_X86_3DNOW 5 +#define CPT_CPU_X86_3DNOW2 6 +#define CPT_CPU_X86_SEP 7 +#define CPT_CPU_X86_EMT64 8 +#define CPT_CPU_X86_IA64 9 +#define CPT_CPU_X86_SYSCALL 10 +#define CPT_CPU_X86_SYSCALL32 11 +#define CPT_CPU_X86_SEP32 12 + +/* Unsupported features */ +#define CPT_EXTERNAL_PROCESS 16 +#define CPT_NAMESPACES 17 +#define CPT_SCHEDULER_POLICY 18 +#define CPT_PTRACED_FROM_VE0 19 +#define CPT_UNSUPPORTED_FSTYPE 20 +#define CPT_BIND_MOUNT 21 +#define CPT_UNSUPPORTED_NETDEV 22 +#define CPT_UNSUPPORTED_MISC 23 + +/* This mask is used to determine whether VE + has some unsupported features or not */ +#define CPT_UNSUPPORTED_MASK 0xffff0000UL + +#define CPT_KERNEL_CONFIG_PAE 0 + +struct cpt_section_hdr +{ + __u64 cpt_next; + __u32 cpt_section; + __u16 cpt_hdrlen; + __u16 cpt_align; +} __attribute__ ((aligned (8))); + +enum +{ + CPT_SECT_ERROR, /* Error section, content is string */ + CPT_SECT_VEINFO, + CPT_SECT_FILES, /* Files. Content is array of file objects */ + CPT_SECT_TASKS, + CPT_SECT_MM, + CPT_SECT_FILES_STRUCT, + CPT_SECT_FS, + CPT_SECT_SIGHAND_STRUCT, + CPT_SECT_TTY, + CPT_SECT_SOCKET, + CPT_SECT_NAMESPACE, + CPT_SECT_SYSVSEM_UNDO, + CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and + * deleted dentires with inodes not + * referenced inside dumped process. + */ + CPT_SECT_SYSV_SHM, + CPT_SECT_SYSV_SEM, + CPT_SECT_ORPHANS, + CPT_SECT_NET_DEVICE, + CPT_SECT_NET_IFADDR, + CPT_SECT_NET_ROUTE, + CPT_SECT_NET_IPTABLES, + CPT_SECT_NET_CONNTRACK, + CPT_SECT_NET_CONNTRACK_VE0, + CPT_SECT_UTSNAME, + CPT_SECT_TRAILER, + CPT_SECT_UBC, + CPT_SECT_SLM_SGREGS, + CPT_SECT_SLM_REGOBJS, +/* Due to silly mistake we cannot index sections beyond this value */ +#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) + CPT_SECT_EPOLL, + CPT_SECT_VSYSCALL, + CPT_SECT_INOTIFY, + CPT_SECT_SYSV_MSG, + CPT_SECT_MAX +}; + +struct cpt_major_tail +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_lazypages; + __u32 cpt_64bit; + __u64 cpt_sections[CPT_SECT_MAX_INDEX]; + __u32 cpt_nsect; + __u8 cpt_signature[4]; /* Magic number */ +} __attribute__ ((aligned (8))); + + +/* Common object header. 
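Putting the pieces above together, a reader of a checkpoint image would first verify the four signature bytes of cpt_major_hdr, then the image version, then whether any unsupported-feature bits (CPT_EXTERNAL_PROCESS through CPT_UNSUPPORTED_MISC, collectively CPT_UNSUPPORTED_MASK) are set. A hedged sketch of such a check, assuming the feature bits live in cpt_ve_features; the constants are copied from the header, the checking policy itself is only illustrative:

#include <stdint.h>
#include <stdio.h>

#define CPT_SIGNATURE0 0x79
#define CPT_SIGNATURE1 0x1c
#define CPT_SIGNATURE2 0x01
#define CPT_SIGNATURE3 0x63
#define CPT_VERSION_27 0x700
#define CPT_UNSUPPORTED_MASK 0xffff0000UL

/* a dump is usable only if the magic matches, the on-disk format is not
 * newer than what this reader knows, and no unsupported features were
 * in use when the VE was checkpointed */
static int image_usable(const uint8_t sig[4], uint16_t version, uint32_t ve_features)
{
    if (sig[0] != CPT_SIGNATURE0 || sig[1] != CPT_SIGNATURE1 ||
        sig[2] != CPT_SIGNATURE2 || sig[3] != CPT_SIGNATURE3)
        return 0;                           /* not a checkpoint image */
    if (version > CPT_VERSION_27)
        return 0;                           /* written by a newer kernel */
    if (ve_features & CPT_UNSUPPORTED_MASK)
        return 0;                           /* e.g. bind mounts, external processes */
    return 1;
}

int main(void)
{
    uint8_t sig[4] = { CPT_SIGNATURE0, CPT_SIGNATURE1, CPT_SIGNATURE2, CPT_SIGNATURE3 };

    printf("%d\n", image_usable(sig, CPT_VERSION_27, 0));          /* 1 */
    printf("%d\n", image_usable(sig, CPT_VERSION_27, 1u << 21));   /* 0: CPT_BIND_MOUNT set */
    return 0;
}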
*/ +struct cpt_object_hdr +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; +} __attribute__ ((aligned (8))); + +enum _cpt_content_type { + CPT_CONTENT_VOID, + CPT_CONTENT_ARRAY, + CPT_CONTENT_DATA, + CPT_CONTENT_NAME, + + CPT_CONTENT_STACK, + CPT_CONTENT_X86_FPUSTATE_OLD, + CPT_CONTENT_X86_FPUSTATE, + CPT_CONTENT_MM_CONTEXT, + CPT_CONTENT_SEMARRAY, + CPT_CONTENT_SEMUNDO, + CPT_CONTENT_NLMARRAY, + CPT_CONTENT_MAX +}; + +/* CPT_OBJ_BITS: encode array of bytes */ +struct cpt_obj_bits +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_REF: a reference to another object */ +struct cpt_obj_ref +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_pos; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VEINFO: various ve specific data */ +struct cpt_veinfo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + /* ipc ctls */ + __u32 shm_ctl_max; + __u32 shm_ctl_all; + __u32 shm_ctl_mni; + __u32 msg_ctl_max; + __u32 msg_ctl_mni; + __u32 msg_ctl_mnb; + __u32 sem_ctl_arr[4]; + + /* start time */ + __u64 start_timespec_delta; + __u64 start_jiffies_delta; + + /* later extension */ + __u32 last_pid; + __u32 pad1; + __u64 reserved[8]; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILE: one struct file */ +struct cpt_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_flags; + __u32 cpt_mode; + __u64 cpt_pos; + __u32 cpt_uid; + __u32 cpt_gid; + + __u32 cpt_i_mode; + __u32 cpt_lflags; +#define CPT_DENTRY_DELETED 1 +#define CPT_DENTRY_ROOT 2 +#define CPT_DENTRY_CLONING 4 +#define CPT_DENTRY_PROC 8 +#define CPT_DENTRY_EPOLL 0x10 +#define CPT_DENTRY_REPLACED 0x20 +#define CPT_DENTRY_INOTIFY 0x40 +#define CPT_DENTRY_FUTEX 0x80 +#define CPT_DENTRY_TUNTAP 0x100 + __u64 cpt_inode; + __u64 cpt_priv; + + __u32 cpt_fown_fd; + __u32 cpt_fown_pid; +#define CPT_FOWN_STRAY_PID 0 + __u32 cpt_fown_uid; + __u32 cpt_fown_euid; + __u32 cpt_fown_signo; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by file name, encoded as CPT_OBJ_NAME */ + +struct cpt_epoll_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_epoll_file */ + +struct cpt_epoll_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_fd; + __u32 cpt_events; + __u64 cpt_data; + __u32 cpt_revents; + __u32 cpt_ready; +} __attribute__ ((aligned (8))); + +struct cpt_inotify_wd_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; +} __attribute__ ((aligned (8))); +/* Followed by cpt_file_image of inode to watch */ + +struct cpt_inotify_ev_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; + __u32 cpt_cookie; + __u32 cpt_namelen; +} __attribute__ ((aligned (8))); +/* Followed by name */ + +struct cpt_inotify_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_user; + __u32 cpt_max_events; + __u32 cpt_last_wd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */ + + +/* CPT_OBJ_FILEDESC: 
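Given struct cpt_object_hdr above, individual records inside a section can be visited by hopping from header to header. The sketch below assumes cpt_next holds the absolute file offset of the following record and that CPT_NULL terminates the chain; section start offsets would presumably come from cpt_major_tail.cpt_sections[]:

#include <stdint.h>
#include <stdio.h>

#define CPT_NULL (~0ULL)

/* mirrors struct cpt_object_hdr above; all records are 64-bit aligned */
struct obj_hdr {
    uint64_t cpt_next;
    uint32_t cpt_object;
    uint16_t cpt_hdrlen;
    uint16_t cpt_content;
};

/* Walk the records of one section under the assumptions stated above.
 * A real reader would use fseeko() to cope with images larger than 2 GB. */
static void walk_section(FILE *img, uint64_t pos, uint64_t section_end)
{
    struct obj_hdr h;

    while (pos != CPT_NULL && pos < section_end) {
        if (fseek(img, (long)pos, SEEK_SET) != 0 ||
            fread(&h, sizeof(h), 1, img) != 1)
            break;
        printf("object type %u at offset %llu, header %u bytes\n",
               h.cpt_object, (unsigned long long)pos, h.cpt_hdrlen);
        pos = h.cpt_next;
    }
}

int main(int argc, char **argv)
{
    FILE *img = argc > 1 ? fopen(argv[1], "rb") : NULL;

    if (!img)
        return 1;
    /* offset 0 and an unbounded end are placeholders to keep the demo runnable */
    walk_section(img, 0, CPT_NULL);
    fclose(img);
    return 0;
}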
one file descriptor */ +struct cpt_fd_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_fd; + __u32 cpt_flags; +#define CPT_FD_FLAG_CLOSEEXEC 1 + __u64 cpt_file; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILES: one files_struct */ +struct cpt_files_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_max_fds; + __u32 cpt_next_fd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of cpt_fd_image */ + +/* CPT_OBJ_FS: one fs_struct */ +struct cpt_fs_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_umask; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ + +/* CPT_OBJ_INODE: one struct inode */ +struct cpt_inode_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_dev; + __u64 cpt_ino; + __u32 cpt_mode; + __u32 cpt_nlink; + __u32 cpt_uid; + __u32 cpt_gid; + __u64 cpt_rdev; + __u64 cpt_size; + __u64 cpt_blksize; + __u64 cpt_atime; + __u64 cpt_mtime; + __u64 cpt_ctime; + __u64 cpt_blocks; + __u32 cpt_sb; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VFSMOUNT: one vfsmount */ +struct cpt_vfsmount_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_mntflags; +#define CPT_MNT_BIND 0x80000000 +#define CPT_MNT_EXT 0x40000000 + __u32 cpt_flags; +} __attribute__ ((aligned (8))); + + +struct cpt_flock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_pid; + __u64 cpt_start; + __u64 cpt_end; + __u32 cpt_flags; + __u32 cpt_type; +} __attribute__ ((aligned (8))); + + +struct cpt_tty_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_flags; + __u32 cpt_link; + __u32 cpt_index; + __u32 cpt_drv_type; + __u32 cpt_drv_subtype; + __u32 cpt_drv_flags; + __u8 cpt_packet; + __u8 cpt_stopped; + __u8 cpt_hw_stopped; + __u8 cpt_flow_stopped; + + __u32 cpt_canon_data; + __u32 cpt_canon_head; + __u32 cpt_canon_column; + __u32 cpt_column; + __u8 cpt_ctrl_status; + __u8 cpt_erasing; + __u8 cpt_lnext; + __u8 cpt_icanon; + __u8 cpt_raw; + __u8 cpt_real_raw; + __u8 cpt_closing; + __u8 __cpt_pad1; + __u16 cpt_minimum_to_wake; + __u16 __cpt_pad2; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_c_line; + __u8 cpt_name[64]; + __u16 cpt_ws_row; + __u16 cpt_ws_col; + __u16 cpt_ws_prow; + __u16 cpt_ws_pcol; + __u8 cpt_c_cc[32]; + __u32 cpt_c_iflag; + __u32 cpt_c_oflag; + __u32 cpt_c_cflag; + __u32 cpt_c_lflag; + __u32 cpt_read_flags[4096/32]; +} __attribute__ ((aligned (8))); + +struct cpt_sock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_parent; + __u32 cpt_index; + + __u64 cpt_ssflags; + __u16 cpt_type; + __u16 cpt_family; + __u8 cpt_sstate; + __u8 cpt_passcred; + __u8 cpt_state; + __u8 cpt_reuse; + + __u8 cpt_zapped; + __u8 cpt_shutdown; + __u8 cpt_userlocks; + __u8 cpt_no_check; + __u8 cpt_debug; + __u8 cpt_rcvtstamp; + __u8 cpt_localroute; + __u8 cpt_protocol; + + __u32 cpt_err; + __u32 cpt_err_soft; + + __u16 cpt_max_ack_backlog; + __u16 __cpt_pad1; + __u32 cpt_priority; + + __u32 cpt_rcvlowat; + __u32 cpt_bound_dev_if; + + __u64 cpt_rcvtimeo; + __u64 cpt_sndtimeo; + __u32 cpt_rcvbuf; + __u32 cpt_sndbuf; + __u64 
cpt_flags; + __u64 cpt_lingertime; + __u32 cpt_peer_pid; + __u32 cpt_peer_uid; + + __u32 cpt_peer_gid; + __u32 cpt_laddrlen; + __u32 cpt_laddr[128/4]; + __u32 cpt_raddrlen; + __u32 cpt_raddr[128/4]; + /* AF_UNIX */ + __u32 cpt_peer; + + __u8 cpt_socketpair; + __u8 cpt_deleted; + __u16 __cpt_pad4; + __u32 __cpt_pad5; +/* + struct sk_filter *sk_filter; + */ + + __u64 cpt_stamp; + __u32 cpt_daddr; + __u16 cpt_dport; + __u16 cpt_sport; + + __u32 cpt_saddr; + __u32 cpt_rcv_saddr; + + __u32 cpt_uc_ttl; + __u32 cpt_tos; + + __u32 cpt_cmsg_flags; + __u32 cpt_mc_index; + + __u32 cpt_mc_addr; +/* + struct ip_options *opt; + */ + __u8 cpt_hdrincl; + __u8 cpt_mc_ttl; + __u8 cpt_mc_loop; + __u8 cpt_pmtudisc; + + __u8 cpt_recverr; + __u8 cpt_freebind; + __u16 cpt_idcounter; + __u32 cpt_cork_flags; + + __u32 cpt_cork_fragsize; + __u32 cpt_cork_length; + __u32 cpt_cork_addr; + __u32 cpt_cork_saddr; + __u32 cpt_cork_daddr; + __u32 cpt_cork_oif; + + __u32 cpt_udp_pending; + __u32 cpt_udp_corkflag; + __u16 cpt_udp_encap; + __u16 cpt_udp_len; + __u32 __cpt_pad7; + + __u64 cpt_saddr6[2]; + __u64 cpt_rcv_saddr6[2]; + __u64 cpt_daddr6[2]; + __u32 cpt_flow_label6; + __u32 cpt_frag_size6; + __u32 cpt_hop_limit6; + __u32 cpt_mcast_hops6; + + __u32 cpt_mcast_oif6; + __u8 cpt_rxopt6; + __u8 cpt_mc_loop6; + __u8 cpt_recverr6; + __u8 cpt_sndflow6; + + __u8 cpt_pmtudisc6; + __u8 cpt_ipv6only6; + __u8 cpt_mapped; + __u8 __cpt_pad8; + __u32 cpt_pred_flags; + + __u32 cpt_rcv_nxt; + __u32 cpt_snd_nxt; + + __u32 cpt_snd_una; + __u32 cpt_snd_sml; + + __u32 cpt_rcv_tstamp; + __u32 cpt_lsndtime; + + __u8 cpt_tcp_header_len; + __u8 cpt_ack_pending; + __u8 cpt_quick; + __u8 cpt_pingpong; + __u8 cpt_blocked; + __u8 __cpt_pad9; + __u16 __cpt_pad10; + + __u32 cpt_ato; + __u32 cpt_ack_timeout; + + __u32 cpt_lrcvtime; + __u16 cpt_last_seg_size; + __u16 cpt_rcv_mss; + + __u32 cpt_snd_wl1; + __u32 cpt_snd_wnd; + + __u32 cpt_max_window; + __u32 cpt_pmtu_cookie; + + __u32 cpt_mss_cache; + __u16 cpt_mss_cache_std; + __u16 cpt_mss_clamp; + + __u16 cpt_ext_header_len; + __u16 cpt_ext2_header_len; + __u8 cpt_ca_state; + __u8 cpt_retransmits; + __u8 cpt_reordering; + __u8 cpt_frto_counter; + + __u32 cpt_frto_highmark; + __u8 cpt_adv_cong; + __u8 cpt_defer_accept; + __u8 cpt_backoff; + __u8 __cpt_pad11; + + __u32 cpt_srtt; + __u32 cpt_mdev; + + __u32 cpt_mdev_max; + __u32 cpt_rttvar; + + __u32 cpt_rtt_seq; + __u32 cpt_rto; + + __u32 cpt_packets_out; + __u32 cpt_left_out; + + __u32 cpt_retrans_out; + __u32 cpt_snd_ssthresh; + + __u32 cpt_snd_cwnd; + __u16 cpt_snd_cwnd_cnt; + __u16 cpt_snd_cwnd_clamp; + + __u32 cpt_snd_cwnd_used; + __u32 cpt_snd_cwnd_stamp; + + __u32 cpt_timeout; + __u32 cpt_ka_timeout; + + __u32 cpt_rcv_wnd; + __u32 cpt_rcv_wup; + + __u32 cpt_write_seq; + __u32 cpt_pushed_seq; + + __u32 cpt_copied_seq; + __u8 cpt_tstamp_ok; + __u8 cpt_wscale_ok; + __u8 cpt_sack_ok; + __u8 cpt_saw_tstamp; + + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + __u8 cpt_nonagle; + __u8 cpt_keepalive_probes; + __u32 cpt_rcv_tsval; + + __u32 cpt_rcv_tsecr; + __u32 cpt_ts_recent; + + __u64 cpt_ts_recent_stamp; + __u16 cpt_user_mss; + __u8 cpt_dsack; + __u8 cpt_eff_sacks; + __u32 cpt_sack_array[2*5]; + __u32 cpt_window_clamp; + + __u32 cpt_rcv_ssthresh; + __u8 cpt_probes_out; + __u8 cpt_num_sacks; + __u16 cpt_advmss; + + __u8 cpt_syn_retries; + __u8 cpt_ecn_flags; + __u16 cpt_prior_ssthresh; + __u32 cpt_lost_out; + + __u32 cpt_sacked_out; + __u32 cpt_fackets_out; + + __u32 cpt_high_seq; + __u32 cpt_retrans_stamp; + + __u32 cpt_undo_marker; + __u32 
cpt_undo_retrans; + + __u32 cpt_urg_seq; + __u16 cpt_urg_data; + __u8 cpt_pending; + __u8 cpt_urg_mode; + + __u32 cpt_snd_up; + __u32 cpt_keepalive_time; + + __u32 cpt_keepalive_intvl; + __u32 cpt_linger2; + + __u32 cpt_rcvrtt_rtt; + __u32 cpt_rcvrtt_seq; + + __u32 cpt_rcvrtt_time; + __u32 __cpt_pad12; +} __attribute__ ((aligned (8))); + +struct cpt_sockmc_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u16 cpt_family; + __u16 cpt_mode; + __u32 cpt_ifindex; + __u32 cpt_mcaddr[4]; +} __attribute__ ((aligned (8))); +/* Followed by array of source addresses, each zero padded to 16 bytes */ + +struct cpt_openreq_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_rcv_isn; + __u32 cpt_snt_isn; + + __u16 cpt_rmt_port; + __u16 cpt_mss; + __u8 cpt_family; + __u8 cpt_retrans; + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + + __u8 cpt_tstamp_ok; + __u8 cpt_sack_ok; + __u8 cpt_wscale_ok; + __u8 cpt_ecn_ok; + __u8 cpt_acked; + __u8 __cpt_pad1; + __u16 __cpt_pad2; + + __u32 cpt_window_clamp; + __u32 cpt_rcv_wnd; + __u32 cpt_ts_recent; + __u32 cpt_iif; + __u64 cpt_expires; + + __u64 cpt_loc_addr[2]; + __u64 cpt_rmt_addr[2]; +/* + struct ip_options *opt; + */ + +} __attribute__ ((aligned (8))); + +struct cpt_skb_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_queue; +#define CPT_SKB_NQ 0 +#define CPT_SKB_RQ 1 +#define CPT_SKB_WQ 2 +#define CPT_SKB_OFOQ 3 + + __u64 cpt_stamp; + __u32 cpt_len; + __u32 cpt_hspace; + __u32 cpt_tspace; + __u32 cpt_h; + __u32 cpt_nh; + __u32 cpt_mac; + + __u64 cpt_cb[5]; + __u32 cpt_mac_len; + __u32 cpt_csum; + __u8 cpt_local_df; + __u8 cpt_pkt_type; + __u8 cpt_ip_summed; + __u8 __cpt_pad1; + __u32 cpt_priority; + __u16 cpt_protocol; + __u16 cpt_security; + __u16 cpt_gso_segs; + __u16 cpt_gso_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvshm_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + + __u32 cpt_id; + __u32 cpt_mlockuser; + __u64 cpt_segsz; + __u64 cpt_atime; + __u64 cpt_ctime; + __u64 cpt_dtime; + __u64 cpt_creator; + __u64 cpt_last; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvsem_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_otime; + __u64 cpt_ctime; +} __attribute__ ((aligned (8))); +/* Content is array of pairs semval/sempid */ + +struct cpt_sysvsem_undo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_id; + __u32 cpt_nsem; +} __attribute__ ((aligned (8))); + +struct cpt_sysvmsg_msg_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_type; + __u64 cpt_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvmsg_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_stime; + __u64 cpt_rtime; + __u64 cpt_ctime; + __u64 cpt_last_sender; + __u64 cpt_last_receiver; + __u64 cpt_qbytes; +} __attribute__ 
((aligned (8))); +/* Content is array of sysv msg */ + + +struct cpt_mm_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start_code; + __u64 cpt_end_code; + __u64 cpt_start_data; + __u64 cpt_end_data; + __u64 cpt_start_brk; + __u64 cpt_brk; + __u64 cpt_start_stack; + __u64 cpt_start_arg; + __u64 cpt_end_arg; + __u64 cpt_start_env; + __u64 cpt_end_env; + __u64 cpt_def_flags; + __u64 cpt_mmub; + __u8 cpt_dumpable; + __u8 cpt_vps_dumpable; + __u8 cpt_used_hugetlb; + __u8 __cpt_pad; + __u32 cpt_vdso; +} __attribute__ ((aligned (8))); + +struct cpt_page_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); + +struct cpt_remappage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_copypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_source; +} __attribute__ ((aligned (8))); + +struct cpt_lazypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_index; +} __attribute__ ((aligned (8))); + +struct cpt_iterpage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); +/* Followed by array of PFNs */ + +struct cpt_vma_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_type; +#define CPT_VMA_TYPE_0 0 +#define CPT_VMA_TYPE_SHM 1 +#define CPT_VMA_VDSO 2 + __u32 cpt_anonvma; + __u64 cpt_anonvmaid; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_flags; + __u64 cpt_pgprot; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_aio_ctx_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_max_reqs; + __u32 cpt_ring_pages; + __u32 cpt_tail; + __u32 cpt_nr; + __u64 cpt_mmap_base; + /* Data (io_event's) and struct aio_ring are stored in user space VM */ +} __attribute__ ((aligned (8))); + + +/* Format of MM section. + * + * It is array of MM objects (mm_struct). Each MM object is + * header, encoding mm_struct, followed by array of VMA objects. + * Each VMA consists of VMA header, encoding vm_area_struct, and + * if the VMA contains copied pages, the header is followed by + * array of tuples start-end each followed by data. + * + * ATTN: no block/page alignment. Only 64bit alignment. This might be not good? 
+ */ + +struct cpt_restart_block { + __u64 fn; +#define CPT_RBL_0 0 +#define CPT_RBL_NANOSLEEP 1 +#define CPT_RBL_COMPAT_NANOSLEEP 2 +#define CPT_RBL_POLL 3 +#define CPT_RBL_FUTEX_WAIT 4 + __u64 arg0; + __u64 arg1; + __u64 arg2; + __u64 arg3; +} __attribute__ ((aligned (8))); + +struct cpt_siginfo_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_qflags; + __u32 cpt_signo; + __u32 cpt_errno; + __u32 cpt_code; + + __u64 cpt_sigval; + __u32 cpt_pid; + __u32 cpt_uid; + __u64 cpt_utime; + __u64 cpt_stime; + + __u64 cpt_user; +} __attribute__ ((aligned (8))); + +/* Portable presentaions for segment registers */ + +#define CPT_SEG_ZERO 0 +#define CPT_SEG_TLS1 1 +#define CPT_SEG_TLS2 2 +#define CPT_SEG_TLS3 3 +#define CPT_SEG_USER32_DS 4 +#define CPT_SEG_USER32_CS 5 +#define CPT_SEG_USER64_DS 6 +#define CPT_SEG_USER64_CS 7 +#define CPT_SEG_LDT 256 + +struct cpt_x86_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_debugreg[8]; + __u32 cpt_fs; + __u32 cpt_gs; + + __u32 cpt_ebx; + __u32 cpt_ecx; + __u32 cpt_edx; + __u32 cpt_esi; + __u32 cpt_edi; + __u32 cpt_ebp; + __u32 cpt_eax; + __u32 cpt_xds; + __u32 cpt_xes; + __u32 cpt_orig_eax; + __u32 cpt_eip; + __u32 cpt_xcs; + __u32 cpt_eflags; + __u32 cpt_esp; + __u32 cpt_xss; + __u32 pad; +}; + +struct cpt_x86_64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_debugreg[8]; + + __u64 cpt_fsbase; + __u64 cpt_gsbase; + __u32 cpt_fsindex; + __u32 cpt_gsindex; + __u32 cpt_ds; + __u32 cpt_es; + + __u64 cpt_r15; + __u64 cpt_r14; + __u64 cpt_r13; + __u64 cpt_r12; + __u64 cpt_rbp; + __u64 cpt_rbx; + __u64 cpt_r11; + __u64 cpt_r10; + __u64 cpt_r9; + __u64 cpt_r8; + __u64 cpt_rax; + __u64 cpt_rcx; + __u64 cpt_rdx; + __u64 cpt_rsi; + __u64 cpt_rdi; + __u64 cpt_orig_rax; + __u64 cpt_rip; + __u64 cpt_cs; + __u64 cpt_eflags; + __u64 cpt_rsp; + __u64 cpt_ss; +}; + +struct cpt_ia64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 gr[128]; + __u64 fr[256]; + __u64 br[8]; + __u64 nat[2]; + + __u64 ar_bspstore; + __u64 num_regs; + __u64 loadrs; + __u64 ar_bsp; + __u64 ar_unat; + __u64 ar_pfs; + __u64 ar_ccv; + __u64 ar_fpsr; + __u64 ar_csd; + __u64 ar_ssd; + __u64 ar_ec; + __u64 ar_lc; + __u64 ar_rsc; + __u64 ar_rnat; + + __u64 cr_iip; + __u64 cr_ipsr; + + __u64 cfm; + __u64 pr; + + __u64 ibr[8]; + __u64 dbr[8]; +}; + + +struct cpt_task_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_state; + __u64 cpt_flags; + __u64 cpt_ptrace; + __u32 cpt_prio; + __u32 cpt_static_prio; + __u32 cpt_policy; + __u32 cpt_rt_priority; + + /* struct thread_info */ + __u64 cpt_exec_domain; + __u64 cpt_thrflags; + __u64 cpt_thrstatus; + __u64 cpt_addr_limit; + + __u64 cpt_personality; + + __u64 cpt_mm; + __u64 cpt_files; + __u64 cpt_fs; + __u64 cpt_signal; + __u64 cpt_sighand; + __u64 cpt_sigblocked; + __u64 cpt_sigrblocked; + __u64 cpt_sigpending; + __u64 cpt_namespace; + __u64 cpt_sysvsem_undo; + __u32 cpt_pid; + __u32 cpt_tgid; + __u32 cpt_ppid; + __u32 cpt_rppid; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_old_pgrp; + __u32 __cpt_pad; + __u32 cpt_leader; + __u8 cpt_pn_state; + __u8 cpt_stopped_state; + __u8 cpt_sigsuspend_state; + __u8 cpt_64bit; + __u64 cpt_set_tid; + __u64 cpt_clear_tid; + __u32 cpt_exit_code; + __u32 cpt_exit_signal; + __u32 cpt_pdeath_signal; + __u32 cpt_user; + __u32 cpt_uid; + __u32 cpt_euid; + __u32 cpt_suid; + 
__u32 cpt_fsuid; + __u32 cpt_gid; + __u32 cpt_egid; + __u32 cpt_sgid; + __u32 cpt_fsgid; + __u32 cpt_ngids; + __u32 cpt_gids[32]; + __u8 cpt_prctl_uac; + __u8 cpt_prctl_fpemu; + __u16 __cpt_pad1; + __u64 cpt_ecap; + __u64 cpt_icap; + __u64 cpt_pcap; + __u8 cpt_comm[16]; + __u64 cpt_tls[3]; + struct cpt_restart_block cpt_restart; + __u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_prof_value; + __u64 cpt_it_prof_incr; + __u64 cpt_it_virt_value; + __u64 cpt_it_virt_incr; + + __u16 cpt_used_math; + __u8 cpt_keepcap; + __u8 cpt_did_exec; + __u32 cpt_ptrace_message; + + __u64 cpt_utime; + __u64 cpt_stime; + __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */ + __u64 cpt_nvcsw; + __u64 cpt_nivcsw; + __u64 cpt_min_flt; + __u64 cpt_maj_flt; + + __u64 cpt_sigsuspend_blocked; + __u64 cpt_cutime, cpt_cstime; + __u64 cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_cmin_flt, cpt_cmaj_flt; + +#define CPT_RLIM_NLIMITS 16 + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; + + __u64 cpt_task_ub; + __u64 cpt_exec_ub; + __u64 cpt_mm_ub; + __u64 cpt_fork_sub; +} __attribute__ ((aligned (8))); + +struct cpt_sigaltstack_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_stack; + __u32 cpt_stacksize; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_task_aux_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_robust_list; + __u64 __cpt_future[16]; +} __attribute__ ((aligned (8))); + + +struct cpt_signal_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_leader; + __u8 cpt_pgrp_type; + __u8 cpt_old_pgrp_type; + __u8 cpt_session_type; +#define CPT_PGRP_NORMAL 0 +#define CPT_PGRP_ORPHAN 1 +#define CPT_PGRP_STRAY 2 + __u8 __cpt_pad1; + __u64 cpt_pgrp; + __u64 cpt_old_pgrp; + __u64 cpt_session; + __u64 cpt_sigpending; + __u64 cpt_ctty; + + __u32 cpt_curr_target; + __u32 cpt_group_exit; + __u32 cpt_group_exit_code; + __u32 cpt_group_exit_task; + __u32 cpt_notify_count; + __u32 cpt_group_stop_count; + __u32 cpt_stop_state; + __u32 __cpt_pad2; + + __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; + __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; + + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; +} __attribute__ ((aligned (8))); +/* Followed by list of posix timers. */ + +struct cpt_sighand_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + +} __attribute__ ((aligned (8))); +/* Followed by list of sighandles. 
*/ + +struct cpt_sighandler_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_signo; + __u32 __cpt_pad1; + __u64 cpt_handler; + __u64 cpt_restorer; + __u64 cpt_flags; + __u64 cpt_mask; +} __attribute__ ((aligned (8))); + +struct cpt_netdev_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_flags; + __u8 cpt_name[16]; +} __attribute__ ((aligned (8))); + +struct cpt_tuntap_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_attached; + __u64 cpt_flags; + __u64 cpt_bindfile; + __u64 cpt_if_flags; + __u8 cpt_dev_addr[6]; + __u16 cpt_pad; + __u32 cpt_chr_filter[2]; + __u32 cpt_net_filter[2]; +} __attribute__ ((aligned (8))); + +struct cpt_veth_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_allow_mac_change; + __u32 __cpt_pad; +} __attribute__ ((aligned (8))); + +struct cpt_hwaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u8 cpt_dev_addr[32]; +} __attribute__ ((aligned (8))); + +struct cpt_netstats_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_rx_packets; + __u64 cpt_tx_packets; + __u64 cpt_rx_bytes; + __u64 cpt_tx_bytes; + __u64 cpt_rx_errors; + __u64 cpt_tx_errors; + __u64 cpt_rx_dropped; + __u64 cpt_tx_dropped; + __u64 cpt_multicast; + __u64 cpt_collisions; + __u64 cpt_rx_length_errors; + __u64 cpt_rx_over_errors; + __u64 cpt_rx_crc_errors; + __u64 cpt_rx_frame_errors; + __u64 cpt_rx_fifo_errors; + __u64 cpt_rx_missed_errors; + __u64 cpt_tx_aborted_errors; + __u64 cpt_tx_carrier_errors; + __u64 cpt_tx_fifo_errors; + __u64 cpt_tx_heartbeat_errors; + __u64 cpt_tx_window_errors; + __u64 cpt_rx_compressed; + __u64 cpt_tx_compressed; + __u64 pad[4]; +} __attribute__ ((aligned (8))); + +struct cpt_ifaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u8 cpt_family; + __u8 cpt_masklen; + __u8 cpt_flags; + __u8 cpt_scope; + __u32 cpt_address[4]; + __u32 cpt_peer[4]; + __u32 cpt_broadcast[4]; + __u8 cpt_label[16]; + __u32 cpt_valid_lft; + __u32 cpt_prefered_lft; +} __attribute__ ((aligned (8))); + +struct cpt_ipct_tuple +{ + __u32 cpt_src; + __u16 cpt_srcport; + __u16 __cpt_pad1; + + __u32 cpt_dst; + __u16 cpt_dstport; + __u8 cpt_protonum; + __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ +} __attribute__ ((aligned (8))); + +struct cpt_nat_manip +{ + __u8 cpt_direction; + __u8 cpt_hooknum; + __u8 cpt_maniptype; + __u8 __cpt_pad1; + + __u32 cpt_manip_addr; + __u16 cpt_manip_port; + __u16 __cpt_pad2; + __u32 __cpt_pad3; +} __attribute__ ((aligned (8))); + +struct cpt_nat_seq +{ + __u32 cpt_correction_pos; + __u32 cpt_offset_before; + __u32 cpt_offset_after; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_ip_connexpect_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_timeout; + __u32 cpt_sibling_conntrack; /* Index of child conntrack */ + __u32 cpt_seq; /* id in 2.6.15 */ + + struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ + struct cpt_ipct_tuple cpt_tuple; + struct cpt_ipct_tuple cpt_mask; + + /* union ip_conntrack_expect_help. 
Used by ftp, irc, amanda */ + __u32 cpt_help[3]; /* NU 2.6.15 */ + __u16 cpt_manip_proto; + __u8 cpt_dir; + __u8 cpt_flags; +} __attribute__ ((aligned (8))); + +struct cpt_ip_conntrack_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + struct cpt_ipct_tuple cpt_tuple[2]; + __u64 cpt_status; + __u64 cpt_timeout; + __u32 cpt_index; + __u8 cpt_ct_helper; + __u8 cpt_nat_helper; + __u16 cpt_pad1; + + /* union ip_conntrack_proto. Used by tcp and icmp. */ + __u32 cpt_proto_data[12]; + + /* union ip_conntrack_help. Used by ftp and pptp helper. + * We do not support pptp... + */ + __u32 cpt_help_data[6]; + + /* nat info */ + __u32 cpt_initialized; /* NU 2.6.15 */ + __u32 cpt_num_manips; /* NU 2.6.15 */ + struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ + + struct cpt_nat_seq cpt_nat_seq[2]; + + __u32 cpt_masq_index; + __u32 cpt_id; + __u32 cpt_mark; +} __attribute__ ((aligned (8))); + +struct cpt_ubparm +{ + __u64 barrier; + __u64 limit; + __u64 held; + __u64 maxheld; + __u64 minheld; + __u64 failcnt; +} __attribute__ ((aligned (8))); + +struct cpt_beancounter_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_parent; + __u32 cpt_id; + __u32 __cpt_pad; + struct cpt_ubparm cpt_parms[32 * 2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_sgreg_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; + __u32 cpt_id; + __u16 cpt_resource; + __u8 cpt_regname[32]; + __u8 __cpt_pad2[2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_obj_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +#ifdef __KERNEL__ + +static inline void __user * cpt_ptr_import(__u64 ptr) +{ + return (void*)(unsigned long)ptr; +} + +static inline __u64 cpt_ptr_export(void __user *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) +{ + memcpy(sig, &ptr, sizeof(*sig)); +} + +static inline __u64 cpt_sigset_export(sigset_t *sig) +{ + return *(__u64*)sig; +} + +static inline __u64 cpt_timespec_export(struct timespec *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; +} + +static inline void cpt_timespec_import(struct timespec *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_nsec = (val&0xFFFFFFFF); +} + +static inline __u64 cpt_timeval_export(struct timeval *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_usec; +} + +static inline void cpt_timeval_import(struct timeval *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_usec = (val&0xFFFFFFFF); +} + +#endif + +#endif /* __CPT_IMAGE_H_ */ diff --git a/include/linux/cpt_ioctl.h b/include/linux/cpt_ioctl.h new file mode 100644 index 0000000..b8e83cc --- /dev/null +++ b/include/linux/cpt_ioctl.h @@ -0,0 +1,43 @@ +/* + * + * include/linux/cpt_ioctl.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _CPT_IOCTL_H_ +#define _CPT_IOCTL_H_ 1 + +#include +#include + +#define CPTCTLTYPE '-' +#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) +#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) +#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) +#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) +#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) +#define CPT_DUMP _IO(CPTCTLTYPE, 6) +#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) +#define CPT_RESUME _IO(CPTCTLTYPE, 8) +#define CPT_KILL _IO(CPTCTLTYPE, 9) +#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) +#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) +#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) +#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) +#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) +#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) +#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) +#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) +#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) +#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) +#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) +#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) + +#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) + +#endif diff --git a/include/linux/dcache.h b/include/linux/dcache.h index efba1de..d66ceed 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -8,6 +8,8 @@ #include #include +#include + struct nameidata; struct path; struct vfsmount; @@ -111,6 +113,9 @@ struct dentry { struct dcookie_struct *d_cookie; /* cookie, if any */ #endif int d_mounted; +#ifdef CONFIG_BEANCOUNTERS + struct dentry_beancounter dentry_bc; +#endif unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; @@ -174,9 +179,13 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 +#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ + +extern void mark_tree_virtual(struct path *path); #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ +extern struct kmem_cache *dentry_cache; extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -304,6 +313,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *path, struct path *root, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path(struct dentry *, char *, int); +extern int d_root_check(struct path *path); /* Allocation counts.. 
*/ @@ -323,6 +333,12 @@ extern char *dentry_path(struct dentry *, char *, int); static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { +#ifdef CONFIG_BEANCOUNTERS + preempt_disable(); + if (ub_dentry_on && ub_dget_testone(dentry)) + BUG(); + preempt_enable_no_resched(); +#endif BUG_ON(!atomic_read(&dentry->d_count)); atomic_inc(&dentry->d_count); } @@ -365,4 +381,5 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; +extern int check_area_access_ve(struct path *); #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/device.h b/include/linux/device.h index 4d8372d..08a186d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -199,8 +199,16 @@ struct class { struct class_private *p; }; +#ifndef CONFIG_VE extern struct kobject *sysfs_dev_block_kobj; extern struct kobject *sysfs_dev_char_kobj; +#define ve_sysfs_dev_block_kobj sysfs_dev_block_kobj +#define ve_sysfs_dev_char_kobj sysfs_dev_char_kobj +#else +#define ve_sysfs_dev_block_kobj (get_exec_env()->dev_block_kobj) +#define ve_sysfs_dev_char_kobj (get_exec_env()->dev_char_kobj) +#endif + extern int __must_check __class_register(struct class *class, struct lock_class_key *key); extern void class_unregister(struct class *class); @@ -250,6 +258,15 @@ extern struct class * __must_check __class_create(struct module *owner, struct lock_class_key *key); extern void class_destroy(struct class *cls); +extern struct class net_class; +extern struct kset *class_kset; + +int classes_init(void); +void classes_fini(void); + +int devices_init(void); +void devices_fini(void); + /* This is a #define to keep the compiler from merging different * instances of the __key variable */ #define class_create(owner, name) \ diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 154769c..ee767ed 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -23,6 +23,16 @@ int devpts_pty_new(struct tty_struct *tty); /* mknod in devpts */ struct tty_struct *devpts_get_tty(int number); /* get tty structure */ void devpts_pty_kill(int number); /* unlink */ +struct devpts_config { + int setuid; + int setgid; + uid_t uid; + gid_t gid; + umode_t mode; +}; + +extern struct devpts_config devpts_config; +extern struct file_system_type devpts_fs_type; #else /* Dummy stubs in the no-pty case */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 639624b..be231eb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -56,6 +56,11 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; void (*trim)(struct io_context *); + /* In original cfq design task holds a cfqq refcount and puts it + * on exit via io context. Now async cfqqs are hold by UB, + * so we need somehow to put these queues. Use this function. + */ + void (*put_queue)(struct cfq_queue *); }; #define ELV_NAME_MAX (16) diff --git a/include/linux/elf.h b/include/linux/elf.h index edc3dac..079ffc7 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -404,4 +404,6 @@ extern int elf_coredump_extra_notes_size(void); extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset); #endif +extern int sysctl_at_vsyscall; + #endif /* _LINUX_ELF_H */ diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f1e1d3c..9ac8dd6 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -17,6 +17,7 @@ /* For O_CLOEXEC */ #include #include +#include /* Flags for epoll_create1. 
*/ #define EPOLL_CLOEXEC O_CLOEXEC @@ -64,6 +65,88 @@ static inline void eventpoll_init_file(struct file *file) spin_lock_init(&file->f_ep_lock); } +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and rapresent the main data sructure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protect the this structure access */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them. This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. + */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transfering ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB tree node used to link this structure to the eventpoll RB tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queue attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; +}; + +extern struct semaphore epsem; +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -96,6 +179,8 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } +extern struct mutex epmutex; + #else static inline void eventpoll_init_file(struct file *file) {} diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h new file mode 100644 index 0000000..e08c84d --- /dev/null +++ b/include/linux/fairsched.h @@ -0,0 +1,86 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __LINUX_FAIRSCHED_H__ +#define __LINUX_FAIRSCHED_H__ + +#define FAIRSCHED_SET_RATE 0 +#define FAIRSCHED_DROP_RATE 1 +#define FAIRSCHED_GET_RATE 2 + +#ifdef __KERNEL__ + +/* refcnt change protected with tasklist write lock */ +struct fairsched_node { + struct task_group *tg; + int refcnt; + unsigned id; + struct list_head nodelist; + + unsigned weight; + unsigned char rate_limited; + unsigned rate; +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif +}; + +#ifdef CONFIG_VZ_FAIRSCHED + +#define FAIRSCHED_INIT_NODE_ID INT_MAX + +extern struct fairsched_node fairsched_init_node; + +void fairsched_init_early(void); +void fairsched_init_late(void); + +static inline int task_fairsched_node_id(struct task_struct *p) +{ + return p->fsched_node->id; +} + +/* must called with tasklist write locked */ +static inline void get_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt++; +} +static inline void put_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt--; +} + +#define INIT_VZ_FAIRSCHED .fsched_node = &fairsched_init_node, + +#define FSCHWEIGHT_MAX ((1 << 16) - 1) +#define FSCHRATE_SHIFT 10 +#define FSCH_TIMESLICE 16 + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid); +asmlinkage int sys_fairsched_rmnod(unsigned int id); +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus); +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight); +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate); + +#else /* CONFIG_VZ_FAIRSCHED */ + +static inline void fairsched_init_early(void) { } +static inline void fairsched_init_late(void) { } +static inline int task_fairsched_node_id(struct task_struct *p) { return 0; } +static inline void get_task_fairsched_node(struct task_struct *p) { } +static inline void put_task_fairsched_node(struct task_struct *p) { } + +#define INIT_VZ_FAIRSCHED + +#endif /* CONFIG_VZ_FAIRSCHED */ +#endif /* __KERNEL__ */ + +#endif /* __LINUX_FAIRSCHED_H__ */ diff --git a/include/linux/faudit.h b/include/linux/faudit.h new file mode 100644 index 0000000..631c42e --- /dev/null +++ b/include/linux/faudit.h @@ -0,0 +1,45 @@ +/* + * include/linux/faudit.h + * + * Copyright (C) 2005 SWSoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __FAUDIT_H_ +#define __FAUDIT_H_ + +#include + +struct vfsmount; +struct dentry; +struct super_block; +struct kstatfs; +struct kstat; +struct pt_regs; + +struct faudit_regs_arg { + int err; + struct pt_regs *regs; +}; + +struct faudit_stat_arg { + int err; + struct vfsmount *mnt; + struct dentry *dentry; + struct kstat *stat; +}; + +struct faudit_statfs_arg { + int err; + struct super_block *sb; + struct kstatfs *stat; +}; + +#define VIRTINFO_FAUDIT (0) +#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) +#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) + +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 580b513..a612846 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -50,6 +50,7 @@ extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; +extern int odirect_enable; #ifdef CONFIG_DNOTIFY extern int dir_notify_enable; #endif @@ -62,6 +63,7 @@ extern int dir_notify_enable; #define MAY_APPEND 8 #define MAY_ACCESS 16 #define MAY_OPEN 32 +#define MAY_QUOTACTL 16 /* for devgroup-vs-openvz only */ #define FMODE_READ 1 #define FMODE_WRITE 2 @@ -70,6 +72,7 @@ extern int dir_notify_enable; #define FMODE_LSEEK 4 #define FMODE_PREAD 8 #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +#define FMODE_QUOTACTL 4 /* File is being opened for execution. Primary users of this flag are distributed filesystems that can use it to achieve correct ETXTBUSY @@ -96,6 +99,8 @@ extern int dir_notify_enable; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ +#define FS_MANGLE_PROC 128 /* hide some /proc/mounts info inside VE */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. @@ -369,6 +374,9 @@ struct iattr { * Includes for diskquotas. 
*/ #include +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) +#include +#endif /** * enum positive_aop_returns - aop return codes with specific semantics @@ -651,6 +659,9 @@ struct inode { #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_ilink i_qlnk; +#endif struct list_head i_devices; union { struct pipe_inode_info *i_pipe; @@ -706,6 +717,8 @@ enum inode_i_mutex_lock_class extern void inode_double_lock(struct inode *inode1, struct inode *inode2); extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); +extern struct kmem_cache *inode_cachep; + /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic @@ -825,6 +838,7 @@ struct file { struct fown_struct f_owner; unsigned int f_uid, f_gid; struct file_ra_state f_ra; + struct user_beancounter *f_ub; u64 f_version; #ifdef CONFIG_SECURITY @@ -842,6 +856,7 @@ struct file { #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif + struct ve_struct *owner_env; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -952,6 +967,9 @@ struct file_lock { fl_owner_t fl_owner; unsigned char fl_flags; unsigned char fl_type; +#ifdef CONFIG_BEANCOUNTERS + unsigned char fl_charged; +#endif unsigned int fl_pid; struct pid *fl_nspid; wait_queue_head_t fl_wait; @@ -1260,6 +1278,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + struct file * (*get_host)(struct file *); }; struct inode_operations { @@ -1326,6 +1345,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + struct inode *(*get_quota_root)(struct super_block *); #endif }; @@ -1502,8 +1522,14 @@ struct file_system_type { struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; struct lock_class_key i_alloc_sem_key; + + struct file_system_type *proto; + struct ve_struct *owner_env; }; +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); + extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), @@ -1543,6 +1569,11 @@ extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) +extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *, + struct file_system_type **, struct vfsmount **); +extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *); +extern void umount_ve_fs_type(struct file_system_type *local_fs_type); +#define kern_umount mntput extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); @@ -1550,6 +1581,7 @@ extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); 
+extern int faudit_statfs(struct super_block *, struct kstatfs *); /* /sys/fs */ extern struct kobject *fs_kobj; @@ -1722,7 +1754,8 @@ extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes_check(struct super_block *, int check); +#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0) unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end, bool be_atomic); @@ -2147,6 +2180,17 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ +static inline void *file_private(struct file *file) +{ + struct file *host = file; + + while (host->f_op->get_host) { + host = host->f_op->get_host(host); + BUG_ON(host->f_mapping != file->f_mapping); + } + return host->private_data; +} + struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/futex.h b/include/linux/futex.h index 586ab56..9bf4c37 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -124,7 +124,7 @@ struct robust_list_head { #ifdef __KERNEL__ long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, u32 __user *uaddr2, u32 val2, u32 val3); - +long futex_wait_restart(struct restart_block *restart); extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e8003af..4302d3b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -50,20 +50,25 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ +#define __GFP_UBC ((__force gfp_t)0x200000u)/* charge kmem in buddy and slab */ +#define __GFP_SOFT_UBC ((__force gfp_t)0x400000u)/* use soft charging */ -#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ #define GFP_ATOMIC (__GFP_HIGH) +#define GFP_ATOMIC_UBC (__GFP_HIGH | __GFP_UBC) #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006c..5d48dcb 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -7,6 +7,9 @@ #include #include +#include +#include + /* * We put the hardirq and softirq counter into the preemption * counter. 
The bitmask has the following meaning: @@ -126,6 +129,24 @@ extern void rcu_irq_exit(void); # define rcu_irq_exit() do { } while (0) #endif /* CONFIG_PREEMPT_RCU */ +#define save_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_save_context(tsk); \ + ub_save_context(tsk); \ + } \ + } while (0) + +#define restore_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_restore_context(tsk); \ + ub_restore_context(tsk); \ + } \ + } while (0) + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -137,6 +158,7 @@ extern void rcu_irq_exit(void); rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ + save_context(); \ trace_hardirq_enter(); \ } while (0) @@ -152,6 +174,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_system_vtime(current); \ + restore_context(); \ sub_preempt_count(HARDIRQ_OFFSET); \ rcu_irq_exit(); \ } while (0) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2f245fe..84a4828 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -352,6 +352,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); +#ifdef CONFIG_COMPAT +long compat_nanosleep_restart(struct restart_block *restart); +#endif extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 6badb3e..50c628d 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -42,6 +42,7 @@ #define BRCTL_SET_PORT_PRIORITY 16 #define BRCTL_SET_PATH_COST 17 #define BRCTL_GET_FDB_ENTRIES 18 +#define BRCTL_SET_VIA_ORIG_DEV 19 #define BR_STATE_DISABLED 0 #define BR_STATE_LISTENING 1 @@ -70,6 +71,7 @@ struct __bridge_info __u32 tcn_timer_value; __u32 topology_change_timer_value; __u32 gc_timer_value; + __u8 via_phys_dev; }; struct __port_info @@ -104,9 +106,12 @@ struct __fdb_entry #include +#define BR_ALREADY_SEEN 1 + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); +extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); extern int (*br_should_route_hook)(struct sk_buff *skb); #endif diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 8529f57..68d3962 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -82,4 +82,44 @@ struct tun_filter { __u8 addr[0][ETH_ALEN]; }; +struct sk_buff_head; + +#define FLT_EXACT_COUNT 8 +struct tap_filter { + unsigned int count; /* Number of addrs. 
Zero means disabled */ + u32 mask[2]; /* Mask of the hashed addrs */ + unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; +}; + +struct tun_struct { + struct list_head list; + unsigned int flags; + int attached; + uid_t owner; + gid_t group; + + wait_queue_head_t read_wait; + struct sk_buff_head readq; + + struct net_device *dev; + struct fasync_struct *fasync; + struct file *bind_file; + + struct tap_filter txflt; + +#ifdef TUN_DEBUG + int debug; +#endif +}; + +struct tun_net { + struct list_head dev_list; +}; + +extern int tun_net_open(struct net_device *dev); +extern int tun_chr_open(struct inode *inode, struct file * file); +extern void tun_net_init(struct net_device *dev); +extern void tun_setup(struct net_device *dev); +extern struct list_head tun_dev_list; + #endif /* __IF_TUN_H */ diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 9e7b49b..3dbff65 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -84,6 +84,9 @@ struct vlan_group { struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS]; struct rcu_head rcu; +#ifdef CONFIG_VE + struct ve_struct *owner; +#endif }; static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 021d8e7..76babc9 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -10,6 +10,7 @@ #include #include #include +#include extern struct files_struct init_files; @@ -49,10 +50,17 @@ extern struct files_struct init_files; .rlim = INIT_RLIMITS, \ } +#ifdef CONFIG_VE +/* one for ve0, one for init_task */ +#define INIT_NSPROXY_COUNT ATOMIC_INIT(2) +#else +#define INIT_NSPROXY_COUNT ATOMIC_INIT(1) +#endif + extern struct nsproxy init_nsproxy; #define INIT_NSPROXY(nsproxy) { \ .pid_ns = &init_pid_ns, \ - .count = ATOMIC_INIT(1), \ + .count = INIT_NSPROXY_COUNT, \ .uts_ns = &init_uts_ns, \ .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ @@ -179,6 +187,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_VZ_FAIRSCHED \ } diff --git a/include/linux/inotify.h b/include/linux/inotify.h index bd57857..8833215 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -73,6 +73,7 @@ struct inotify_event { #include #include +#include /* * struct inotify_watch - represents a watch request on a specific inode @@ -90,6 +91,7 @@ struct inotify_watch { struct list_head i_list; /* entry in inode's list */ atomic_t count; /* reference count */ struct inotify_handle *ih; /* associated inotify handle */ + struct path path; struct inode *inode; /* associated inode */ __s32 wd; /* watch descriptor */ __u32 mask; /* event mask for this watch */ @@ -126,6 +128,8 @@ extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *, u32); extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *, struct inode *, __u32); +extern __s32 inotify_add_watch_dget(struct inotify_handle *, struct inotify_watch *, + struct path *, __u32); extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *); extern void inotify_evict_watch(struct inotify_watch *); extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *); @@ -135,6 +139,66 @@ extern void inotify_remove_watch_locked(struct inotify_handle *, extern void get_inotify_watch(struct inotify_watch *); extern void put_inotify_watch(struct inotify_watch *); +/* + * struct inotify_handle - represents an inotify instance + * + * This 
structure is protected by the mutex 'mutex'. + */ +struct inotify_handle { + struct idr idr; /* idr mapping wd -> watch */ + struct mutex mutex; /* protects this bad boy */ + struct list_head watches; /* list of watches */ + atomic_t count; /* reference count */ + u32 last_wd; /* the last wd allocated */ + const struct inotify_operations *in_ops; /* inotify caller operations */ +}; + + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. + */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct mutex ev_mutex; /* protects event queue */ + struct mutex up_mutex; /* synchronizes watch updates */ + struct list_head events; /* list of queued events */ + atomic_t count; /* reference count */ + struct user_struct *user; /* user who opened this dev */ + struct inotify_handle *ih; /* inotify handle */ + struct fasync_struct *fa; /* async notification */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->ev_mutex of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_user_watch - our version of an inotify_watch, we add + * a reference to the associated inotify_device. 
+ */ +struct inotify_user_watch { + struct inotify_device *dev; /* associated device */ + struct inotify_watch wdata; /* inotify watch data */ +}; + +int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask); + + #else static inline void inotify_d_instantiate(struct dentry *dentry, @@ -204,6 +268,13 @@ static inline __s32 inotify_add_watch(struct inotify_handle *ih, return -EOPNOTSUPP; } +static inline __s32 inotify_add_watch_dget(struct inotify_handle *h, + struct inotify_watch *w, + struct path *p, __u32 mask) +{ + return -EOPNOTSUPP; +} + static inline int inotify_rm_watch(struct inotify_handle *ih, struct inotify_watch *watch) { diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index f98a656..2d86ade 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -39,6 +39,7 @@ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, + IOPRIO_WHO_UBC = 1000, }; /* diff --git a/include/linux/ipc.h b/include/linux/ipc.h index b882610..67d186c 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -81,6 +81,7 @@ struct ipc_kludge { #include #include +#include #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ @@ -100,6 +101,15 @@ struct kern_ipc_perm void *security; }; +struct ipc_ids; + +struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); +static inline void ipc_unlock(struct kern_ipc_perm *perm) +{ + spin_unlock(&perm->lock); + rcu_read_unlock(); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_IPC_H */ diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 2dacab8..91783a7 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 dev) return dev & 0x3ffff; } +#define UNNAMED_MAJOR_COUNT 16 + +#if UNNAMED_MAJOR_COUNT > 1 + +extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; + +static inline dev_t make_unnamed_dev(int idx) +{ + /* + * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the + * unnamed device index into major number. + */ + return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], + idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return MINOR(dev) | (i << 8); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return i < UNNAMED_MAJOR_COUNT; +} + +#else /* UNNAMED_MAJOR_COUNT */ + +static inline dev_t make_unnamed_dev(int idx) +{ + return MKDEV(0, idx); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + return MINOR(dev); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + return MAJOR(dev) == 0; +} + +#endif /* UNNAMED_MAJOR_COUNT */ + #else /* __KERNEL__ */ /* diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f80..80cad52 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -200,6 +200,12 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); +asmlinkage int ve_printk(int, const char * fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))); +void prepare_printk(void); + #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -211,6 +217,15 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline int ve_printk(int d, const char *s, ...) + __attribute__ ((format (printf, 2, 3))); +static inline int ve_printk(int d, const char *s, ...) +{ + return 0; +} +static inline void prepare_printk(void) +{ +} #endif extern void asmlinkage __attribute__((format(printf, 1, 2))) @@ -218,9 +233,17 @@ extern void asmlinkage __attribute__((format(printf, 1, 2))) unsigned long int_sqrt(unsigned long); +#define VE0_LOG 1 +#define VE_LOG 2 +#define VE_LOG_BOTH (VE0_LOG | VE_LOG) +extern int console_silence_loglevel; + static inline void console_silent(void) { - console_loglevel = 0; + if (console_loglevel > console_silence_loglevel) { + printk(KERN_EMERG "console shuts up ...\n"); + console_loglevel = 0; + } } static inline void console_verbose(void) @@ -234,6 +257,7 @@ extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int decode_call_traces; extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5437ac0..2592187 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -51,6 +51,8 @@ enum kobject_action { KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, + KOBJ_START, + KOBJ_STOP, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_MAX diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index dbb87ab..734aafe 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -61,6 +61,7 @@ struct nlm_host { struct list_head h_granted; /* Locks in GRANTED state */ struct list_head h_reclaim; /* Locks in RECLAIM state */ struct nsm_handle * h_nsmhandle; /* NSM status handle */ + struct ve_struct * owner_env; /* VE owning the host */ }; struct nsm_handle { @@ -152,8 +153,11 @@ extern struct svc_procedure nlmsvc_procedures[]; #ifdef CONFIG_LOCKD_V4 extern struct svc_procedure nlmsvc_procedures4[]; #endif -extern int nlmsvc_grace_period; -extern unsigned long nlmsvc_timeout; + +#include +extern int _nlmsvc_grace_period; +extern unsigned long _nlmsvc_timeout; + extern int nsm_use_hostnames; /* diff --git a/include/linux/major.h b/include/linux/major.h index 53d5faf..4cd77c4 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -170,4 +170,7 @@ #define VIOTAPE_MAJOR 230 +#define UNNAMED_EXTRA_MAJOR 130 +#define UNNAMED_EXTRA_MAJOR_COUNT 120 + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 72a15dc..f97db27 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -704,15 +704,7 @@ static inline int page_mapped(struct page *page) extern void show_free_areas(void); -#ifdef CONFIG_SHMEM -int shmem_lock(struct file *file, int lock, struct user_struct *user); -#else -static inline int shmem_lock(struct file *file, int lock, - struct user_struct *user) -{ - return 0; -} -#endif +#define shmem_nopage filemap_nopage struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); @@ -778,7 +770,9 @@ int walk_page_range(unsigned long addr, unsigned long end, void free_pgd_range(struct mmu_gather 
*tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, + unsigned long addr, size_t size); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf33413..604871b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,14 @@ struct page { #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long page_cgroup; #endif +#ifdef CONFIG_BEANCOUNTERS + /* FIXME: switch to mainline memcgroup */ + union { + struct user_beancounter *page_ub; + struct page_beancounter *page_pb; + struct user_beancounter **slub_ubs; + } bc; +#endif }; /* @@ -230,11 +238,17 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ + unsigned int vps_dumpable:2; + unsigned int oom_killed:1; + struct core_state *core_state; /* coredumping support */ /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ struct kioctx *ioctx_list; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *mm_ub; +#endif #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical diff --git a/include/linux/mman.h b/include/linux/mman.h index 30d1073..787f2a4 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -88,6 +88,9 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | +#ifdef MAP_GROWSUP + _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) | +#endif _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd..fdc1225 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -24,6 +24,8 @@ struct proc_mounts { extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); +extern struct rw_semaphore namespace_sem; + extern void __put_mnt_ns(struct mnt_namespace *ns); static inline void put_mnt_ns(struct mnt_namespace *ns) diff --git a/include/linux/mount.h b/include/linux/mount.h index 30a1d63..803b390 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -71,6 +71,7 @@ struct vfsmount { * are held, and all mnt_writer[]s on this mount have 0 as their ->count */ atomic_t __mnt_writers; + unsigned owner; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git a/include/linux/msg.h b/include/linux/msg.h index 56abf15..050f740 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, size_t msgsz, long msgtyp, int msgflg); +int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); +int sysvipc_setup_msg(key_t key, int msqid, int msgflg); +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data); +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int 
len, void * data); + #endif /* __KERNEL__ */ #endif /* _LINUX_MSG_H */ diff --git a/include/linux/namei.h b/include/linux/namei.h index 68f8c32..16cd273 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -53,6 +53,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; */ #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) +#define LOOKUP_NOAREACHECK (0x1000) /* no area check on lookup */ +#define LOOKUP_STRICT (0x2000) /* no symlinks or other filesystems */ extern int user_path_at(int, const char __user *, unsigned, struct path *); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 488c56e..2cadfda 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -278,6 +278,11 @@ enum netdev_state_t __LINK_STATE_DORMANT, }; +struct netdev_bc { + struct user_beancounter *exec_ub, *owner_ub; +}; + +#define netdev_bc(dev) (&(dev)->dev_bc) /* * This structure holds at boot time configured netdevice settings. They @@ -521,13 +526,17 @@ struct net_device #define NETIF_F_LRO 32768 /* large receive offload */ /* Segmentation offload features */ -#define NETIF_F_GSO_SHIFT 16 -#define NETIF_F_GSO_MASK 0xffff0000 +#define NETIF_F_GSO_SHIFT 20 +#define NETIF_F_GSO_MASK 0xfff00000 #define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT) #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) +/* device is venet device */ +#define NETIF_F_VENET (1 << (NETIF_F_GSO_SHIFT - 1)) +/* can be registered inside VE */ +#define NETIF_F_VIRTUAL (1 << (NETIF_F_GSO_SHIFT - 2)) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) @@ -735,6 +744,9 @@ struct net_device /* GARP */ struct garp_port *garp_port; + struct ve_struct *owner_env; /* Owner VE of the interface */ + struct netdev_bc dev_bc; + /* class/net/name entry */ struct device dev; /* space for optional statistics and wireless sysfs groups */ @@ -752,6 +764,20 @@ struct net_device }; #define to_net_dev(d) container_of(d, struct net_device, dev) +#define NETDEV_HASHBITS 8 +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +} + #define NETDEV_ALIGN 32 #define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) @@ -1224,6 +1250,8 @@ extern int dev_ethtool(struct net *net, struct ifreq *); extern unsigned dev_get_flags(const struct net_device *); extern int dev_change_flags(struct net_device *, unsigned); extern int dev_change_name(struct net_device *, char *); +int __dev_change_net_namespace(struct net_device *, struct net *, const char *, + struct user_beancounter *exec_ub); extern int dev_change_net_namespace(struct net_device *, struct net *, const char *); extern int dev_set_mtu(struct net_device *, int); @@ -1673,6 +1701,18 @@ extern void linkwatch_run_queue(void); extern int netdev_compute_features(unsigned long all, unsigned long one); +#if defined(CONFIG_VE) && defined(CONFIG_NET) +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return !(dev->features & 
(NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL)); +} +#else +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return 0; +} +#endif + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0c5eb7e..8d41ea4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -394,5 +394,24 @@ static inline struct net *nf_post_routing_net(const struct net_device *in, #endif } +#ifdef CONFIG_VE_IPTABLES +#include + +#define net_ipt_module_permitted(netns, ipt) \ + (VE_IPT_CMP((netns)->owner_ve->ipt_mask, ipt) && \ + VE_IPT_CMP((netns)->owner_ve->_iptables_modules, \ + (ipt) & ~(ipt##_MOD))) + +#define net_ipt_module_set(netns, ipt) ({ \ + (netns)->owner_ve->_iptables_modules |= ipt##_MOD; \ + }) +#define net_is_ipt_module_set(netns, ipt) ( \ + (netns)->owner_ve->_iptables_modules & (ipt##_MOD)) +#else +#define net_ipt_module_permitted(netns, ipt) (1) +#define net_ipt_module_set(netns, ipt) +#define net_is_ipt_module_set(netns, ipt) (1) +#endif + #endif /*__KERNEL__*/ #endif /*__LINUX_NETFILTER_H*/ diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 2326296..7a66377 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -302,6 +302,7 @@ struct xt_table_info { /* Size per table */ unsigned int size; + unsigned int alloc_size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h index 51b18d8..439da56 100644 --- a/include/linux/netfilter/xt_hashlimit.h +++ b/include/linux/netfilter/xt_hashlimit.h @@ -63,4 +63,11 @@ struct xt_hashlimit_mtinfo1 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +#ifdef __KERNEL__ +struct ve_xt_hashlimit { + struct hlist_head hashlimit_htables; + struct proc_dir_entry *hashlimit_procdir4; + struct proc_dir_entry *hashlimit_procdir6; +}; +#endif #endif /*_XT_HASHLIMIT_H*/ diff --git a/include/linux/netfilter_ipv4/ipt_recent.h b/include/linux/netfilter_ipv4/ipt_recent.h index 6508a45..3b9a1e8 100644 --- a/include/linux/netfilter_ipv4/ipt_recent.h +++ b/include/linux/netfilter_ipv4/ipt_recent.h @@ -24,4 +24,12 @@ struct ipt_recent_info { u_int8_t side; }; +#ifdef __KERNEL__ +struct ve_ipt_recent { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_dir; +#endif +}; +#endif #endif /*_IPT_RECENT_H*/ diff --git a/include/linux/nfcalls.h b/include/linux/nfcalls.h new file mode 100644 index 0000000..f968054 --- /dev/null +++ b/include/linux/nfcalls.h @@ -0,0 +1,172 @@ +/* + * include/linux/nfcalls.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_NFCALLS_H +#define _LINUX_NFCALLS_H + +#include + +#ifdef CONFIG_MODULES +extern struct module no_module; + +#define DECL_KSYM_MODULE(name) \ + extern struct module *vz_mod_##name + +#define INIT_KSYM_MODULE(name) \ + struct module *vz_mod_##name = &no_module; \ + EXPORT_SYMBOL(vz_mod_##name) + +static inline void __vzksym_modresolve(struct module **modp, struct module *mod) +{ + /* + * we want to be sure, that pointer updates are visible first: + * 1. wmb() is here only for piece of sure + * (note, no rmb() in KSYMSAFECALL) + * 2. 
synchronize_sched() guarantees that updates are visible + * on all cpus and allows us to remove rmb() in KSYMSAFECALL + */ + wmb(); synchronize_sched(); + *modp = mod; + /* just to be sure, our changes are visible as soon as possible */ + wmb(); synchronize_sched(); +} + +static inline void __vzksym_modunresolve(struct module **modp) +{ + /* + * try_module_get() in KSYMSAFECALL should fail at this moment since + * THIS_MODULE in in unloading state (we should be called from fini), + * no need to syncronize pointers/ve_module updates. + */ + *modp = &no_module; + /* + * synchronize_sched() guarantees here that we see + * updated module pointer before the module really gets away + */ + synchronize_sched(); +} + +static inline int __vzksym_module_get(struct module *mod) +{ + /* + * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE + * and smp_read_barrier_depends() here... + */ + smp_read_barrier_depends(); /* for module loading */ + if (!try_module_get(mod)) + return -EBUSY; + + return 0; +} + +static inline void __vzksym_module_put(struct module *mod) +{ + module_put(mod); +} +#else +#define DECL_KSYM_MODULE(name) +#define INIT_KSYM_MODULE(name) +#define __vzksym_modresolve(modp, mod) +#define __vzksym_modunresolve(modp) +#define __vzksym_module_get(mod) 0 +#define __vzksym_module_put(mod) +#endif + +#define __KSYMERRCALL(err, type, mod, name, args) \ +({ \ + type ret = (type)err; \ + if (!__vzksym_module_get(vz_mod_##mod)) { \ + if (vz_##name) \ + ret = ((*vz_##name)args); \ + __vzksym_module_put(vz_mod_##mod); \ + } \ + ret; \ +}) + +#define __KSYMSAFECALL_VOID(mod, name, args) \ + do { \ + if (!__vzksym_module_get(vz_mod_##mod)) { \ + if (vz_##name) \ + ((*vz_##name)args); \ + __vzksym_module_put(vz_mod_##mod); \ + } \ + } while (0) + +#define DECL_KSYM_CALL(type, name, args) \ + extern type (*vz_##name) args +#define INIT_KSYM_CALL(type, name, args) \ + type (*vz_##name) args; \ +EXPORT_SYMBOL(vz_##name) + +#define KSYMERRCALL(err, mod, name, args) \ + __KSYMERRCALL(err, int, mod, name, args) +#define KSYMSAFECALL(type, mod, name, args) \ + __KSYMERRCALL(0, type, mod, name, args) +#define KSYMSAFECALL_VOID(mod, name, args) \ + __KSYMSAFECALL_VOID(mod, name, args) +#define KSYMREF(name) vz_##name + +/* should be called _after_ KSYMRESOLVE's */ +#define KSYMMODRESOLVE(name) \ + __vzksym_modresolve(&vz_mod_##name, THIS_MODULE) +#define KSYMMODUNRESOLVE(name) \ + __vzksym_modunresolve(&vz_mod_##name) + +#define KSYMRESOLVE(name) \ + vz_##name = &name +#define KSYMUNRESOLVE(name) \ + vz_##name = NULL + +#if defined(CONFIG_VE) +DECL_KSYM_MODULE(ip_tables); +DECL_KSYM_MODULE(ip6_tables); +DECL_KSYM_MODULE(iptable_filter); +DECL_KSYM_MODULE(ip6table_filter); +DECL_KSYM_MODULE(iptable_mangle); +DECL_KSYM_MODULE(ip6table_mangle); +DECL_KSYM_MODULE(ip_conntrack); +DECL_KSYM_MODULE(nf_conntrack); +DECL_KSYM_MODULE(nf_conntrack_ipv4); +DECL_KSYM_MODULE(nf_conntrack_ipv6); +DECL_KSYM_MODULE(xt_conntrack); +DECL_KSYM_MODULE(ip_nat); +DECL_KSYM_MODULE(nf_nat); +DECL_KSYM_MODULE(iptable_nat); + +struct sk_buff; + +DECL_KSYM_CALL(int, init_iptable_conntrack, (void)); +DECL_KSYM_CALL(int, nf_conntrack_init_ve, (void)); +DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void)); +DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void)); +DECL_KSYM_CALL(int, nf_nat_init, (void)); +DECL_KSYM_CALL(int, init_nftable_nat, (void)); +DECL_KSYM_CALL(int, nf_nat_init, (void)); +DECL_KSYM_CALL(void, fini_nftable_nat, (void)); +DECL_KSYM_CALL(void, nf_nat_cleanup, (void)); +DECL_KSYM_CALL(void, 
fini_iptable_conntrack, (void)); +DECL_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); +DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); +DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); + +#include +#endif + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +DECL_KSYM_MODULE(vzethdev); +DECL_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) +DECL_KSYM_MODULE(vzmon); +DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); +#endif + +#endif /* _LINUX_NFCALLS_H */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c9beacd..cb87ca2 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -70,6 +70,7 @@ struct nfs_client { char cl_ipaddr[48]; unsigned char cl_id_uniquifier; #endif + struct ve_struct *owner_env; }; /* diff --git a/include/linux/notifier.h b/include/linux/notifier.h index da2698b..ae805e0 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ +#define NOTIFY_FAIL 0x0002 /* Reject */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ -#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) +#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index c8a768e..119368b 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -66,6 +66,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); @@ -76,9 +77,10 @@ static inline void put_nsproxy(struct nsproxy *ns) } } -static inline void get_nsproxy(struct nsproxy *ns) +static inline struct nsproxy *get_nsproxy(struct nsproxy *ns) { atomic_inc(&ns->count); + return ns; } #ifdef CONFIG_CGROUP_NS diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c74d3e8..c5f47c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -173,6 +173,7 @@ __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ +PAGEFLAG(Checkpointed, owner_priv_1) PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index fac3337..4c889bf 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,6 +74,22 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +struct percpu_data_static { + void *ptrs[NR_CPUS]; +}; + +#define DEFINE_PER_CPU_STATIC(type, name) \ + static struct percpu_data_static per_cpu_data__##name; \ + static __typeof__(type) per_cpu__##name[NR_CPUS] + +#define percpu_static_init(name) ({ \ + int i; \ + for (i = 0; i < NR_CPUS; i++) \ + (per_cpu_data__##name).ptrs[i] = &(per_cpu__##name)[i];\ + (__typeof__(&(per_cpu__##name)[0])) \ + __percpu_disguise(&(per_cpu_data__##name));\ + }) + 
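The DEFINE_PER_CPU_STATIC() and percpu_static_init() helpers added to percpu.h above provide a per-cpu object whose storage is static but whose handle is disguised the same way as a percpu_alloc() result, so the usual per_cpu_ptr() accessor keeps working on it without any runtime allocation. A minimal usage sketch follows, assuming only the macros as shown in the hunk; the names my_stats, my_stats_init and my_stats_event are invented for the example and are not part of the patch.

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/init.h>

/* hypothetical per-cpu statistics object, not from the patch */
struct my_stats {
	unsigned long events;
};

/* storage lives in .data, one slot per possible CPU */
DEFINE_PER_CPU_STATIC(struct my_stats, my_stats);

/* disguised pointer usable with per_cpu_ptr(), like a percpu_alloc() result */
static struct my_stats *my_stats_ptr;

static int __init my_stats_init(void)
{
	my_stats_ptr = percpu_static_init(my_stats);	/* no allocation, cannot fail */
	return 0;
}

static void my_stats_event(void)
{
	int cpu = get_cpu();

	per_cpu_ptr(my_stats_ptr, cpu)->events++;
	put_cpu();
}

The static variant presumably exists for data that must be usable before the allocator is ready, or where an allocation failure cannot be tolerated.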
extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); extern void percpu_free(void *__pdata); @@ -81,6 +97,11 @@ extern void percpu_free(void *__pdata); #define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define DEFINE_PER_CPU_STATIC(type, name) \ + static __typeof__(type) per_cpu__##name[NR_CPUS] + +#define percpu_static_init(name) (&(per_cpu__##name)[0]) + static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { return kzalloc(size, gfp); diff --git a/include/linux/pid.h b/include/linux/pid.h index d7e98ff..9ca7db8 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -60,6 +60,9 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; +#endif struct rcu_head rcu; struct upid numbers[1]; }; @@ -96,6 +99,11 @@ extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); +extern void reattach_pid(struct task_struct *, enum pid_type, struct pid *); +extern int alloc_pidmap(struct pid_namespace *pid_ns); +extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid); + +extern spinlock_t pidmap_lock; struct pid_namespace; extern struct pid_namespace init_pid_ns; @@ -119,8 +127,11 @@ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); int next_pidmap(struct pid_namespace *pid_ns, int last); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid); extern void free_pid(struct pid *pid); +extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *); +extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *); +pid_t pid_to_vpid(pid_t nr); /* * the helpers to get the pid's id seen from different namespaces @@ -167,7 +178,7 @@ pid_t pid_vnr(struct pid *pid); do { #define while_each_pid_thread(pid, type, task) \ - } while_each_thread(tg___, task); \ + } while_each_thread_ve(tg___, task); \ task = tg___; \ } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */ diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 1af82c4..d5d638d 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,14 @@ struct pidmap { struct bsd_acct_struct; +/* pid namespace flags */ + +/* if set newly created pid ns got PID_NS_HIDE_CHILD flag */ +#define PID_NS_HIDE_CHILD 0x00000001 + +/* if set newly created processes invisible from parent ns*/ +#define PID_NS_HIDDEN 0x00000002 + struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; @@ -24,6 +32,7 @@ struct pid_namespace { struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent; + unsigned flags; #ifdef CONFIG_PROC_FS struct vfsmount *proc_mnt; #endif diff --git a/include/linux/poll.h b/include/linux/poll.h index ef45382..c1bf82a 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -119,6 +119,7 @@ extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, s64 *timeout); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout); +long do_restart_poll(struct restart_block *restart_block); #endif /* KERNEL */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index fb61850..fc2a6c7 100644 --- a/include/linux/proc_fs.h +++ 
b/include/linux/proc_fs.h @@ -126,7 +126,10 @@ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); extern struct vfsmount *proc_mnt; struct pid_namespace; extern int proc_fill_super(struct super_block *); -extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); +extern struct inode *proc_get_inode(struct super_block *, unsigned int, + struct proc_dir_entry *glob, struct proc_dir_entry *loc); + +extern struct file_system_type proc_fs_type; /* * These are generic /proc routines that use the internal @@ -140,6 +143,7 @@ extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameid extern const struct file_operations proc_kcore_operations; extern const struct file_operations ppc_htab_operations; +extern const struct file_operations proc_kmsg_operations; extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); @@ -174,6 +178,8 @@ extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *); extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, struct proc_dir_entry *parent); +extern struct proc_dir_entry glob_proc_root; + static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops) { @@ -292,6 +298,9 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; +#ifdef CONFIG_VE + struct proc_dir_entry *lpde; +#endif struct inode vfs_inode; }; @@ -305,6 +314,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } +static inline struct proc_dir_entry *LPDE(const struct inode *inode) +{ +#ifdef CONFIG_VE + return PROC_I(inode)->lpde; +#else + return NULL; +#endif +} + static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; diff --git a/include/linux/quota.h b/include/linux/quota.h index 376a050..eb3df9b 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -167,6 +167,10 @@ enum { #include #include +#include + +extern spinlock_t dq_data_lock; + #include #include #include @@ -284,6 +288,8 @@ struct quota_format_ops { int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ }; +struct inode; +struct iattr; /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); @@ -298,9 +304,11 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ + int (*rename) (struct inode *, struct inode *, struct inode *); }; /* Operations handling requests from userspace */ +struct v2_disk_dqblk; struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); @@ -313,6 +321,10 @@ struct quotactl_ops { int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); +#ifdef CONFIG_QUOTA_COMPAT + int (*get_quoti)(struct super_block *, int, unsigned int, + struct v2_disk_dqblk __user *); +#endif }; struct quota_format_type { @@ -337,6 +349,10 @@ struct quota_info { struct inode *files[MAXQUOTAS]; /* 
inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_master *vzdq_master; + int vzdq_count; +#endif }; int register_quota_format(struct quota_format_type *fmt); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index ca6b9b5..e9cc3f1 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -183,6 +183,19 @@ static inline void vfs_dq_free_inode(struct inode *inode) inode->i_sb->dq_op->free_inode(inode, 1); } +static __inline__ int vfs_dq_rename(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct dquot_operations *q_op; + + q_op = inode->i_sb->dq_op; + if (q_op && q_op->rename) { + if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) + return 1; + } + return 0; +} + /* The following two functions cannot be called inside a transaction */ static inline void vfs_dq_sync(struct super_block *sb) { @@ -262,6 +275,12 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) return 0; } +static inline int vfs_dq_rename(struct inode *inode, struct inode *old_dir, + struct inode *new_dir) +{ + return 0; +} + static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); @@ -363,6 +382,7 @@ static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) #define DQUOT_FREE_INODE(inode) vfs_dq_free_inode(inode) #define DQUOT_TRANSFER(inode, iattr) vfs_dq_transfer(inode, iattr) #define DQUOT_SYNC(sb) vfs_dq_sync(sb) +#define DQUOT_RENAME(inode, od, nd) vfs_dq_rename(inode, od, nd) #define DQUOT_OFF(sb, remount) vfs_dq_off(sb, remount) #define DQUOT_ON_REMOUNT(sb) vfs_dq_quota_on_remount(sb) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fed6f5e..dff17e3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -82,6 +82,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *, struct vm_area_struct *); +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); #ifdef CONFIG_DEBUG_VM void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c..6e47614 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -29,6 +29,10 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +/* mask of clones which are disabled in OpenVZ VEs */ +#define CLONE_NAMESPACES_MASK (CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | \ + CLONE_NEWPID | CLONE_NEWNET) + /* * Scheduling policies */ @@ -91,6 +95,8 @@ struct sched_param { #include +#include + struct mem_cgroup; struct exec_domain; struct futex_pi_state; @@ -127,14 +133,37 @@ extern unsigned long avenrun[]; /* Load averages */ load += n*(FIXED_1-exp); \ load >>= FSHIFT; +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern unsigned long nr_sleeping(void); +extern unsigned long nr_stopped(void); 
extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern atomic_t nr_dead; +extern unsigned long nr_zombie; + +#ifdef CONFIG_VE +struct ve_struct; +extern unsigned long nr_running_ve(struct ve_struct *); +extern unsigned long nr_iowait_ve(struct ve_struct *); +extern unsigned long nr_uninterruptible_ve(struct ve_struct *); +extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu); +extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu); +void ve_sched_attach(struct ve_struct *envid); +#else +#define nr_running_ve(ve) 0 +#define nr_iowait_ve(ve) 0 +#define nr_uninterruptible_ve(ve) 0 +#define ve_sched_get_idle_time(ve, cpu) 0 +#define ve_sched_get_iowait_time(ve, cpu) 0 +#endif struct seq_file; struct cfs_rq; @@ -271,6 +300,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern void smp_show_regs(struct pt_regs *, void *); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -425,6 +455,9 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +#include +#include + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -1088,6 +1121,7 @@ struct task_struct { /* ??? */ unsigned int personality; unsigned did_exec:1; + unsigned did_ve_enter:1; pid_t pid; pid_t tgid; @@ -1287,6 +1321,14 @@ struct task_struct { struct rcu_head rcu; /* + * state tracking for suspend + * FIXME - ptrace is completely rewritten in this kernel + * so set_pn_state() is not set in many places correctly + */ + __u8 pn_state; + __u8 stopped_state:1; + + /* * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; @@ -1301,6 +1343,19 @@ struct task_struct { int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter task_bc; +#endif +#ifdef CONFIG_VE + struct ve_task_info ve_task_info; +#endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + unsigned long magic; + struct inode *ino; +#endif +#ifdef CONFIG_VZ_FAIRSCHED + struct fairsched_node *fsched_node; +#endif }; /* @@ -1479,6 +1534,43 @@ extern cputime_t task_utime(struct task_struct *p); extern cputime_t task_stime(struct task_struct *p); extern cputime_t task_gtime(struct task_struct *p); +#ifndef CONFIG_VE +#define set_pn_state(tsk, state) do { } while(0) +#define clear_pn_state(tsk) do { } while(0) +#define set_stop_state(tsk) do { } while(0) +#define clear_stop_state(tsk) do { } while(0) +#else +#define PN_STOP_TF 1 /* was not in 2.6.8 */ +#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */ +#define PN_STOP_ENTRY 3 +#define PN_STOP_FORK 4 +#define PN_STOP_VFORK 5 +#define PN_STOP_SIGNAL 6 +#define PN_STOP_EXIT 7 +#define PN_STOP_EXEC 8 +#define PN_STOP_LEAVE 9 + +static inline void set_pn_state(struct task_struct *tsk, int state) +{ + tsk->pn_state = state; +} + +static inline void clear_pn_state(struct task_struct *tsk) +{ + tsk->pn_state = 0; +} + +static inline void set_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 1; +} + +static inline void clear_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 0; +} +#endif + /* * Per process flags */ @@ -1495,6 +1587,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be 
initialized before use */ +#define PF_EXIT_RESTART 0x00004000 /* do_exit() restarted, see do_exit() */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1586,6 +1679,21 @@ extern unsigned long long cpu_clock(int cpu); extern unsigned long long task_sched_runtime(struct task_struct *task); +static inline unsigned long cycles_to_clocks(cycles_t cycles) +{ + extern unsigned long cycles_per_clock; + do_div(cycles, cycles_per_clock); + return cycles; +} + +static inline u64 cycles_to_jiffies(cycles_t cycles) +{ + extern unsigned long cycles_per_jiffy; + do_div(cycles, cycles_per_jiffy); + return cycles; +} + + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -1720,6 +1828,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); +extern int set_user(uid_t uid, int dumpclear); #include @@ -1851,6 +1960,13 @@ extern int disallow_signal(int); extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + long pid0); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); @@ -1866,19 +1982,19 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task_all(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +#define for_each_process_all(p) \ + for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do +#define do_each_thread_all(g, t) \ + for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do -#define while_each_thread(g, t) \ +#define while_each_thread_all(g, t) \ while ((t = next_thread(t)) != g) /* de_thread depends on thread_group_leader not being a pid based check */ @@ -1903,8 +2019,15 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), + struct task_struct *tsk; + + tsk = list_entry(rcu_dereference(p->thread_group.next), struct task_struct, thread_group); +#ifdef CONFIG_VE + /* all threads should belong to ONE ve! 
*/ + BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); +#endif + return tsk; } static inline int thread_group_empty(struct task_struct *p) @@ -1944,6 +2067,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } +#ifndef CONFIG_VE + +#define for_each_process_ve(p) for_each_process_all(p) +#define do_each_thread_ve(g, t) do_each_thread_all(g, t) +#define while_each_thread_ve(g, t) while_each_thread_all(g, t) +#define first_task_ve() next_task_ve(&init_task) +#define __first_task_ve(owner) next_task_ve(&init_task) +#define __next_task_ve(owner, p) next_task_ve(p) +#define next_task_ve(p) \ + (next_task_all(p) != &init_task ? next_task_all(p) : NULL) + +#define ve_is_super(env) 1 +#define ve_accessible(target, owner) 1 +#define ve_accessible_strict(target, owner) 1 +#define ve_accessible_veid(target, owner) 1 +#define ve_accessible_strict_veid(target, owner) 1 + +#define VEID(ve) 0 + +#else /* CONFIG_VE */ + +#include + +#define ve_is_super(env) ((env) == get_ve0()) + +#define ve_accessible_strict(target, owner) ((target) == (owner)) +static inline int ve_accessible(struct ve_struct *target, + struct ve_struct *owner) +{ + return ve_is_super(owner) || ve_accessible_strict(target, owner); +} + +#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) +static inline int ve_accessible_veid(envid_t target, envid_t owner) +{ + return get_ve0()->veid == owner || + ve_accessible_strict_veid(target, owner); +} + +#define VEID(ve) (ve->veid) + +static inline struct task_struct *ve_lh2task(struct ve_struct *ve, + struct list_head *lh) +{ + return lh == &ve->vetask_lh ? NULL : + list_entry(lh, struct task_struct, ve_task_info.vetask_list); +} + +static inline struct task_struct *__first_task_ve(struct ve_struct *ve) +{ + struct task_struct *tsk; + + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(&init_task); + if (tsk == &init_task) + tsk = NULL; + } else { + tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next)); + } + return tsk; +} + +static inline struct task_struct *__next_task_ve(struct ve_struct *ve, + struct task_struct *tsk) +{ + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(tsk); + if (tsk == &init_task) + tsk = NULL; + } else { + BUG_ON(tsk->ve_task_info.owner_env != ve); + tsk = ve_lh2task(ve, rcu_dereference(tsk-> + ve_task_info.vetask_list.next)); + } + return tsk; +} + +#define first_task_ve() __first_task_ve(get_exec_env()) +#define next_task_ve(p) __next_task_ve(get_exec_env(), p) +/* no one uses prev_task_ve(), copy next_task_ve() if needed */ + +#define for_each_process_ve(p) \ + for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) + +#define do_each_thread_ve(g, t) \ + for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do + +#define while_each_thread_ve(g, t) \ + while ((t = next_thread(t)) != g) + +#endif /* CONFIG_VE */ + #ifndef __HAVE_THREAD_FUNCTIONS #define task_thread_info(task) ((struct thread_info *)(task)->stack) diff --git a/include/linux/sem.h b/include/linux/sem.h index 1b191c1..64f30a9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -154,6 +154,9 @@ static inline void exit_sem(struct task_struct *tsk) } #endif +int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); + #endif /* __KERNEL__ */ #endif /* _LINUX_SEM_H */ diff --git a/include/linux/shm.h b/include/linux/shm.h index eca6235..c2b3bb5 100644 --- 
a/include/linux/shm.h +++ b/include/linux/shm.h @@ -83,6 +83,22 @@ struct shm_info { }; #ifdef __KERNEL__ + +#include + +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + +struct shm_file_data { + int id; + struct ipc_namespace *ns; + struct file *file; + const struct vm_operations_struct *vm_ops; +}; +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) + struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; @@ -97,6 +113,23 @@ struct shmid_kernel /* private to the kernel */ struct user_struct *mlock_user; }; +/* + * shm_lock_(check_) routines are called in the paths where the rw_mutex + * is not held. + */ +static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return (struct shmid_kernel *)ipcp; + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +#define shm_unlock(shp) \ + ipc_unlock(&(shp)->shm_perm) + /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ @@ -118,6 +151,12 @@ static inline int is_file_shm_hugepages(struct file *file) } #endif +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); +extern const struct file_operations shmem_file_operations; +extern const struct file_operations shm_file_operations; + +extern struct file_system_type tmpfs_fs_type; #endif /* __KERNEL__ */ #endif /* _LINUX_SHM_H_ */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index fd83f25..f09735a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -23,6 +23,9 @@ struct shmem_inode_info { struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *shmi_ub; +#endif }; struct shmem_sb_info { @@ -62,4 +65,7 @@ static inline void shmem_acl_destroy_inode(struct inode *inode) } #endif /* CONFIG_TMPFS_POSIX_ACL */ +int shmem_insertpage(struct inode * inode, unsigned long index, + swp_entry_t swap); + #endif diff --git a/include/linux/signal.h b/include/linux/signal.h index 84f997f..5adb84b 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include +#include +#include /* * Real Time signals may be queued. @@ -16,6 +18,9 @@ struct sigqueue { int flags; siginfo_t info; struct user_struct *user; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *sig_ub; +#endif }; /* flags values. */ @@ -372,6 +377,8 @@ int unhandled_signal(struct task_struct *tsk, int sig); void signals_init(void); +extern struct kmem_cache *sigqueue_cachep; + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9099237..8731b5c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -250,6 +250,8 @@ typedef unsigned char *sk_buff_data_t; * @vlan_tci: vlan tag control information */ +#include + struct sk_buff { /* These two members must be first. 
*/ struct sk_buff *next; @@ -296,7 +298,13 @@ struct sk_buff { peeked:1, nf_trace:1; __be16 protocol; - +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) + __u8 brmark; +#endif +#ifdef CONFIG_VE + unsigned int accounted:1; + unsigned int redirected:1; +#endif void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; @@ -343,6 +351,8 @@ struct sk_buff { *data; unsigned int truesize; atomic_t users; + struct skb_beancounter skb_bc; + struct ve_struct *owner_env; }; #ifdef __KERNEL__ @@ -350,6 +360,7 @@ struct sk_buff { * Handling routines are only of interest to the kernel */ #include +#include #include @@ -1176,6 +1187,8 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) */ static inline void skb_orphan(struct sk_buff *skb) { + ub_skb_uncharge(skb); + if (skb->destructor) skb->destructor(skb); skb->destructor = NULL; @@ -1678,6 +1691,26 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ + to->brmark = from->brmark; +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ + skb->brmark = 0; +} +#else +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ +} +#endif + static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; diff --git a/include/linux/slab.h b/include/linux/slab.h index 5ff9676..9d7cee0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -51,6 +51,26 @@ (unsigned long)ZERO_SIZE_PTR) /* + * allocation rules: __GFP_UBC 0 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * cache (SLAB_UBC) charge charge + * (usual caches: mm, vma, task_struct, ...) + * + * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- + * (ub_kmalloc) (kmalloc) + * + * cache (no UB flags) BUG() --- + * (nonub caches, mempools) + * + * pages charge --- + * (ub_vmalloc, (vmalloc, + * poll, fdsets, ...) non-ub allocs) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#define SLAB_UBC 0x10000000UL /* alloc space for ubs ... */ +#define SLAB_NO_CHARGE 0x20000000UL /* ... but don't charge */ + +/* * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); @@ -65,7 +85,20 @@ void kmem_cache_free(struct kmem_cache *, void *); unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); +extern void show_slab_info(void); +int kmem_cache_objuse(struct kmem_cache *cachep); +int kmem_obj_objuse(void *obj); +int kmem_dname_objuse(void *obj); +unsigned long ub_cache_growth(struct kmem_cache *cachep); +#ifdef CONFIG_BEANCOUNTERS +void kmem_mark_nocharge(struct kmem_cache *cachep); +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj); +struct user_beancounter *slab_ub(void *obj); +#else +static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { } +static inline struct user_beancounter *slab_ub(void *obj) { return NULL; } +#endif /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. 
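The allocation-rules table added to slab.h above distinguishes caches that always charge their objects to a user beancounter (SLAB_UBC) from caches that charge only when the caller passes __GFP_UBC (SLAB_UBC | SLAB_NO_CHARGE, the kmalloc/ub_kmalloc case). As a rough illustration of the first variant, assuming only what the table states, the sketch below creates an always-charged cache; the cache name and struct my_object are invented for the example and do not appear in the patch.

#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/init.h>

/* hypothetical object type, not part of the patch */
struct my_object {
	int id;
};

static struct kmem_cache *my_cachep;

static int __init my_cache_init(void)
{
	/*
	 * SLAB_UBC alone: every object from this cache is charged to the
	 * current task's user beancounter.  SLAB_UBC | SLAB_NO_CHARGE
	 * would instead charge only allocations made with __GFP_UBC,
	 * which is how the kmalloc caches are described in the table.
	 */
	my_cachep = kmem_cache_create("my_object_cache",
				      sizeof(struct my_object), 0,
				      SLAB_UBC, NULL);
	if (!my_cachep)
		return -ENOMEM;
	return 0;
}

static struct my_object *my_object_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(my_cachep, gfp);	/* charged per the rules above */
}

A cache created without either flag keeps the mainline behaviour, and per the table passing __GFP_UBC to such a cache is treated as a bug.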
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5e..de03bd0 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -15,6 +15,26 @@ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define SLAB_DEBUG 1 +#define SLAB_STATS 1 +#define SLAB_FORCED_DEBUG 1 +#else +#define SLAB_DEBUG 0 +#define SLAB_STATS 0 +#define SLAB_FORCED_DEBUG 0 +#endif + /* Size description struct for general caches. */ struct cache_sizes { size_t cs_size; @@ -24,6 +44,7 @@ struct cache_sizes { #endif }; extern struct cache_sizes malloc_sizes[]; +extern int malloc_cache_num; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); @@ -48,6 +69,8 @@ static inline void *kmalloc(size_t size, gfp_t flags) __you_cannot_kmalloc_that_much(); } found: + if (flags & __GFP_UBC) + i += malloc_cache_num; #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b..9fd7575 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -95,6 +95,10 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_BEANCOUNTERS + atomic_t grown; + int objuse; +#endif #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. @@ -126,6 +130,19 @@ struct kmem_cache { */ extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +#ifdef CONFIG_BEANCOUNTERS +extern struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx) +{ + return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx]; +} +#else +static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx) +{ + return &kmalloc_caches[idx]; +} +#endif + /* * Sorry that the following has to be that ugly but some versions of GCC * have trouble with constant propagation and loops. @@ -184,14 +201,14 @@ static __always_inline int kmalloc_index(size_t size) * This ought to end up with a global pointer to the right cache * in kmalloc_caches. 
*/ -static __always_inline struct kmem_cache *kmalloc_slab(size_t size) +static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { int index = kmalloc_index(size); if (index == 0) return NULL; - return &kmalloc_caches[index]; + return __kmalloc_cache(flags, index); } #ifdef CONFIG_ZONE_DMA @@ -216,7 +233,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; @@ -235,7 +252,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; diff --git a/include/linux/smp.h b/include/linux/smp.h index 66484d4..ac21923 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -12,6 +12,9 @@ extern void cpu_idle(void); +struct pt_regs; +typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); + struct call_single_data { struct list_head list; void (*func) (void *info); @@ -58,6 +61,8 @@ extern int __cpu_up(unsigned int cpunum); */ extern void smp_cpus_done(unsigned int max_cpus); +extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); + /* * Call a function on all other processors */ @@ -138,6 +143,12 @@ static inline void smp_send_reschedule(int cpu) { } static inline void init_call_single_data(void) { } +static inline int smp_nmi_call_function(smp_nmi_function func, + void *info, int wait) +{ + return 0; +} + #endif /* !SMP */ /* diff --git a/include/linux/socket.h b/include/linux/socket.h index dc5086f..8038e33 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -300,6 +300,16 @@ struct ucred { #define IPX_TYPE 1 #ifdef __KERNEL__ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). 
+ */ + extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len); @@ -313,6 +323,8 @@ extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen); extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int vz_security_family_check(int family); +extern int vz_security_protocol_check(int protocol); #endif #endif /* not kernel and not glibc */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index e5bfe01..8b00c3b 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -44,6 +44,7 @@ struct rpc_clnt { cl_discrtry : 1,/* disconnect before retry */ cl_autobind : 1,/* use getport() */ cl_chatty : 1;/* be verbose */ + unsigned int cl_broken : 1;/* no response for too long */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ @@ -57,6 +58,7 @@ struct rpc_clnt { struct rpc_rtt cl_rtt_default; struct rpc_timeout cl_timeout_default; struct rpc_program * cl_program; + unsigned long cl_pr_time; char cl_inline_name[32]; }; diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 4d80a11..ceee9a3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -24,6 +24,14 @@ #define RPC_MAX_SLOT_TABLE (128U) /* + * Grand abort timeout (stop the client if it occurs) + */ +extern int xprt_abort_timeout; + +#define RPC_MIN_ABORT_TIMEOUT 300 +#define RPC_MAX_ABORT_TIMEOUT INT_MAX + +/* * This describes a timeout strategy */ struct rpc_timeout { @@ -123,6 +131,7 @@ struct rpc_xprt_ops { struct rpc_xprt { struct kref kref; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ + struct ve_struct * owner_env; /* VE owner of mount */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ diff --git a/include/linux/swap.h b/include/linux/swap.h index de40f16..74394ee 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -18,6 +18,7 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 +#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */ static inline int current_is_kswapd(void) { @@ -93,6 +94,7 @@ struct address_space; struct sysinfo; struct writeback_control; struct zone; +struct user_beancounter; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of @@ -122,6 +124,7 @@ enum { SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_READONLY = (1 << 2), }; #define SWAP_CLUSTER_MAX 32 @@ -132,6 +135,7 @@ enum { /* * The in-memory structure used to track swap areas. 
*/ +struct user_beancounter; struct swap_info_struct { unsigned int flags; int prio; /* swap priority */ @@ -149,6 +153,9 @@ struct swap_info_struct { unsigned int max; unsigned int inuse_pages; int next; /* next entry on swap list */ +#ifdef CONFIG_BC_SWAP_ACCOUNTING + struct user_beancounter **swap_ubs; +#endif }; struct swap_list_t { @@ -156,9 +163,21 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +extern struct swap_list_t swap_list; +extern struct swap_info_struct swap_info[MAX_SWAPFILES]; + /* Swap 50% full? Release swapcache more aggressively.. */ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) +/* linux/mm/oom_kill.c */ +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern int register_oom_notifier(struct notifier_block *nb); +extern int unregister_oom_notifier(struct notifier_block *nb); +extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem, const char *message); +extern struct task_struct *select_bad_process(struct user_beancounter *ub, + struct mem_cgroup *memcg); + /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; @@ -226,6 +245,8 @@ extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, gfp_t); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern void __delete_from_swap_cache(struct page *); +extern int __add_to_swap_cache(struct page *page, + swp_entry_t entry, gfp_t gfp_mask); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); @@ -238,7 +259,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct user_beancounter *); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); @@ -251,6 +272,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +extern int try_to_remove_exclusive_swap_page(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ @@ -339,7 +361,7 @@ static inline int remove_exclusive_swap_page(struct page *p) return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct user_beancounter *ub) { swp_entry_t entry; entry.val = 0; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d0437f3..28eab78 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1102,10 +1102,15 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_header *register_sysctl_table(struct ctl_table * table); struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table); +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int); +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *, + struct ctl_table *, int); void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +extern int ve_allow_kthreads; + #endif /* __KERNEL__ */ #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/sysfs.h 
b/include/linux/sysfs.h index 37fa241..3075594 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -19,6 +19,7 @@ struct kobject; struct module; +struct sysfs_open_dirent; /* FIXME * The *owner field is no longer used, but leave around @@ -78,6 +79,66 @@ struct sysfs_ops { ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); }; +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + struct kobject *kobj; + /* children list starts here and goes through sd->s_sibling */ + struct sysfs_dirent *children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + struct attribute *attr; + struct sysfs_open_dirent *open; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute *bin_attr; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; + struct sysfs_dirent *s_parent; + struct sysfs_dirent *s_sibling; + const char *s_name; + + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + struct sysfs_elem_bin_attr s_bin_attr; + }; + + unsigned int s_flags; + ino_t s_ino; + umode_t s_mode; + struct iattr *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_BIN_ATTR 0x0004 +#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) + +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_REMOVED 0x0200 + #ifdef CONFIG_SYSFS int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), @@ -121,6 +182,8 @@ void sysfs_notify(struct kobject *kobj, char *dir, char *attr); extern int __must_check sysfs_init(void); +extern struct file_system_type sysfs_fs_type; + #else /* CONFIG_SYSFS */ static inline int sysfs_schedule_callback(struct kobject *kobj, diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index 4d090f9..ba40964 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -5,10 +5,12 @@ #define __TASK_IO_ACCOUNTING_OPS_INCLUDED #include +#include #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { + ub_io_account_read(bytes); current->ioac.read_bytes += bytes; } @@ -21,8 +23,14 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p) return p->ioac.read_bytes >> 9; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) { + if (sync) + ub_io_account_write(bytes); + else + ub_io_account_dirty(page, bytes); + current->ioac.write_bytes += bytes; } @@ -37,6 +45,7 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p) static inline void task_io_account_cancelled_write(size_t bytes) { + ub_io_account_write_cancelled(bytes); current->ioac.cancelled_write_bytes += bytes; } @@ -64,7 +73,8 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p) return 0; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + 
int sync) { } diff --git a/include/linux/tty.h b/include/linux/tty.h index 0cbec74..a0db563 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -269,6 +269,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; + struct ve_struct *owner_env; }; /* tty magic number */ @@ -298,6 +299,7 @@ struct tty_struct { #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_FLUSHING 19 /* Flushing to ldisc in progress */ #define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */ +#define TTY_CHARGED 21 /* Charged as ub resource */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 16d2794..981f9b1 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -260,8 +260,19 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; + struct ve_struct *owner_env; }; +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +#ifdef CONFIG_LEGACY_PTYS +extern struct tty_driver *pty_driver; +extern struct tty_driver *pty_slave_driver; +#endif + extern struct list_head tty_drivers; struct tty_driver *alloc_tty_driver(int lines); @@ -270,6 +281,9 @@ void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op); extern struct tty_driver *tty_find_polling_driver(char *name, int *line); +int init_ve_tty_class(void); +void fini_ve_tty_class(void); + /* tty driver magic number */ #define TTY_DRIVER_MAGIC 0x5402 diff --git a/include/linux/types.h b/include/linux/types.h index d4a9ce6..dcdaf75 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -29,6 +29,11 @@ typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; typedef __kernel_mqd_t mqd_t; +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + #ifdef __KERNEL__ typedef _Bool bool; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 1123267..ec24d89 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -43,6 +43,7 @@ struct uts_namespace { struct new_utsname name; }; extern struct uts_namespace init_uts_ns; +extern struct new_utsname virt_utsname; #ifdef CONFIG_UTS_NS static inline void get_uts_ns(struct uts_namespace *ns) diff --git a/include/linux/ve.h b/include/linux/ve.h new file mode 100644 index 0000000..f1b84d4 --- /dev/null +++ b/include/linux/ve.h @@ -0,0 +1,353 @@ +/* + * include/linux/ve.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VE_H +#define _LINUX_VE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef VZMON_DEBUG +# define VZTRACE(fmt,args...) \ + printk(KERN_DEBUG fmt, ##args) +#else +# define VZTRACE(fmt,args...) 
+#endif /* VZMON_DEBUG */ + +struct tty_driver; +struct devpts_config; +struct task_struct; +struct new_utsname; +struct file_system_type; +struct icmp_mib; +struct ip_mib; +struct tcp_mib; +struct udp_mib; +struct linux_mib; +struct fib_info; +struct fib_rule; +struct veip_struct; +struct ve_monitor; +struct nsproxy; + +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct fib_table; +#ifdef CONFIG_VE_IPTABLES +struct xt_table; +struct nf_conn; + +#define FRAG6Q_HASHSZ 64 + +struct ve_nf_conntrack { + struct hlist_head *_bysource; + struct nf_nat_protocol **_nf_nat_protos; + int _nf_nat_vmalloced; + struct xt_table *_nf_nat_table; + struct nf_conntrack_l3proto *_nf_nat_l3proto; + atomic_t _nf_conntrack_count; + int _nf_conntrack_max; + struct hlist_head *_nf_conntrack_hash; + int _nf_conntrack_checksum; + int _nf_conntrack_vmalloc; + struct hlist_head _unconfirmed; + struct hlist_head *_nf_ct_expect_hash; + unsigned int _nf_ct_expect_vmalloc; + unsigned int _nf_ct_expect_count; + unsigned int _nf_ct_expect_max; + struct hlist_head *_nf_ct_helper_hash; + unsigned int _nf_ct_helper_vmalloc; +#ifdef CONFIG_SYSCTL + /* l4 stuff: */ + unsigned long _nf_ct_icmp_timeout; + unsigned long _nf_ct_icmpv6_timeout; + unsigned int _nf_ct_udp_timeout; + unsigned int _nf_ct_udp_timeout_stream; + unsigned int _nf_ct_generic_timeout; + unsigned int _nf_ct_log_invalid; + unsigned int _nf_ct_tcp_timeout_max_retrans; + unsigned int _nf_ct_tcp_timeout_unacknowledged; + int _nf_ct_tcp_be_liberal; + int _nf_ct_tcp_loose; + int _nf_ct_tcp_max_retrans; + unsigned int _nf_ct_tcp_timeouts[10]; + struct ctl_table_header *_icmp_sysctl_header; + unsigned int _tcp_sysctl_table_users; + struct ctl_table_header *_tcp_sysctl_header; + unsigned int _udp_sysctl_table_users; + struct ctl_table_header *_udp_sysctl_header; + struct ctl_table_header *_icmpv6_sysctl_header; + struct ctl_table_header *_generic_sysctl_header; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + struct ctl_table_header *_icmp_compat_sysctl_header; + struct ctl_table_header *_tcp_compat_sysctl_header; + struct ctl_table_header *_udp_compat_sysctl_header; + struct ctl_table_header *_generic_compat_sysctl_header; +#endif + /* l4 protocols sysctl tables: */ + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmp; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmpv6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_generic; + struct nf_conntrack_l4proto **_nf_ct_protos[PF_MAX]; + /* l3 protocols sysctl tables: */ + struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv4; + struct nf_conntrack_l3proto *_nf_ct_l3protos[AF_MAX]; + /* sysctl standalone stuff: */ + struct ctl_table_header *_nf_ct_sysctl_header; + struct ctl_table_header *_nf_ct_netfilter_header; + ctl_table *_nf_ct_sysctl_table; + ctl_table *_nf_ct_netfilter_table; + ctl_table *_nf_ct_net_table; + ctl_table *_ip_ct_netfilter_table; + struct ctl_table_header *_ip_ct_sysctl_header; + int _nf_ct_log_invalid_proto_min; + int _nf_ct_log_invalid_proto_max; +#endif /* CONFIG_SYSCTL */ +}; +#endif +#endif + +struct ve_cpu_stats { + cycles_t idle_time; + cycles_t iowait_time; + cycles_t strt_idle_time; + cycles_t used_time; + seqcount_t stat_lock; + int nr_running; + int nr_unint; + int nr_iowait; + cputime64_t user; + cputime64_t nice; + cputime64_t system; +} 
____cacheline_aligned; + +struct ve_ipt_recent; +struct ve_xt_hashlimit; +struct svc_rqst; + +struct cgroup; +struct css_set; + +struct ve_struct { + struct list_head ve_list; + + envid_t veid; + struct list_head vetask_lh; + /* capability bounding set */ + kernel_cap_t ve_cap_bset; + atomic_t pcounter; + /* ref counter to ve from ipc */ + atomic_t counter; + unsigned int class_id; + struct rw_semaphore op_sem; + int is_running; + int is_locked; + atomic_t suspend; + /* see vzcalluser.h for VE_FEATURE_XXX definitions */ + __u64 features; + +/* VE's root */ + struct path root_path; + + struct file_system_type *proc_fstype; + struct vfsmount *proc_mnt; + struct proc_dir_entry *proc_root; + +/* BSD pty's */ +#ifdef CONFIG_LEGACY_PTYS + struct tty_driver *pty_driver; + struct tty_driver *pty_slave_driver; +#endif +#ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; + struct tty_driver *pts_driver; + struct ida *allocated_ptys; + struct file_system_type *devpts_fstype; + struct vfsmount *devpts_mnt; + struct dentry *devpts_root; + struct devpts_config *devpts_config; +#endif + + struct ve_nfs_context *nfs_context; + + struct file_system_type *shmem_fstype; + struct vfsmount *shmem_mnt; +#ifdef CONFIG_SYSFS + struct file_system_type *sysfs_fstype; + struct vfsmount *sysfs_mnt; + struct super_block *sysfs_sb; + struct sysfs_dirent *_sysfs_root; +#endif +#ifndef CONFIG_SYSFS_DEPRECATED + struct kobject *_virtual_dir; +#endif + struct kset *class_kset; + struct kset *devices_kset; + struct kobject *dev_kobj; + struct kobject *dev_char_kobj; + struct kobject *dev_block_kobj; + struct class *tty_class; + struct class *mem_class; + +#ifdef CONFIG_NET + struct class *net_class; +#ifdef CONFIG_INET + unsigned long rt_flush_required; +#endif +#endif +#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE) + struct veip_struct *veip; + struct net_device *_venet_dev; +#endif + +/* per VE CPU stats*/ + struct timespec start_timespec; + u64 start_jiffies; /* Deprecated */ + cycles_t start_cycles; + unsigned long avenrun[3]; /* loadavg data */ + + cycles_t cpu_used_ve; + struct kstat_lat_pcpu_struct sched_lat_ve; + +#ifdef CONFIG_INET + struct venet_stat *stat; +#ifdef CONFIG_VE_IPTABLES +/* core/netfilter.c virtualization */ + struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ + struct xt_table *_ve_ip6t_filter_pf; + struct xt_table *_ipt_mangle_table; + struct xt_table *_ip6t_mangle_table; + struct list_head _xt_tables[NPROTO]; + + __u64 ipt_mask; + __u64 _iptables_modules; + struct ve_nf_conntrack *_nf_conntrack; + struct ve_ipt_recent *_ipt_recent; + struct ve_xt_hashlimit *_xt_hashlimit; +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipstats_mib *_ipv6_statistics[2]; + struct icmpv6_mib *_icmpv6_statistics[2]; + struct icmpv6msg_mib *_icmpv6msg_statistics[2]; + struct udp_mib *_udp_stats_in6[2]; + struct udp_mib *_udplite_stats_in6[2]; +#endif +#endif + wait_queue_head_t *_log_wait; + unsigned *_log_start; + unsigned *_log_end; + unsigned *_logged_chars; + char *log_buf; +#define VE_DEFAULT_LOG_BUF_LEN 4096 + + struct ve_cpu_stats *cpu_stats; + unsigned long down_at; + struct list_head cleanup_list; +#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) + struct list_head _fuse_conn_list; + struct super_block *_fuse_control_sb; + + struct file_system_type *fuse_fs_type; + struct file_system_type *fuse_ctl_fs_type; +#endif + unsigned long jiffies_fixup; + unsigned char disable_net; + struct ve_monitor *monitor; + 
struct proc_dir_entry *monitor_proc; + unsigned long meminfo_val; + +#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) \ + || defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) + unsigned int _nlmsvc_users; + struct task_struct* _nlmsvc_task; + int _nlmsvc_grace_period; + unsigned long _nlmsvc_timeout; + struct svc_rqst* _nlmsvc_rqst; +#endif + + struct nsproxy *ve_ns; + struct net *ve_netns; + struct cgroup *ve_cgroup; + struct css_set *ve_css_set; +}; + +int init_ve_cgroups(struct ve_struct *ve); +void fini_ve_cgroups(struct ve_struct *ve); + +#define VE_CPU_STATS(ve, cpu) (per_cpu_ptr((ve)->cpu_stats, cpu)) + +extern int nr_ve; +extern struct proc_dir_entry *proc_vz_dir; +extern struct proc_dir_entry *glob_proc_vz_dir; + +#ifdef CONFIG_VE + +void do_update_load_avg_ve(void); +void do_env_free(struct ve_struct *ptr); + +static inline struct ve_struct *get_ve(struct ve_struct *ptr) +{ + if (ptr != NULL) + atomic_inc(&ptr->counter); + return ptr; +} + +static inline void put_ve(struct ve_struct *ptr) +{ + if (ptr && atomic_dec_and_test(&ptr->counter)) { + BUG_ON(atomic_read(&ptr->pcounter) > 0); + BUG_ON(ptr->is_running); + do_env_free(ptr); + } +} + +static inline void pget_ve(struct ve_struct *ptr) +{ + atomic_inc(&ptr->pcounter); +} + +void ve_cleanup_schedule(struct ve_struct *); +static inline void pput_ve(struct ve_struct *ptr) +{ + if (unlikely(atomic_dec_and_test(&ptr->pcounter))) + ve_cleanup_schedule(ptr); +} + +extern spinlock_t ve_cleanup_lock; +extern struct list_head ve_cleanup_list; +extern struct task_struct *ve_cleanup_thread; + +extern unsigned long long ve_relative_clock(struct timespec * ts); + +#ifdef CONFIG_FAIRSCHED +#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) +#else +#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) +#endif +#else /* CONFIG_VE */ +#define ve_utsname system_utsname +#define get_ve(ve) (NULL) +#define put_ve(ve) do { } while (0) +#define pget_ve(ve) do { } while (0) +#define pput_ve(ve) do { } while (0) +#endif /* CONFIG_VE */ + +#endif /* _LINUX_VE_H */ diff --git a/include/linux/ve_nfs.h b/include/linux/ve_nfs.h new file mode 100644 index 0000000..8f2e8f8 --- /dev/null +++ b/include/linux/ve_nfs.h @@ -0,0 +1,30 @@ +/* + * linux/include/ve_nfs.h + * + * VE context for NFS + * + * Copyright (C) 2007 SWsoft + */ + +#ifndef __VE_NFS_H__ +#define __VE_NFS_H__ + +#ifdef CONFIG_VE + +#include + +#define NFS_CTX_FIELD(arg) (get_exec_env()->_##arg) + +#else /* CONFIG_VE */ + +#define NFS_CTX_FIELD(arg) _##arg + +#endif /* CONFIG_VE */ + +#define nlmsvc_grace_period NFS_CTX_FIELD(nlmsvc_grace_period) +#define nlmsvc_timeout NFS_CTX_FIELD(nlmsvc_timeout) +#define nlmsvc_users NFS_CTX_FIELD(nlmsvc_users) +#define nlmsvc_task NFS_CTX_FIELD(nlmsvc_task) +#define nlmsvc_rqst NFS_CTX_FIELD(nlmsvc_rqst) + +#endif diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h new file mode 100644 index 0000000..26ca897 --- /dev/null +++ b/include/linux/ve_proto.h @@ -0,0 +1,89 @@ +/* + * include/linux/ve_proto.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
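/*
 * A minimal usage sketch (not from the patch; the helper name and the
 * sleeping work are hypothetical).  get_ve() pins a VE by bumping
 * ->counter; put_ve() drops the reference and, once the last one is gone
 * and no tasks remain inside (->pcounter), frees it via do_env_free().
 */
static int example_with_ve_pinned(struct ve_struct *env)
{
	struct ve_struct *ve;

	ve = get_ve(env);		/* take a reference (NULL-safe) */
	if (ve == NULL)
		return -EINVAL;

	/* ... work that may sleep and must not race with VE teardown ... */

	put_ve(ve);			/* release the reference */
	return 0;
}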
+ * + */ + +#ifndef __VE_H__ +#define __VE_H__ + +#ifdef CONFIG_VE + +struct ve_struct; + +#ifdef CONFIG_INET +void tcp_v4_kill_ve_sockets(struct ve_struct *envid); +#ifdef CONFIG_VE_NETDEV +int venet_init(void); +#endif +#endif + +extern struct list_head ve_list_head; +#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list) +extern rwlock_t ve_list_lock; +extern struct ve_struct *get_ve_by_id(envid_t); +extern struct ve_struct *__find_ve_by_id(envid_t); + +struct env_create_param3; +extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, + struct env_create_param3 *data, int datalen); +extern void ve_move_task(struct task_struct *, struct ve_struct *); + +int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned); +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); +int devperms_seq_show(struct seq_file *m, void *v); + +enum { + VE_SS_CHAIN, + + VE_MAX_CHAINS +}; + +typedef int ve_hook_init_fn(void *data); +typedef void ve_hook_fini_fn(void *data); + +struct ve_hook +{ + ve_hook_init_fn *init; + ve_hook_fini_fn *fini; + struct module *owner; + + /* Functions are called in ascending priority */ + int priority; + + /* Private part */ + struct list_head list; +}; + +enum { + HOOK_PRIO_DEFAULT = 0, + + HOOK_PRIO_FS = HOOK_PRIO_DEFAULT, + + HOOK_PRIO_NET_PRE, + HOOK_PRIO_NET, + HOOK_PRIO_NET_POST, + + HOOK_PRIO_AFTERALL = INT_MAX +}; + +void *ve_seq_start(struct seq_file *m, loff_t *pos); +void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos); +void ve_seq_stop(struct seq_file *m, void *v); + +extern int ve_hook_iterate_init(int chain, void *data); +extern void ve_hook_iterate_fini(int chain, void *data); + +extern void ve_hook_register(int chain, struct ve_hook *vh); +extern void ve_hook_unregister(struct ve_hook *vh); +#else /* CONFIG_VE */ +#define ve_hook_register(ch, vh) do { } while (0) +#define ve_hook_unregister(ve) do { } while (0) + +#define get_device_perms_ve(t, d, a) (0) +#endif /* CONFIG_VE */ +#endif diff --git a/include/linux/ve_task.h b/include/linux/ve_task.h new file mode 100644 index 0000000..4b7d722 --- /dev/null +++ b/include/linux/ve_task.h @@ -0,0 +1,73 @@ +/* + * include/linux/ve_task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
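/*
 * Sketch of a subsystem plugging into the VE start/stop chain declared
 * above (module and callback names are hypothetical; the usual module
 * boilerplate is assumed).  Hooks on VE_SS_CHAIN are invoked in ascending
 * ->priority order; the void * argument is whatever
 * ve_hook_iterate_init()/_fini() pass through for the environment.
 */
static int my_ss_init(void *data)
{
	/* allocate per-VE state for this subsystem; return 0 on success */
	return 0;
}

static void my_ss_fini(void *data)
{
	/* release whatever my_ss_init() set up */
}

static struct ve_hook my_ss_hook = {
	.init		= my_ss_init,
	.fini		= my_ss_fini,
	.owner		= THIS_MODULE,
	.priority	= HOOK_PRIO_DEFAULT,
};

/* on module load:	ve_hook_register(VE_SS_CHAIN, &my_ss_hook);	*/
/* on module unload:	ve_hook_unregister(&my_ss_hook);		*/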
+ * + */ + +#ifndef __VE_TASK_H__ +#define __VE_TASK_H__ + +#include +#include + +struct ve_task_info { +/* virtualization */ + struct ve_struct *owner_env; + struct ve_struct *exec_env; + struct ve_struct *saved_env; + struct list_head vetask_list; + struct dentry *glob_proc_dentry; +/* statistics: scheduling latency */ + cycles_t sleep_time; + cycles_t sched_time; + cycles_t sleep_stamp; + cycles_t wakeup_stamp; + seqcount_t wakeup_lock; +}; + +#define VE_TASK_INFO(task) (&(task)->ve_task_info) +#define VE_TASK_LIST_2_TASK(lh) \ + list_entry(lh, struct task_struct, ve_task_info.vetask_list) + +#ifdef CONFIG_VE +extern struct ve_struct ve0; +#define get_ve0() (&ve0) + +#define ve_save_context(t) do { \ + t->ve_task_info.saved_env = \ + t->ve_task_info.exec_env; \ + t->ve_task_info.exec_env = get_ve0(); \ + } while (0) +#define ve_restore_context(t) do { \ + t->ve_task_info.exec_env = \ + t->ve_task_info.saved_env; \ + } while (0) + +#define get_exec_env() (current->ve_task_info.exec_env) +#define set_exec_env(ve) ({ \ + struct ve_task_info *vi; \ + struct ve_struct *old, *new; \ + \ + vi = ¤t->ve_task_info; \ + old = vi->exec_env; \ + new = ve; \ + if (unlikely(new == NULL)) { \ + printk("%s: NULL exec env (%s)\n", __func__, #ve);\ + new = get_ve0(); \ + } \ + vi->exec_env = new; \ + old; \ + }) +#else +#define get_ve0() (NULL) +#define get_exec_env() (NULL) +#define set_exec_env(new_env) (NULL) +#define ve_save_context(t) do { } while (0) +#define ve_restore_context(t) do { } while (0) +#endif + +#endif /* __VE_TASK_H__ */ diff --git a/include/linux/veip.h b/include/linux/veip.h new file mode 100644 index 0000000..745f1ec --- /dev/null +++ b/include/linux/veip.h @@ -0,0 +1,15 @@ +#ifndef __VE_IP_H_ +#define __VE_IP_H_ + +struct ve_addr_struct { + int family; + __u32 key[4]; +}; + +struct sockaddr; + +extern void veaddr_print(char *, int, struct ve_addr_struct *); +extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr); + +#endif diff --git a/include/linux/venet.h b/include/linux/venet.h new file mode 100644 index 0000000..14cf89e --- /dev/null +++ b/include/linux/venet.h @@ -0,0 +1,86 @@ +/* + * include/linux/venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
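/*
 * Sketch of the exec-env switching pattern the macros above are built for
 * (the function is hypothetical; "ve" is assumed to be a pinned
 * struct ve_struct).  set_exec_env() returns the previous environment, so
 * temporary switches can nest and restore exactly what they replaced.
 */
static void example_run_in_ve(struct ve_struct *ve)
{
	struct ve_struct *old_env;

	old_env = set_exec_env(ve);	/* get_exec_env() now returns ve */
	/* ... code that consults get_exec_env() ... */
	(void)set_exec_env(old_env);	/* switch back */
}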
+ * + */ + +#ifndef _VENET_H +#define _VENET_H + +#include +#include +#include +#include +#include + +#define VEIP_HASH_SZ 512 + +struct ve_struct; +struct venet_stat; +struct venet_stats { + struct net_device_stats stats; + struct net_device_stats *real_stats; +}; + +struct ip_entry_struct +{ + struct ve_addr_struct addr; + struct ve_struct *active_env; + struct venet_stat *stat; + struct veip_struct *veip; + struct list_head ip_hash; + struct list_head ve_list; +}; + +struct veip_struct +{ + struct list_head src_lh; + struct list_head dst_lh; + struct list_head ip_lh; + struct list_head list; + envid_t veid; +}; + +static inline struct net_device_stats * +venet_stats(struct net_device *dev, int cpu) +{ + struct venet_stats *stats; + stats = (struct venet_stats*)dev->priv; + return per_cpu_ptr(stats->real_stats, cpu); +} + +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_unhash(struct ip_entry_struct *entry); +/* veip_hash_lock should be taken for read by caller */ +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *); + +/* veip_hash_lock should be taken for read by caller */ +struct veip_struct *veip_find(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +struct veip_struct *veip_findcreate(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +void veip_put(struct veip_struct *veip); + +extern struct list_head veip_lh; + +int veip_start(struct ve_struct *ve); +void veip_stop(struct ve_struct *ve); +__exit void veip_cleanup(void); +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); +int venet_change_skb_owner(struct sk_buff *skb); + +extern struct list_head ip_entry_hash_table[]; +extern rwlock_t veip_hash_lock; + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v); +#endif + +#endif diff --git a/include/linux/veprintk.h b/include/linux/veprintk.h new file mode 100644 index 0000000..5669d7b --- /dev/null +++ b/include/linux/veprintk.h @@ -0,0 +1,38 @@ +/* + * include/linux/veprintk.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_PRINTK_H__ +#define __VE_PRINTK_H__ + +#ifdef CONFIG_VE + +#define ve_log_wait (*(get_exec_env()->_log_wait)) +#define ve_log_start (*(get_exec_env()->_log_start)) +#define ve_log_end (*(get_exec_env()->_log_end)) +#define ve_logged_chars (*(get_exec_env()->_logged_chars)) +#define ve_log_buf (get_exec_env()->log_buf) +#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ + log_buf_len : VE_DEFAULT_LOG_BUF_LEN) +#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) +#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) + +#else + +#define ve_log_wait log_wait +#define ve_log_start log_start +#define ve_log_end log_end +#define ve_logged_chars logged_chars +#define ve_log_buf log_buf +#define ve_log_buf_len log_buf_len +#define VE_LOG_BUF_MASK LOG_BUF_MASK +#define VE_LOG_BUF(idx) LOG_BUF(idx) + +#endif /* CONFIG_VE */ +#endif /* __VE_PRINTK_H__ */ diff --git a/include/linux/veth.h b/include/linux/veth.h index 3354c1e..34cfe2b 100644 --- a/include/linux/veth.h +++ b/include/linux/veth.h @@ -1,3 +1,12 @@ +/* + * include/linux/veth.h + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
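/*
 * Sketch of how a get_stats-style handler could fold the per-CPU counters
 * kept behind venet_stats() into the aggregate structure returned to
 * userspace (the helper is hypothetical; only a few counters are shown).
 */
static struct net_device_stats *example_venet_get_stats(struct net_device *dev)
{
	struct venet_stats *stats = (struct venet_stats *)dev->priv;
	int cpu;

	memset(&stats->stats, 0, sizeof(stats->stats));
	for_each_possible_cpu(cpu) {
		struct net_device_stats *s = venet_stats(dev, cpu);

		stats->stats.rx_packets	+= s->rx_packets;
		stats->stats.tx_packets	+= s->tx_packets;
		stats->stats.rx_bytes	+= s->rx_bytes;
		stats->stats.tx_bytes	+= s->tx_bytes;
	}
	return &stats->stats;
}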
+ * + */ #ifndef __NET_VETH_H_ #define __NET_VETH_H_ @@ -9,4 +18,28 @@ enum { #define VETH_INFO_MAX (__VETH_INFO_MAX - 1) }; +#ifdef __KERNEL__ +struct veth_struct +{ + struct net_device_stats stats; + struct net_device *pair; + struct list_head hwaddr_list; + struct net_device_stats *real_stats; + int allow_mac_change; +}; + +#define veth_from_netdev(dev) \ + ((struct veth_struct *)(netdev_priv(dev))) +static inline struct net_device * veth_to_netdev(struct veth_struct *veth) +{ + return (struct net_device *)((char *)veth - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST)); +} +#endif + +static inline struct net_device_stats * +veth_stats(struct net_device *dev, int cpuid) +{ + return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid); +} + #endif diff --git a/include/linux/virtinfo.h b/include/linux/virtinfo.h new file mode 100644 index 0000000..b0dad07 --- /dev/null +++ b/include/linux/virtinfo.h @@ -0,0 +1,100 @@ +/* + * include/linux/virtinfo.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __LINUX_VIRTINFO_H +#define __LINUX_VIRTINFO_H + +#include +#include +#include + +struct vnotifier_block +{ + int (*notifier_call)(struct vnotifier_block *self, + unsigned long, void *, int); + struct vnotifier_block *next; + int priority; +}; + +extern struct semaphore virtinfo_sem; +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); +int virtinfo_notifier_call(int type, unsigned long n, void *data); + +struct page_info { + unsigned long nr_file_dirty; + unsigned long nr_writeback; + unsigned long nr_anon_pages; + unsigned long nr_file_mapped; + unsigned long nr_slab_rec; + unsigned long nr_slab_unrec; + unsigned long nr_pagetable; + unsigned long nr_unstable_nfs; + unsigned long nr_bounce; + unsigned long nr_writeback_temp; +}; + +struct meminfo { + struct sysinfo si; + struct page_info pi; + unsigned long active, inactive; + unsigned long cache, swapcache; + unsigned long committed_space; + unsigned long allowed; + unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; +}; + +#define VIRTINFO_MEMINFO 0 +#define VIRTINFO_ENOUGHMEM 1 +#define VIRTINFO_DOFORK 2 +#define VIRTINFO_DOEXIT 3 +#define VIRTINFO_DOEXECVE 4 +#define VIRTINFO_DOFORKRET 5 +#define VIRTINFO_DOFORKPOST 6 +#define VIRTINFO_EXIT 7 +#define VIRTINFO_EXITMMAP 8 +#define VIRTINFO_EXECMMAP 9 +#define VIRTINFO_OUTOFMEM 10 +#define VIRTINFO_PAGEIN 11 +#define VIRTINFO_SYSINFO 12 +#define VIRTINFO_NEWUBC 13 +#define VIRTINFO_VMSTAT 14 + +enum virt_info_types { + VITYPE_GENERAL, + VITYPE_FAUDIT, + VITYPE_QUOTA, + VITYPE_SCP, + + VIRT_TYPES +}; + +#ifdef CONFIG_VZ_GENCALLS + +static inline int virtinfo_gencall(unsigned long n, void *data) +{ + int r; + + r = virtinfo_notifier_call(VITYPE_GENERAL, n, data); + if (r & NOTIFY_FAIL) + return -ENOBUFS; + if (r & NOTIFY_OK) + return -ERESTARTNOINTR; + return 0; +} + +#else + +#define virtinfo_gencall(n, data) 0 + +#endif + +#endif /* __LINUX_VIRTINFO_H */ diff --git a/include/linux/virtinfoscp.h b/include/linux/virtinfoscp.h new file mode 100644 index 0000000..9e7584f --- /dev/null +++ b/include/linux/virtinfoscp.h @@ -0,0 +1,21 @@ +#ifndef __VIRTINFO_SCP_H__ +#define __VIRTINFO_SCP_H__ + +/* + * Dump and restore operations are non-symmetric. 
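/*
 * Sketch: observing generic virtinfo events from a module (callback and
 * block names are hypothetical; treating the final int argument as the
 * previous return value is an assumption).  virtinfo_gencall() above
 * translates NOTIFY_FAIL into -ENOBUFS and NOTIFY_OK into -ERESTARTNOINTR.
 */
static int my_virtinfo_call(struct vnotifier_block *self,
		unsigned long event, void *data, int old_ret)
{
	if (event == VIRTINFO_DOFORK)
		/* inspect or veto the fork request described by data */ ;
	return old_ret;
}

static struct vnotifier_block my_vnb = {
	.notifier_call	= my_virtinfo_call,
	.priority	= 0,
};

/* virtinfo_notifier_register(VITYPE_GENERAL, &my_vnb); */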
+ * With respect to finish/fail hooks, 2 dump hooks are called from + * different proc operations, but restore hooks are called from a single one. + */ +#define VIRTINFO_SCP_COLLECT 0x10 +#define VIRTINFO_SCP_DUMP 0x11 +#define VIRTINFO_SCP_DMPFIN 0x12 +#define VIRTINFO_SCP_RSTCHECK 0x13 +#define VIRTINFO_SCP_RESTORE 0x14 +#define VIRTINFO_SCP_RSTFAIL 0x15 + +#define VIRTINFO_SCP_RSTTSK 0x20 +#define VIRTINFO_SCP_RSTMM 0x21 + +#define VIRTNOTIFY_CHANGE 0x100 + +#endif /* __VIRTINFO_SCP_H__ */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 328eb40..f90025c 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,6 +22,10 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif +/* align size to 2^n page boundary */ +#define POWER2_PAGE_ALIGN(size) \ + ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) + struct vm_struct { /* keep next,addr,size together to speedup lookups */ struct vm_struct *next; @@ -38,12 +42,16 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *ub_vmalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); +extern void *ub_vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *vmalloc_best(unsigned long size); +extern void *ub_vmalloc_best(unsigned long size); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); extern void vfree(const void *addr); @@ -71,6 +79,9 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct * get_vm_area_best(unsigned long size, + unsigned long flags); +extern void vprintstat(void); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 58334d4..75ace44 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -98,6 +98,7 @@ static inline void vm_events_fold_cpu(int cpu) } #endif +extern unsigned long vm_events(enum vm_event_item i); #else /* Disable counters */ @@ -120,6 +121,7 @@ static inline void vm_events_fold_cpu(int cpu) { } +static inline unsigned long vm_events(enum vm_event_item i) { return 0; } #endif /* CONFIG_VM_EVENT_COUNTERS */ #define __count_zone_vm_events(item, zone, delta) \ diff --git a/include/linux/vzcalluser.h b/include/linux/vzcalluser.h new file mode 100644 index 0000000..46c04e6 --- /dev/null +++ b/include/linux/vzcalluser.h @@ -0,0 +1,198 @@ +/* + * include/linux/vzcalluser.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _LINUX_VZCALLUSER_H +#define _LINUX_VZCALLUSER_H + +#include +#include +#include + +#define KERN_VZ_PRIV_RANGE 51 + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +#ifndef __KERNEL__ +#define __user +#endif + +/* + * VE management ioctls + */ + +struct vzctl_old_env_create { + envid_t veid; + unsigned flags; +#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ +#define VE_EXCLUSIVE 2 /* Fail if exists */ +#define VE_ENTER 4 /* Enter existing VE */ +#define VE_TEST 8 /* Test if VE exists */ +#define VE_LOCK 16 /* Do not allow entering created VE */ +#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ + __u32 addr; +}; + +struct vzctl_mark_env_to_down { + envid_t veid; +}; + +struct vzctl_setdevperms { + envid_t veid; + unsigned type; +#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ +#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ +#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ + unsigned dev; + unsigned mask; +}; + +struct vzctl_ve_netdev { + envid_t veid; + int op; +#define VE_NETDEV_ADD 1 +#define VE_NETDEV_DEL 2 + char __user *dev_name; +}; + +struct vzctl_ve_meminfo { + envid_t veid; + unsigned long val; +}; + +struct vzctl_env_create_cid { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct vzctl_env_create { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct env_create_param { + __u64 iptables_mask; +}; + +#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) + +struct env_create_param2 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; /* 0 - don't care, same as in host */ +}; + +struct env_create_param3 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; + __u32 pad; + __u64 known_features; +}; + +#define VE_FEATURE_SYSFS (1ULL << 0) +#define VE_FEATURE_NFS (1ULL << 1) +#define VE_FEATURE_DEF_PERMS (1ULL << 2) +#define VE_FEATURE_SIT (1ULL << 3) +#define VE_FEATURE_IPIP (1ULL << 4) + +#define VE_FEATURES_OLD (VE_FEATURE_SYSFS) +#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | \ + VE_FEATURE_DEF_PERMS) + +typedef struct env_create_param3 env_create_param_t; +#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(env_create_param_t) + +struct vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + env_create_param_t __user *data; + int datalen; +}; + +struct vz_load_avg { + int val_int; + int val_frac; +}; + +struct vz_cpu_stat { + unsigned long user_jif; + unsigned long nice_jif; + unsigned long system_jif; + unsigned long uptime_jif; + __u64 idle_clk; + __u64 strv_clk; + __u64 uptime_clk; + struct vz_load_avg avenrun[3]; /* loadavg data */ +}; + +struct vzctl_cpustatctl { + envid_t veid; + struct vz_cpu_stat __user *cpustat; +}; + +#define VZCTLTYPE '.' 
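/*
 * Userspace sketch (assumptions: /dev/vzctl is the control node, the
 * vzcalluser.h and vziptable_defs.h headers are installed for userspace,
 * and error handling is trimmed).  Shows how the create-with-data ioctl
 * defined below is meant to be filled in.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vzcalluser.h>
#include <linux/vziptable_defs.h>

int example_create_ve(envid_t veid)
{
	struct vzctl_env_create_data cd;
	env_create_param_t prm;
	int fd, ret;

	fd = open("/dev/vzctl", O_RDWR);
	if (fd < 0)
		return -1;

	memset(&prm, 0, sizeof(prm));
	prm.iptables_mask = VE_IP_DEFAULT;	/* safe default mask */
	prm.feature_mask  = VE_FEATURES_DEF;

	memset(&cd, 0, sizeof(cd));
	cd.veid	    = veid;
	cd.flags    = VE_CREATE | VE_EXCLUSIVE;
	cd.class_id = 0;
	cd.data	    = &prm;
	cd.datalen  = sizeof(prm);

	ret = ioctl(fd, VZCTL_ENV_CREATE_DATA, &cd);
	close(fd);
	return ret;
}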
+#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ + struct vzctl_old_env_create) +#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ + struct vzctl_mark_env_to_down) +#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ + struct vzctl_setdevperms) +#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ + struct vzctl_env_create_cid) +#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ + struct vzctl_env_create) +#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ + struct vzctl_cpustatctl) +#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct vzctl_env_create_data) +#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct vzctl_ve_netdev) +#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct vzctl_ve_meminfo) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +#include + +struct compat_vzctl_ve_netdev { + envid_t veid; + int op; + compat_uptr_t dev_name; +}; + +struct compat_vzctl_ve_meminfo { + envid_t veid; + compat_ulong_t val; +}; + +struct compat_vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + compat_uptr_t data; + int datalen; +}; + +#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct compat_vzctl_env_create_data) +#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct compat_vzctl_ve_netdev) +#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct compat_vzctl_ve_meminfo) +#endif +#endif + +#endif diff --git a/include/linux/vzctl.h b/include/linux/vzctl.h new file mode 100644 index 0000000..ad967ed --- /dev/null +++ b/include/linux/vzctl.h @@ -0,0 +1,30 @@ +/* + * include/linux/vzctl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VZCTL_H +#define _LINUX_VZCTL_H + +#include + +struct module; +struct inode; +struct file; +struct vzioctlinfo { + unsigned type; + int (*ioctl)(struct file *, unsigned int, unsigned long); + int (*compat_ioctl)(struct file *, unsigned int, unsigned long); + struct module *owner; + struct list_head list; +}; + +extern void vzioctl_register(struct vzioctlinfo *inf); +extern void vzioctl_unregister(struct vzioctlinfo *inf); + +#endif diff --git a/include/linux/vzctl_quota.h b/include/linux/vzctl_quota.h new file mode 100644 index 0000000..6d36cdd --- /dev/null +++ b/include/linux/vzctl_quota.h @@ -0,0 +1,74 @@ +/* + * include/linux/vzctl_quota.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __LINUX_VZCTL_QUOTA_H__ +#define __LINUX_VZCTL_QUOTA_H__ + +#include + +#ifndef __KERNEL__ +#define __user +#endif + +/* + * Quota management ioctl + */ + +struct vz_quota_stat; +struct vzctl_quotactl { + int cmd; + unsigned int quota_id; + struct vz_quota_stat __user *qstat; + char __user *ve_root; +}; + +struct vzctl_quotaugidctl { + int cmd; /* subcommand */ + unsigned int quota_id; /* quota id where it applies to */ + unsigned int ugid_index;/* for reading statistic. 
index of first + uid/gid record to read */ + unsigned int ugid_size; /* size of ugid_buf array */ + void *addr; /* user-level buffer */ +}; + +#define VZDQCTLTYPE '+' +#define VZCTL_QUOTA_DEPR_CTL _IOWR(VZDQCTLTYPE, 1, \ + struct vzctl_quotactl) +#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ + struct vzctl_quotactl) +#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ + struct vzctl_quotaugidctl) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +struct compat_vzctl_quotactl { + int cmd; + unsigned int quota_id; + compat_uptr_t qstat; + compat_uptr_t ve_root; +}; + +struct compat_vzctl_quotaugidctl { + int cmd; /* subcommand */ + unsigned int quota_id; /* quota id where it applies to */ + unsigned int ugid_index;/* for reading statistic. index of first + uid/gid record to read */ + unsigned int ugid_size; /* size of ugid_buf array */ + compat_uptr_t addr; /* user-level buffer */ +}; + +#define VZCTL_COMPAT_QUOTA_CTL _IOWR(VZDQCTLTYPE, 2, \ + struct compat_vzctl_quotactl) +#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ + struct compat_vzctl_quotaugidctl) +#endif +#endif + +#endif /* __LINUX_VZCTL_QUOTA_H__ */ diff --git a/include/linux/vzctl_venet.h b/include/linux/vzctl_venet.h new file mode 100644 index 0000000..4797a50 --- /dev/null +++ b/include/linux/vzctl_venet.h @@ -0,0 +1,51 @@ +/* + * include/linux/vzctl_venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZCTL_VENET_H +#define _VZCTL_VENET_H + +#include +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_ip_map { + envid_t veid; + int op; +#define VE_IP_ADD 1 +#define VE_IP_DEL 2 + struct sockaddr *addr; + int addrlen; +}; + +#define VENETCTLTYPE '(' + +#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct vzctl_ve_ip_map) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +struct compat_vzctl_ve_ip_map { + envid_t veid; + int op; + compat_uptr_t addr; + int addrlen; +}; + +#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct compat_vzctl_ve_ip_map) +#endif +#endif + +#endif diff --git a/include/linux/vzctl_veth.h b/include/linux/vzctl_veth.h new file mode 100644 index 0000000..1480c5b --- /dev/null +++ b/include/linux/vzctl_veth.h @@ -0,0 +1,42 @@ +/* + * include/linux/vzctl_veth.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZCTL_VETH_H +#define _VZCTL_VETH_H + +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_hwaddr { + envid_t veid; + int op; +#define VE_ETH_ADD 1 +#define VE_ETH_DEL 2 +#define VE_ETH_ALLOW_MAC_CHANGE 3 +#define VE_ETH_DENY_MAC_CHANGE 4 + unsigned char dev_addr[6]; + int addrlen; + char dev_name[16]; + unsigned char dev_addr_ve[6]; + int addrlen_ve; + char dev_name_ve[16]; +}; + +#define VETHCTLTYPE '[' + +#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \ + struct vzctl_ve_hwaddr) + +#endif diff --git a/include/linux/vzdq_tree.h b/include/linux/vzdq_tree.h new file mode 100644 index 0000000..c019e09 --- /dev/null +++ b/include/linux/vzdq_tree.h @@ -0,0 +1,99 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * This file contains Virtuozzo disk quota tree definition + */ + +#ifndef _VZDQ_TREE_H +#define _VZDQ_TREE_H + +#include +#include + +typedef unsigned int quotaid_t; +#define QUOTAID_BITS 32 +#define QUOTAID_BBITS 4 +#define QUOTAID_EBITS 8 + +#if QUOTAID_EBITS % QUOTAID_BBITS +#error Quota bit assumption failure +#endif + +#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) +#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) +#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ + / QUOTAID_BBITS) +#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ + / QUOTAID_EBITS) +#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) + +/* + * Depth of keeping unused node (not inclusive). + * 0 means release all nodes including root, + * QUOTATREE_DEPTH means never release nodes. + * Current value: release all nodes strictly after QUOTATREE_EDEPTH + * (measured in external shift units). + */ +#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ + - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ + + 1) + +/* + * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. + * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), + * and each node contains 2^QUOTAID_BBITS pointers. + * Level 0 is a (single) tree root node. + * + * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. + * Nodes of lower levels contain pointers to nodes. + * + * Double pointer in array of i-level node, pointing to a (i+1)-level node + * (such as inside quotatree_find_state) are marked by level (i+1), not i. + * Level 0 double pointer is a pointer to root inside tree struct. + * + * The tree is permanent, i.e. all index blocks allocated are keeped alive to + * preserve the blocks numbers in the quota file tree to keep its changes + * locally. + */ +struct quotatree_node { + struct list_head list; + quotaid_t num; + void *blocks[QUOTATREE_BSIZE]; +}; + +struct quotatree_level { + struct list_head usedlh, freelh; + quotaid_t freenum; +}; + +struct quotatree_tree { + struct quotatree_level levels[QUOTATREE_DEPTH]; + struct quotatree_node *root; + unsigned int leaf_num; +}; + +struct quotatree_find_state { + void **block; + int level; +}; + +/* number of leafs (objects) and leaf level of the tree */ +#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) +#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) + +struct quotatree_tree *quotatree_alloc(void); +void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st); +int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st, void *data); +void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); +void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); +void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); +void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index); + +#endif /* _VZDQ_TREE_H */ + diff --git a/include/linux/vzevent.h b/include/linux/vzevent.h new file mode 100644 index 0000000..1a67297 --- /dev/null +++ b/include/linux/vzevent.h @@ -0,0 +1,13 @@ +#ifndef __LINUX_VZ_EVENT_H__ +#define __LINUX_VZ_EVENT_H__ + +#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE) +extern int vzevent_send(int msg, const char *attrs_fmt, ...); +#else +static inline int vzevent_send(int msg, const char *attrs_fmt, ...) 
+{ + return 0; +} +#endif + +#endif /* __LINUX_VZ_EVENT_H__ */ diff --git a/include/linux/vziptable_defs.h b/include/linux/vziptable_defs.h new file mode 100644 index 0000000..ec7586f --- /dev/null +++ b/include/linux/vziptable_defs.h @@ -0,0 +1,51 @@ +#ifndef _LINUX_VZIPTABLE_DEFS_H +#define _LINUX_VZIPTABLE_DEFS_H + +/* these masks represent modules */ +#define VE_IP_IPTABLES_MOD (1U<<0) +#define VE_IP_FILTER_MOD (1U<<1) +#define VE_IP_MANGLE_MOD (1U<<2) +#define VE_IP_CONNTRACK_MOD (1U<<14) +#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) +#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) +#define VE_IP_NAT_MOD (1U<<20) +#define VE_IP_NAT_FTP_MOD (1U<<21) +#define VE_IP_NAT_IRC_MOD (1U<<22) +#define VE_IP_IPTABLES6_MOD (1U<<26) +#define VE_IP_FILTER6_MOD (1U<<27) +#define VE_IP_MANGLE6_MOD (1U<<28) +#define VE_IP_IPTABLE_NAT_MOD (1U<<29) +#define VE_NF_CONNTRACK_MOD (1U<<30) + +/* these masks represent modules with their dependences */ +#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) +#define VE_IP_FILTER (VE_IP_FILTER_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_IPTABLES6 (VE_IP_IPTABLES6_MOD) +#define VE_IP_FILTER6 (VE_IP_FILTER6_MOD | VE_IP_IPTABLES6) +#define VE_IP_MANGLE6 (VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6) +#define VE_NF_CONNTRACK (VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES) +#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT (VE_IP_NAT_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_FTP) +#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_IRC) +#define VE_IP_IPTABLE_NAT (VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK) + +/* safe iptables mask to be used by default */ +#define VE_IP_DEFAULT \ + (VE_IP_IPTABLES | \ + VE_IP_FILTER | VE_IP_MANGLE) + +#define VE_IPT_CMP(x, y) (((x) & (y)) == (y)) + +#endif /* _LINUX_VZIPTABLE_DEFS_H */ diff --git a/include/linux/vzquota.h b/include/linux/vzquota.h new file mode 100644 index 0000000..e16605e --- /dev/null +++ b/include/linux/vzquota.h @@ -0,0 +1,379 @@ +/* + * + * Copyright (C) 2001-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
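/*
 * Sketch of how the dependency masks above compose and are tested
 * (the function is hypothetical).  VE_IPT_CMP() succeeds only when the
 * wanted feature and all of its dependencies are present in the mask.
 */
static int example_iptables_mask_checks(void)
{
	__u64 mask = VE_IP_DEFAULT | VE_IP_CONNTRACK;

	/* true: conntrack and the iptables core it depends on are granted */
	if (!VE_IPT_CMP(mask, VE_IP_CONNTRACK))
		return -1;

	/* false: VE_IP_NAT_MOD was never added, so NAT is not allowed */
	if (VE_IPT_CMP(mask, VE_IP_NAT))
		return -1;

	return 0;
}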
+ * + * This file contains Virtuozzo disk quota implementation + */ + +#ifndef _VZDQUOTA_H +#define _VZDQUOTA_H + +#include +#include + +/* vzquotactl syscall commands */ +#define VZ_DQ_CREATE 5 /* create quota master block */ +#define VZ_DQ_DESTROY 6 /* destroy qmblk */ +#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */ +#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */ +#define VZ_DQ_SETLIMIT 9 /* set new limits */ +#define VZ_DQ_GETSTAT 10 /* get usage statistic */ +#define VZ_DQ_OFF_FORCED 11 /* forced off */ +/* set of syscalls to maintain UGID quotas */ +#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */ +#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */ +#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */ +#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */ +#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */ +#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */ +#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */ +#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */ + +/* common structure for vz and ugid quota */ +struct dq_stat { + /* blocks limits */ + __u64 bhardlimit; /* absolute limit in bytes */ + __u64 bsoftlimit; /* preferred limit in bytes */ + time_t btime; /* time limit for excessive disk use */ + __u64 bcurrent; /* current bytes count */ + /* inodes limits */ + __u32 ihardlimit; /* absolute limit on allocated inodes */ + __u32 isoftlimit; /* preferred inode limit */ + time_t itime; /* time limit for excessive inode use */ + __u32 icurrent; /* current # allocated inodes */ +}; + +/* One second resolution for grace times */ +#define CURRENT_TIME_SECONDS (get_seconds()) + +/* Values for dq_info->flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +struct dq_info { + time_t bexpire; /* expire timeout for excessive disk use */ + time_t iexpire; /* expire timeout for excessive inode use */ + unsigned flags; /* see previos defines */ +}; + +struct vz_quota_stat { + struct dq_stat dq_stat; + struct dq_info dq_info; +}; + +/* UID/GID interface record - for user-kernel level exchange */ +struct vz_quota_iface { + unsigned int qi_id; /* UID/GID this applies to */ + unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ + struct dq_stat qi_stat; /* limits, options, usage stats */ +}; + +#ifdef CONFIG_COMPAT +#include +struct compat_dq_stat { + /* blocks limits */ + __u64 bhardlimit; /* absolute limit in bytes */ + __u64 bsoftlimit; /* preferred limit in bytes */ + compat_time_t btime; /* time limit for excessive disk use */ + __u64 bcurrent; /* current bytes count */ + /* inodes limits */ + __u32 ihardlimit; /* absolute limit on allocated inodes */ + __u32 isoftlimit; /* preferred inode limit */ + compat_time_t itime; /* time limit for excessive inode use */ + __u32 icurrent; /* current # allocated inodes */ +}; + +struct compat_dq_info { + compat_time_t bexpire; /* expire timeout for excessive disk use */ + compat_time_t iexpire; /* expire timeout for excessive inode use */ + unsigned flags; /* see previos defines */ +}; + +struct compat_vz_quota_stat { + struct compat_dq_stat dq_stat; + struct compat_dq_info dq_info; +}; + +struct compat_vz_quota_iface { + unsigned int qi_id; /* UID/GID this applies to */ + unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ + struct compat_dq_stat qi_stat; /* limits, options, usage stats */ +}; + +static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs, + 
struct dq_stat *dqs) +{ + dqs->bhardlimit = odqs->bhardlimit; + dqs->bsoftlimit = odqs->bsoftlimit; + dqs->bcurrent = odqs->bcurrent; + dqs->btime = odqs->btime; + + dqs->ihardlimit = odqs->ihardlimit; + dqs->isoftlimit = odqs->isoftlimit; + dqs->icurrent = odqs->icurrent; + dqs->itime = odqs->itime; +} + +static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi, + struct dq_info *dqi) +{ + dqi->bexpire = odqi->bexpire; + dqi->iexpire = odqi->iexpire; + dqi->flags = odqi->flags; +} + +static inline void dqstat2compat_dqstat(struct dq_stat *dqs, + struct compat_dq_stat *odqs) +{ + odqs->bhardlimit = dqs->bhardlimit; + odqs->bsoftlimit = dqs->bsoftlimit; + odqs->bcurrent = dqs->bcurrent; + odqs->btime = (compat_time_t)dqs->btime; + + odqs->ihardlimit = dqs->ihardlimit; + odqs->isoftlimit = dqs->isoftlimit; + odqs->icurrent = dqs->icurrent; + odqs->itime = (compat_time_t)dqs->itime; +} + +static inline void dqinfo2compat_dqinfo(struct dq_info *dqi, + struct compat_dq_info *odqi) +{ + odqi->bexpire = (compat_time_t)dqi->bexpire; + odqi->iexpire = (compat_time_t)dqi->iexpire; + odqi->flags = dqi->flags; +} +#endif + +/* values for flags and dq_flags */ +/* this flag is set if the userspace has been unable to provide usage + * information about all ugids + * if the flag is set, we don't allocate new UG quota blocks (their + * current usage is unknown) or free existing UG quota blocks (not to + * lose information that this block is ok) */ +#define VZDQUG_FIXED_SET 0x01 +/* permit to use ugid quota */ +#define VZDQUG_ON 0x02 +#define VZDQ_USRQUOTA 0x10 +#define VZDQ_GRPQUOTA 0x20 +#define VZDQ_NOACT 0x1000 /* not actual */ +#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ + +struct vz_quota_ugid_stat { + unsigned int limit; /* max amount of ugid records */ + unsigned int count; /* amount of ugid records */ + unsigned int flags; +}; + +struct vz_quota_ugid_setlimit { + unsigned int type; /* quota type (USR/GRP) */ + unsigned int id; /* ugid */ + struct if_dqblk dqb; /* limits info */ +}; + +struct vz_quota_ugid_setinfo { + unsigned int type; /* quota type (USR/GRP) */ + struct if_dqinfo dqi; /* grace info */ +}; + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +/* Values for dq_info flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +/* values for dq_state */ +#define VZDQ_STARTING 0 /* created, not turned on yet */ +#define VZDQ_WORKING 1 /* quota created, turned on */ +#define VZDQ_STOPING 2 /* created, turned on and off */ + +/* master quota record - one per veid */ +struct vz_quota_master { + struct list_head dq_hash; /* next quota in hash list */ + atomic_t dq_count; /* inode reference count */ + unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ + unsigned int dq_state; /* see values above */ + unsigned int dq_id; /* VEID this applies to */ + struct dq_stat dq_stat; /* limits, grace, usage stats */ + struct dq_info dq_info; /* grace times and flags */ + spinlock_t dq_data_lock; /* for dq_stat */ + + struct semaphore dq_sem; /* semaphore to protect + ugid tree */ + + struct list_head dq_ilink_list; /* list of vz_quota_ilink */ + struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ + struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ + unsigned int dq_ugid_count; /* amount of ugid records */ + unsigned int dq_ugid_max; /* max amount of ugid records */ + struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ + + 
struct path dq_root_path; /* path of fs tree */ + struct super_block *dq_sb; /* superblock of our quota root */ +}; + +/* UID/GID quota record - one per pair (quota_master, uid or gid) */ +struct vz_quota_ugid { + unsigned int qugid_id; /* UID/GID this applies to */ + struct dq_stat qugid_stat; /* limits, options, usage stats */ + int qugid_type; /* USRQUOTA|GRPQUOTA */ + atomic_t qugid_count; /* reference count */ +}; + +#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) + +struct vz_quota_datast { + struct vz_quota_ilink qlnk; +}; + +#define VIRTINFO_QUOTA_GETSTAT 0 +#define VIRTINFO_QUOTA_ON 1 +#define VIRTINFO_QUOTA_OFF 2 +#define VIRTINFO_QUOTA_DISABLE 3 + +struct virt_info_quota { + struct super_block *super; + struct dq_stat *qstat; +}; + +/* + * Interface to VZ quota core + */ +#define INODE_QLNK(inode) (&(inode)->i_qlnk) +#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) + +#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) + +#define VZ_QUOTAO_SETE 1 +#define VZ_QUOTAO_INIT 2 +#define VZ_QUOTAO_DESTR 3 +#define VZ_QUOTAO_SWAP 4 +#define VZ_QUOTAO_INICAL 5 +#define VZ_QUOTAO_DRCAL 6 +#define VZ_QUOTAO_QSET 7 +#define VZ_QUOTAO_TRANS 8 +#define VZ_QUOTAO_ACT 9 +#define VZ_QUOTAO_DTREE 10 +#define VZ_QUOTAO_DET 11 +#define VZ_QUOTAO_ON 12 +#define VZ_QUOTAO_RE_LOCK 13 + +#define DQUOT_CMD_ALLOC 0 +#define DQUOT_CMD_PREALLOC 1 +#define DQUOT_CMD_CHECK 12 +#define DQUOT_CMD_FORCE 13 + +extern struct semaphore vz_quota_sem; +void inode_qmblk_lock(struct super_block *sb); +void inode_qmblk_unlock(struct super_block *sb); +void qmblk_data_read_lock(struct vz_quota_master *qmblk); +void qmblk_data_read_unlock(struct vz_quota_master *qmblk); +void qmblk_data_write_lock(struct vz_quota_master *qmblk); +void qmblk_data_write_unlock(struct vz_quota_master *qmblk); + +/* for quota operations */ +void vzquota_inode_init_call(struct inode *inode); +void vzquota_inode_drop_call(struct inode *inode); +int vzquota_inode_transfer_call(struct inode *, struct iattr *); +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *); +void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir); +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); +/* for second-level quota */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +/* for management operations */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat); +void vzquota_free_master(struct vz_quota_master *); +struct vz_quota_master *vzquota_find_master(unsigned int quota_id); +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *buf); +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *buf, int force); +int vzquota_get_super(struct super_block *sb); +void vzquota_put_super(struct super_block *sb); + +static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) +{ + if (!atomic_read(&qmblk->dq_count)) + BUG(); + atomic_inc(&qmblk->dq_count); + return qmblk; +} + +static inline void __qmblk_put(struct vz_quota_master *qmblk) +{ + atomic_dec(&qmblk->dq_count); +} + +static inline void qmblk_put(struct vz_quota_master *qmblk) +{ + if (!atomic_dec_and_test(&qmblk->dq_count)) + return; + vzquota_free_master(qmblk); +} + +extern struct list_head vzquota_hash_table[]; +extern int 
vzquota_hash_size; + +/* + * Interface to VZ UGID quota + */ +extern struct quotactl_ops vz_quotactl_operations; +extern struct dquot_operations vz_quota_operations2; +extern struct quota_format_type vz_quota_empty_v2_format; + +#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \ + qmblk->dq_uid_tree : \ + qmblk->dq_gid_tree) + +#define VZDQUG_FIND_DONT_ALLOC 1 +#define VZDQUG_FIND_FAKE 2 +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid); +void vzquota_kill_ugid(struct vz_quota_master *qmblk); +int vzquota_ugid_init(void); +void vzquota_ugid_release(void); +int vzquota_transfer_usage(struct inode *inode, int mask, + struct vz_quota_ilink *qlnk); +void vzquota_inode_off(struct inode *inode); + +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat); + +/* + * Other VZ quota parts + */ +extern struct dquot_operations vz_quota_operations; + +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat); +int vzquota_proc_init(void); +void vzquota_proc_release(void); +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +extern struct semaphore vz_quota_sem; + +void vzaquota_init(void); +void vzaquota_fini(void); + +#endif /* __KERNEL__ */ + +#endif /* _VZDQUOTA_H */ diff --git a/include/linux/vzquota_qlnk.h b/include/linux/vzquota_qlnk.h new file mode 100644 index 0000000..2788c41 --- /dev/null +++ b/include/linux/vzquota_qlnk.h @@ -0,0 +1,25 @@ +/* + * include/linux/vzquota_qlnk.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZDQUOTA_QLNK_H +#define _VZDQUOTA_QLNK_H + +struct vz_quota_master; +struct vz_quota_ugid; + +/* inode link, used to track inodes using quota via dq_ilink_list */ +struct vz_quota_ilink { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *qugid[MAXQUOTAS]; + struct list_head list; + unsigned char origin[2]; +}; + +#endif /* _VZDQUOTA_QLNK_H */ diff --git a/include/linux/vzratelimit.h b/include/linux/vzratelimit.h new file mode 100644 index 0000000..f26baad --- /dev/null +++ b/include/linux/vzratelimit.h @@ -0,0 +1,28 @@ +/* + * include/linux/vzratelimit.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZ_RATELIMIT_H__ +#define __VZ_RATELIMIT_H__ + +/* + * Generic ratelimiting stuff. + */ + +struct vz_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. */ +int vz_ratelimit(struct vz_rate_info *p); + +#endif /* __VZ_RATELIMIT_H__ */ diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h new file mode 100644 index 0000000..5c23ea4 --- /dev/null +++ b/include/linux/vzstat.h @@ -0,0 +1,182 @@ +/* + * include/linux/vzstat.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
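/*
 * Sketch of the intended vz_rate_info usage (the warning helper is
 * hypothetical; HZ comes from the usual kernel headers).  interval is
 * measured in jiffies per event, burst allows short spikes above that.
 */
static struct vz_rate_info example_warn_ri = {
	.burst		= 10,
	.interval	= 5 * HZ,	/* roughly one event per 5 seconds */
};

static void example_rate_limited_warning(void)
{
	if (vz_ratelimit(&example_warn_ri))
		printk("example: rate-limited warning\n");
}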
+ * + */ + +#ifndef __VZSTAT_H__ +#define __VZSTAT_H__ + +struct swap_cache_info_struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; + unsigned long remove_race; +}; + +struct kstat_lat_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; +}; +struct kstat_lat_pcpu_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; + seqcount_t lock; +} ____cacheline_aligned_in_smp; + +struct kstat_lat_struct { + struct kstat_lat_snap_struct cur, last; + cycles_t avg[3]; +}; +struct kstat_lat_pcpu_struct { + struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; + cycles_t max_snap; + struct kstat_lat_snap_struct last; + cycles_t avg[3]; +}; + +struct kstat_perf_snap_struct { + cycles_t wall_tottime, cpu_tottime; + cycles_t wall_maxdur, cpu_maxdur; + unsigned long count; +}; +struct kstat_perf_struct { + struct kstat_perf_snap_struct cur, last; +}; + +struct kstat_zone_avg { + unsigned long free_pages_avg[3], + nr_active_avg[3], + nr_inactive_avg[3]; +}; + +#define KSTAT_ALLOCSTAT_NR 5 + +struct kernel_stat_glob { + unsigned long nr_unint_avg[3]; + + unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_pcpu_struct sched_lat; + struct kstat_lat_struct swap_in; + + struct kstat_perf_struct ttfp, cache_reap, + refill_inact, shrink_icache, shrink_dcache; + + struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ +} ____cacheline_aligned; + +extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; +extern spinlock_t kstat_glb_lock; + +#ifdef CONFIG_VE +#define KSTAT_PERF_ENTER(name) \ + unsigned long flags; \ + cycles_t start, sleep_time; \ + \ + start = get_cycles(); \ + sleep_time = VE_TASK_INFO(current)->sleep_time; \ + +#define KSTAT_PERF_LEAVE(name) \ + spin_lock_irqsave(&kstat_glb_lock, flags); \ + kstat_glob.name.cur.count++; \ + start = get_cycles() - start; \ + if (kstat_glob.name.cur.wall_maxdur < start) \ + kstat_glob.name.cur.wall_maxdur = start;\ + kstat_glob.name.cur.wall_tottime += start; \ + start -= VE_TASK_INFO(current)->sleep_time - \ + sleep_time; \ + if (kstat_glob.name.cur.cpu_maxdur < start) \ + kstat_glob.name.cur.cpu_maxdur = start; \ + kstat_glob.name.cur.cpu_tottime += start; \ + spin_unlock_irqrestore(&kstat_glb_lock, flags); \ + +#else +#define KSTAT_PERF_ENTER(name) +#define KSTAT_PERF_LEAVE(name) +#endif + +/* + * Add another statistics reading. + * Serialization is the caller's due. + */ +static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, + cycles_t dur) +{ + p->cur.count++; + if (p->cur.maxlat < dur) + p->cur.maxlat = dur; + p->cur.totlat += dur; +} + +static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, + cycles_t dur) +{ + struct kstat_lat_pcpu_snap_struct *cur; + + cur = &p->cur[cpu]; + write_seqcount_begin(&cur->lock); + cur->count++; + if (cur->maxlat < dur) + cur->maxlat = dur; + cur->totlat += dur; + write_seqcount_end(&cur->lock); +} + +/* + * Move current statistics to last, clear last. + * Serialization is the caller's due. 
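/*
 * Sketch of the measuring side that feeds the per-CPU statistics above
 * (assumes preemption is disabled around smp_processor_id() and that
 * kstat_glob.sched_lat is the statistic being sampled).
 */
static void example_measure_latency(void)
{
	cycles_t start, dur;

	start = get_cycles();
	/* ... the operation whose latency is being sampled ... */
	dur = get_cycles() - start;
	KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, smp_processor_id(), dur);
}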
+ */ +static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) +{ + cycles_t m; + memcpy(&p->last, &p->cur, sizeof(p->last)); + p->cur.maxlat = 0; + m = p->last.maxlat; + CALC_LOAD(p->avg[0], EXP_1, m) + CALC_LOAD(p->avg[1], EXP_5, m) + CALC_LOAD(p->avg[2], EXP_15, m) +} + +static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) +{ + unsigned i, cpu; + struct kstat_lat_pcpu_snap_struct snap, *cur; + cycles_t m; + + memset(&p->last, 0, sizeof(p->last)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cur = &p->cur[cpu]; + do { + i = read_seqcount_begin(&cur->lock); + memcpy(&snap, cur, sizeof(snap)); + } while (read_seqcount_retry(&cur->lock, i)); + /* + * read above and this update of maxlat is not atomic, + * but this is OK, since it happens rarely and losing + * a couple of peaks is not essential. xemul + */ + cur->maxlat = 0; + + p->last.count += snap.count; + p->last.totlat += snap.totlat; + if (p->last.maxlat < snap.maxlat) + p->last.maxlat = snap.maxlat; + } + + m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap); + CALC_LOAD(p->avg[0], EXP_1, m); + CALC_LOAD(p->avg[1], EXP_5, m); + CALC_LOAD(p->avg[2], EXP_15, m); + /* reset max_snap to calculate it correctly next time */ + p->max_snap = 0; +} + +#endif /* __VZSTAT_H__ */ diff --git a/include/net/addrconf.h b/include/net/addrconf.h index c216de5..dff9367 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -260,5 +260,9 @@ extern int if6_proc_init(void); extern void if6_proc_exit(void); #endif +int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, + __u32 valid_lft); + #endif #endif diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 7dd29b7..f830fb4 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -9,6 +9,7 @@ extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); +extern void unix_destruct_fds(struct sk_buff *skb); #define UNIX_HASH_SIZE 256 diff --git a/include/net/flow.h b/include/net/flow.h index 228b247..d802436 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -10,6 +10,7 @@ #include #include +struct ve_struct; struct flowi { int oif; int iif; @@ -75,6 +76,9 @@ struct flowi { #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi #define fl_mh_type uli_u.mht.type +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif __u32 secid; /* used by xfrm; see secid.txt */ } __attribute__((__aligned__(BITS_PER_LONG/8))); diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index e081eef..7a554cc 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -15,6 +15,9 @@ struct netns_frags { struct inet_frag_queue { struct hlist_node list; struct netns_frags *net; +#ifdef CONFIG_VE + struct ve_struct *owner_ve; +#endif struct list_head lru_list; /* lru list member */ spinlock_t lock; atomic_t refcnt; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index bb619d8..148c512 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -75,6 +75,7 @@ struct inet_ehash_bucket { * ports are created in O(1) time? I thought so. 
;-) -DaveM */ struct inet_bind_bucket { + struct ve_struct *owner_env; struct net *ib_net; unsigned short port; signed short fastreuse; @@ -198,7 +199,8 @@ extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, + struct ve_struct *env); extern void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 9132490..50fe9af 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -81,6 +81,7 @@ struct inet_timewait_death_row { struct inet_hashinfo *hashinfo; int sysctl_tw_recycle; int sysctl_max_tw_buckets; + int ub_managed; }; extern void inet_twdr_hangman(unsigned long data); @@ -134,6 +135,7 @@ struct inet_timewait_sock { unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; + envid_t tw_owner_env; }; static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c5c0f7..6d549ac 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -156,6 +156,7 @@ struct fib6_table { u32 tb6_id; rwlock_t tb6_lock; struct fib6_node tb6_root; + struct ve_struct *owner_env; }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 113028f..52b65c3 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -115,7 +115,7 @@ extern struct ctl_path net_ipv6_ctl_path[]; struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS##modifier((_idev)->stats.statname, (field)); \ - SNMP_INC_STATS##modifier(statname##_statistics, (field)); \ + SNMP_INC_STATS##modifier(ve_##statname##_statistics, (field)); \ }) #define _DEVADD(statname, modifier, idev, field, val) \ @@ -123,9 +123,22 @@ extern struct ctl_path net_ipv6_ctl_path[]; struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_ADD_STATS##modifier((_idev)->stats.statname, (field), (val)); \ - SNMP_ADD_STATS##modifier(statname##_statistics, (field), (val));\ + SNMP_ADD_STATS##modifier(ve_##statname##_statistics, (field), (val));\ }) +#ifdef CONFIG_VE +#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics) +#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics) +#define ve_icmpv6msg_statistics (get_exec_env()->_icmpv6msg_statistics) + +extern int init_ipv6_mibs(void); +extern void cleanup_ipv6_mibs(void); +#else +#define ve_ipv6_statistics ipv6_statistics +#define ve_icmpv6_statistics icmpv6_statistics +#define ve_icmpv6msg_statistics icmpv6msg_statistics +#endif + /* MIBs */ DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a8eb43c..616b640 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -48,6 +48,13 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + int ifindex; + +#ifdef CONFIG_VE + struct completion *sysfs_completion; + struct ve_struct *owner_ve; +#endif + /* core fib_rules */ struct list_head rules_ops; spinlock_t rules_mod_lock; diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 7573d52..9ced12d 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -16,8 +16,18 @@ extern struct nf_conntrack_l4proto 
nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; +#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) extern int nf_conntrack_ipv4_compat_init(void); extern void nf_conntrack_ipv4_compat_fini(void); +#else +static inline int nf_conntrack_ipv4_compat_init(void) +{ + return 0; +} +static inline void nf_conntrack_ipv4_compat_fini(void) +{ +} +#endif extern void need_ipv4_conntrack(void); diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 0741ad5..bf79a6f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -28,6 +28,10 @@ #include +#ifdef CONFIG_VE_IPTABLES +#include +#endif + /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ @@ -125,6 +129,10 @@ struct nf_conn struct nf_ct_ext *ext; struct rcu_head rcu; + +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ct_owner_env; +#endif }; static inline struct nf_conn * @@ -188,6 +196,11 @@ extern void nf_conntrack_hash_insert(struct nf_conn *ct); extern void nf_conntrack_flush(void); +struct nf_conntrack_helper * nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple); +void nf_ct_helper_put(struct nf_conntrack_helper *helper); + +struct nf_conntrack_helper * __nf_conntrack_helper_find_byname(const char *name); + extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct nf_conntrack_tuple *tuple); @@ -253,6 +266,7 @@ extern void nf_conntrack_free(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, + struct user_beancounter *, gfp_t gfp); /* It's confirmed if it is, or has been in the hash table. 
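The nf_conntrack_ipv4.h change above is the usual configured-out idiom: when the /proc compat interface is not built, the init/fini entry points collapse to empty static-inline stubs, so callers keep calling them with no #ifdef of their own. A tiny self-contained illustration, using a made-up CONFIG_FEATURE_PROC symbol (compile without defining it to exercise the stub path):

#include <stdio.h>

#ifdef CONFIG_FEATURE_PROC
int feature_proc_init(void);		/* real implementation lives elsewhere */
void feature_proc_fini(void);
#else
static inline int feature_proc_init(void) { return 0; }	/* compiled away */
static inline void feature_proc_fini(void) { }
#endif

int main(void)
{
	if (feature_proc_init())	/* no #ifdef needed at the call site */
		return 1;
	printf("initialised\n");
	feature_proc_fini();
	return 0;
}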
*/ @@ -276,6 +290,8 @@ extern unsigned int nf_conntrack_htable_size; extern int nf_conntrack_checksum; extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; +extern int nf_conntrack_disable_ve0; +extern int ip_conntrack_disable_ve0; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index a817712..469fdc3 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -52,6 +52,45 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple); extern int __nf_conntrack_confirm(struct sk_buff *skb); +#if defined(CONFIG_VE_IPTABLES) +#include +#define ve_nf_conntrack_hash (get_exec_env()->_nf_conntrack->_nf_conntrack_hash) +#define ve_nf_conntrack_vmalloc (get_exec_env()->_nf_conntrack->_nf_conntrack_vmalloc) +#define ve_unconfirmed (get_exec_env()->_nf_conntrack->_unconfirmed) +#else +#define ve_nf_conntrack_hash nf_conntrack_hash +#define ve_nf_conntrack_vmalloc nf_conntrack_vmalloc +#define ve_unconfirmed unconfirmed +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +#define ve_nf_ct_sysctl_header \ + (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_header) +#define ve_nf_ct_netfilter_header \ + (get_exec_env()->_nf_conntrack->_nf_ct_netfilter_header) +#define ve_nf_ct_sysctl_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_table) +#define ve_nf_ct_netfilter_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_netfilter_table) +#define ve_nf_ct_net_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_net_table) +extern void nf_ct_proto_generic_sysctl_cleanup(void); +extern int nf_ct_proto_generic_sysctl_init(void); +#else +#define ve_nf_ct_sysctl_header nf_ct_sysctl_header +#define ve_nf_ct_netfilter_header nf_ct_netfilter_header +#define ve_nf_ct_sysctl_table nf_ct_sysctl_table +#define ve_nf_ct_netfilter_table nf_ct_netfilter_table +#define ve_nf_ct_net_table nf_ct_net_table +static inline int nf_ct_proto_generic_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_generic_sysctl_cleanup(void) +{ +} +#endif /* CONFIG_VE_IPTABLES */ + /* Confirm a connection: returns NF_DROP if packet must be dropped. 
*/ static inline int nf_conntrack_confirm(struct sk_buff *skb) { @@ -71,7 +110,9 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *proto); +#ifndef CONFIG_VE_IPTABLES extern struct hlist_head *nf_conntrack_hash; +#endif extern spinlock_t nf_conntrack_lock ; extern struct hlist_head unconfirmed; diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index f0b9078..4bcf1bd 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -34,6 +34,9 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct = (struct nf_conn *)skb->nfct; struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(nf_conntrack_ecache); if (ct != ecache->ct) @@ -45,7 +48,7 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, static inline void nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && ve_is_super(get_exec_env())) atomic_notifier_call_chain(&nf_conntrack_chain, event, ct); } @@ -57,7 +60,8 @@ static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp) { - atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); + if (ve_is_super(get_exec_env())) + atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); } #else /* CONFIG_NF_CONNTRACK_EVENTS */ diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index dfdf4b4..4175cdf 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -6,9 +6,17 @@ #define _NF_CONNTRACK_EXPECT_H #include -extern struct hlist_head *nf_ct_expect_hash; extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_nf_ct_expect_hash (get_exec_env()->_nf_conntrack->_nf_ct_expect_hash) +#define ve_nf_ct_expect_max (get_exec_env()->_nf_conntrack->_nf_ct_expect_max) +#else +extern struct hlist_head *nf_ct_expect_hash; +#define ve_nf_ct_expect_hash nf_ct_expect_hash +#define ve_nf_ct_expect_max nf_ct_expect_max +#endif struct nf_conntrack_expect { @@ -73,6 +81,8 @@ void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple); +void nf_ct_expect_insert(struct nf_conntrack_expect *exp); + struct nf_conntrack_expect * nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple); diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 0378676..ac81973 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -42,6 +42,9 @@ struct nf_conntrack_l3proto int (*print_tuple)(struct seq_file *s, const struct nf_conntrack_tuple *); + /* Called when a conntrack entry is destroyed */ + void (*destroy)(struct nf_conn *conntrack); + /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) 
in skb @@ -67,6 +70,31 @@ struct nf_conntrack_l3proto struct module *me; }; +/* virtualization of l3 protocol's sysctl tables: */ +#if defined(CONFIG_VE_IPTABLES) +#include +#define ve_nf_ct3 (get_exec_env()->_nf_conntrack) +#endif + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +#define ve_nf_ct_l3protos ve_nf_ct3->_nf_ct_l3protos +#define ve_nf_conntrack_l3proto_ipv4 (ve_nf_ct3->_nf_conntrack_l3proto_ipv4) +#define ve_nf_conntrack_max (ve_nf_ct3->_nf_conntrack_max) +#define ve_nf_conntrack_count (ve_nf_ct3->_nf_conntrack_count) +#define ve_nf_conntrack_checksum (ve_nf_ct3->_nf_conntrack_checksum) +#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ +#define ve_nf_ct_l3protos nf_ct_l3protos +#define ve_nf_conntrack_l3proto_ipv4 &nf_conntrack_l3proto_ipv4 +#define ve_nf_conntrack_max nf_conntrack_max +#define ve_nf_conntrack_count nf_conntrack_count +#define ve_nf_conntrack_checksum nf_conntrack_checksum +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + +extern int init_nf_ct_l3proto_ipv4(void); +extern void fini_nf_ct_l3proto_ipv4(void); +extern int init_nf_ct_l3proto_ipv6(void); +extern void fini_nf_ct_l3proto_ipv6(void); + extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; /* Protocol registration. */ @@ -83,7 +111,11 @@ __nf_ct_l3proto_find(u_int16_t l3proto) { if (unlikely(l3proto >= AF_MAX)) return &nf_conntrack_l3proto_generic; - return rcu_dereference(nf_ct_l3protos[l3proto]); +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_nf_conntrack) + return &nf_conntrack_l3proto_generic; +#endif + return rcu_dereference(ve_nf_ct_l3protos[l3proto]); } #endif /*_NF_CONNTRACK_L3PROTO_H*/ diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 723df9d..e5be345 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -97,6 +97,7 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO 256 +extern struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX]; extern struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); @@ -117,16 +118,146 @@ extern int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); extern const struct nla_policy nf_ct_port_nla_policy[]; +#ifdef CONFIG_SYSCTL /* Log invalid packets */ extern unsigned int nf_ct_log_invalid; +#endif + +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_nf_ct4 (get_exec_env()->_nf_conntrack) +#endif + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) + +#define ve_nf_ct_protos (ve_nf_ct4->_nf_ct_protos) +#define ve_nf_conntrack_l4proto_icmp (ve_nf_ct4->_nf_conntrack_l4proto_icmp) +#define ve_nf_conntrack_l4proto_icmpv6 \ + (ve_nf_ct4->_nf_conntrack_l4proto_icmpv6) +#define ve_nf_conntrack_l4proto_tcp4 (ve_nf_ct4->_nf_conntrack_l4proto_tcp4) +#define ve_nf_conntrack_l4proto_tcp6 (ve_nf_ct4->_nf_conntrack_l4proto_tcp6) +#define ve_nf_conntrack_l4proto_udp4 (ve_nf_ct4->_nf_conntrack_l4proto_udp4) +#define ve_nf_conntrack_l4proto_udp6 (ve_nf_ct4->_nf_conntrack_l4proto_udp6) +#define ve_nf_conntrack_l4proto_generic \ + (ve_nf_ct4->_nf_conntrack_l4proto_generic) +#define ve_nf_ct_log_invalid (ve_nf_ct4->_nf_ct_log_invalid) +/* TCP: */ +#define ve_nf_ct_tcp_timeouts (ve_nf_ct4->_nf_ct_tcp_timeouts) +#define ve_nf_ct_tcp_timeout_max_retrans \ + (ve_nf_ct4->_nf_ct_tcp_timeout_max_retrans) +#define ve_nf_ct_tcp_timeout_unacknowledged \ + 
(ve_nf_ct4->_nf_ct_tcp_timeout_unacknowledged) +#define ve_nf_ct_tcp_max_retrans (ve_nf_ct4->_nf_ct_tcp_max_retrans) +#define ve_nf_ct_tcp_loose (ve_nf_ct4->_nf_ct_tcp_loose) +#define ve_nf_ct_tcp_be_liberal (ve_nf_ct4->_nf_ct_tcp_be_liberal) +#define ve_tcp_sysctl_table_users (ve_nf_ct4->_tcp_sysctl_table_users) +#define ve_tcp_sysctl_header (ve_nf_ct4->_tcp_sysctl_header) +#define ve_tcp_compat_sysctl_header (ve_nf_ct4->_tcp_compat_sysctl_header) +/* UDP: */ +#define ve_nf_ct_udp_timeout (ve_nf_ct4->_nf_ct_udp_timeout) +#define ve_nf_ct_udp_timeout_stream (ve_nf_ct4->_nf_ct_udp_timeout_stream) +#define ve_udp_sysctl_table_users (ve_nf_ct4->_udp_sysctl_table_users) +#define ve_udp_sysctl_header (ve_nf_ct4->_udp_sysctl_header) +#define ve_udp_compat_sysctl_header (ve_nf_ct4->_udp_compat_sysctl_header) +/* ICMP: */ +#define ve_nf_ct_icmp_timeout (ve_nf_ct4->_nf_ct_icmp_timeout) +#define ve_icmp_sysctl_header (ve_nf_ct4->_icmp_sysctl_header) +#define ve_icmp_compat_sysctl_header (ve_nf_ct4->_icmp_compat_sysctl_header) +/* ICMPV6: */ +#define ve_nf_ct_icmpv6_timeout (ve_nf_ct4->_nf_ct_icmpv6_timeout) +#define ve_icmpv6_sysctl_header (ve_nf_ct4->_icmpv6_sysctl_header) +/* GENERIC: */ +#define ve_nf_ct_generic_timeout (ve_nf_ct4->_nf_ct_generic_timeout) +#define ve_generic_sysctl_header (ve_nf_ct4->_generic_sysctl_header) +#define ve_generic_compat_sysctl_header (ve_nf_ct4->_generic_compat_sysctl_header) + +extern void nf_ct_proto_icmp_sysctl_cleanup(void); +extern int nf_ct_proto_icmp_sysctl_init(void); +extern void nf_ct_proto_icmpv6_sysctl_cleanup(void); +extern int nf_ct_proto_icmpv6_sysctl_init(void); +extern void nf_ct_proto_tcp_sysctl_cleanup(void); +extern int nf_ct_proto_tcp_sysctl_init(void); +extern void nf_ct_proto_udp_sysctl_cleanup(void); +extern int nf_ct_proto_udp_sysctl_init(void); + +#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ + +#define ve_nf_ct_protos nf_ct_protos +#define ve_nf_conntrack_l4proto_icmp &nf_conntrack_l4proto_icmp +#define ve_nf_conntrack_l4proto_icmpv6 &nf_conntrack_l4proto_icmpv6 +#define ve_nf_conntrack_l4proto_tcp4 &nf_conntrack_l4proto_tcp4 +#define ve_nf_conntrack_l4proto_tcp6 &nf_conntrack_l4proto_tcp6 +#define ve_nf_conntrack_l4proto_udp4 &nf_conntrack_l4proto_udp4 +#define ve_nf_conntrack_l4proto_udp6 &nf_conntrack_l4proto_udp6 +#define ve_nf_conntrack_l4proto_generic &nf_conntrack_l4proto_generic + +#if defined(CONFIG_SYSCTL) + +#define ve_nf_ct_log_invalid nf_ct_log_invalid +/* TCP: */ +#define ve_nf_ct_tcp_timeouts *tcp_timeouts +#define ve_nf_ct_tcp_timeout_max_retrans \ + nf_ct_tcp_timeout_max_retrans +#define ve_nf_ct_tcp_timeout_unacknowledged \ + nf_ct_tcp_timeout_unacknowledged +#define ve_nf_ct_tcp_max_retrans nf_ct_tcp_max_retrans +#define ve_nf_ct_tcp_loose nf_ct_tcp_loose +#define ve_nf_ct_tcp_be_liberal nf_ct_tcp_be_liberal +#define ve_tcp_sysctl_table_users tcp_sysctl_table_users +#define ve_tcp_sysctl_header tcp_sysctl_header +/* UDP:*/ +#define ve_nf_ct_udp_timeout nf_ct_udp_timeout +#define ve_nf_ct_udp_timeout_stream nf_ct_udp_timeout_stream +#define ve_udp_sysctl_table_users udp_sysctl_table_users +#define ve_udp_sysctl_header udp_sysctl_header +/* ICMP: */ +#define ve_nf_ct_icmp_timeout nf_ct_icmp_timeout +#define ve_icmp_sysctl_header icmp_sysctl_header +/* ICMPV6: */ +#define ve_nf_ct_icmpv6_timeout nf_ct_icmpv6_timeout +#define ve_icmpv6_sysctl_header icmpv6_sysctl_header +/* GENERIC: */ +#define ve_nf_ct_generic_timeout nf_ct_generic_timeout +#define ve_generic_sysctl_header generic_sysctl_header +#endif /* CONFIG_SYSCTL 
*/ + +static inline int nf_ct_proto_icmp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_icmp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_tcp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_tcp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_udp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_udp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_icmpv6_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_icmpv6_sysctl_cleanup(void) +{ +} +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ - (nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) + (ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) #else #define LOG_INVALID(proto) \ - ((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \ + ((ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) \ && net_ratelimit()) #endif #else diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 9dc1039..bfa9069 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -77,6 +77,8 @@ struct nf_conn_nat #endif }; +void nf_nat_hash_conntrack(struct nf_conn *ct); + /* Set up the info structure to map into this range. */ extern unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -85,6 +87,7 @@ extern unsigned int nf_nat_setup_info(struct nf_conn *ct, /* Is this tuple already taken? (not by us)*/ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); +extern void ip_nat_hash_conntrack(struct nf_conn *ct); static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { diff --git a/include/net/netfilter/nf_nat_rule.h b/include/net/netfilter/nf_nat_rule.h index e4a18ae..8bb00da 100644 --- a/include/net/netfilter/nf_nat_rule.h +++ b/include/net/netfilter/nf_nat_rule.h @@ -4,7 +4,7 @@ #include #include -extern int nf_nat_rule_init(void) __init; +extern int nf_nat_rule_init(void); extern void nf_nat_rule_cleanup(void); extern int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, diff --git a/include/net/netlink_sock.h b/include/net/netlink_sock.h new file mode 100644 index 0000000..ce4701a --- /dev/null +++ b/include/net/netlink_sock.h @@ -0,0 +1,23 @@ +#ifndef __NET_NETLINK_SOCK_H +#define __NET_NETLINK_SOCK_H + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + struct module *module; +}; + +#endif /* __NET_NETLINK_SOCK_H */ diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2932721..a3e3007 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -13,6 +13,7 @@ struct netns_sysctl_ipv6 { #ifdef CONFIG_SYSCTL struct ctl_table_header *table; struct ctl_table_header *frags_hdr; + struct ctl_table_header *nf_frags_hdr; #endif int bindv6only; int flush_delay; @@ -31,6 +32,11 @@ struct netns_ipv6 { struct ipv6_devconf *devconf_all; struct ipv6_devconf *devconf_dflt; struct netns_frags frags; + struct netns_frags ct_frags; + +#ifdef CONFIG_SYSCTL + struct nf_conntrack_l3proto 
*nf_conntrack_l3proto_ipv6; +#endif #ifdef CONFIG_NETFILTER struct xt_table *ip6table_filter; struct xt_table *ip6table_mangle; @@ -55,5 +61,7 @@ struct netns_ipv6 { struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; + + struct proc_dir_entry *proc_dev_snmp; }; #endif diff --git a/include/net/route.h b/include/net/route.h index 4f0d8c1..0836235 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -138,6 +138,7 @@ static inline void ip_rt_put(struct rtable * rt) #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern const __u8 ip_tos2prio[16]; +extern int ip_rt_src_check; static inline char rt_tos2priority(u8 tos) { diff --git a/include/net/sock.h b/include/net/sock.h index 06c5259..7fc48ef 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -57,6 +57,8 @@ #include #include +#include + /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of @@ -279,6 +281,8 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_beancounter sk_bc; + struct ve_struct *owner_env; }; /* @@ -495,6 +499,8 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) }) extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount); extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); extern void sk_stream_wait_close(struct sock *sk, long timeo_p); extern int sk_stream_error(struct sock *sk, int flags, int err); @@ -729,7 +735,8 @@ static inline int sk_has_account(struct sock *sk) return !!sk->sk_prot->memory_allocated; } -static inline int sk_wmem_schedule(struct sock *sk, int size) +static inline int sk_wmem_schedule(struct sock *sk, int size, + struct sk_buff *skb) { if (!sk_has_account(sk)) return 1; @@ -737,12 +744,15 @@ static inline int sk_wmem_schedule(struct sock *sk, int size) __sk_mem_schedule(sk, size, SK_MEM_SEND); } -static inline int sk_rmem_schedule(struct sock *sk, int size) +static inline int sk_rmem_schedule(struct sock *sk, struct sk_buff *skb) { if (!sk_has_account(sk)) return 1; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV); + if (!(skb->truesize <= sk->sk_forward_alloc || + __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))) + return 0; + + return !ub_sockrcvbuf_charge(sk, skb); } static inline void sk_mem_reclaim(struct sock *sk) @@ -862,6 +872,11 @@ extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); +extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, + unsigned long size, + unsigned long size2, + int noblock, + int *errcode); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); @@ -1124,6 +1139,7 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from, static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); sock_hold(sk); skb->sk = sk; skb->destructor = sock_wfree; @@ -1132,6 +1148,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); @@ -1322,6 +1339,13 @@ static inline void sk_change_net(struct sock *sk, struct net *net) sock_net_set(sk, hold_net(net)); } +static 
inline void sk_change_net_get(struct sock *sk, struct net *net) +{ + struct net *old_net = sock_net(sk); + sock_net_set(sk, get_net(net)); + put_net(old_net); +} + extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); extern int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386..014798b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -43,6 +43,13 @@ #include #include +#include + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +#define TW_WSCALE_MASK 0x0f +#define TW_WSCALE_SPEC 0x10 extern struct inet_hashinfo tcp_hashinfo; @@ -221,7 +228,9 @@ extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; +#ifndef sysctl_tcp_adv_win_scale extern int sysctl_tcp_adv_win_scale; +#endif extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_frto_response; @@ -236,6 +245,10 @@ extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; +extern int sysctl_tcp_use_sg; +extern int sysctl_tcp_max_tw_kmem_fraction; +extern int sysctl_tcp_max_tw_buckets_ub; + extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -546,7 +559,11 @@ extern u32 __tcp_select_window(struct sock *sk); * to use only the low 32-bits of jiffies and hide the ugly * casts with the following macro. */ +#ifdef CONFIG_VE +#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) +#else #define tcp_time_stamp ((__u32)(jiffies)) +#endif /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission diff --git a/include/net/udp.h b/include/net/udp.h index addcdc6..a48c56a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -153,6 +153,18 @@ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); /* UDP-Lite does not have a standardized MIB yet, so we inherit from UDP */ DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6); +#ifdef CONFIG_VE +#define ve_udp_statistics (get_exec_env()->_udp_statistics) +#define ve_udplite_statistics (get_exec_env()->_udplite_statistics) +#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6) +#define ve_udplite_stats_in6 (get_exec_env()->_udplite_stats_in6) +#else +#define ve_udp_statistics udp_statistics +#define ve_udplite_statistics udplite_statistics +#define ve_udp_stats_in6 udp_stats_in6 +#define ve_udplite_stats_in6 udplite_stats_in6 +#endif + /* * SNMP statistics for UDP and UDP-Lite */ diff --git a/init/Kconfig b/init/Kconfig index c11da38..b054f1e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -208,7 +208,7 @@ config TASK_XACCT config TASK_IO_ACCOUNTING bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" - depends on TASK_XACCT + depends on TASK_XACCT && BEANCOUNTERS help Collect information on the number of bytes of storage I/O which this task has caused. 
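Most of the per-VE changes above share one indirection pattern: under CONFIG_VE a formerly global name (ve_udp_statistics, ve_nf_conntrack_hash, tcp_time_stamp with its jiffies_fixup, ...) expands to a field of get_exec_env(), and without CONFIG_VE it falls back to the original global, so call sites compile unchanged either way. A compilable sketch of the idea, with an invented _udp_packets field and a trivial stand-in for get_exec_env():

#include <stdio.h>

struct ve_struct { unsigned long _udp_packets; };

static struct ve_struct ve0;			/* host environment */
static struct ve_struct *exec_env = &ve0;	/* per-task in the real kernel */

#define get_exec_env()	(exec_env)

#ifdef CONFIG_VE
#define ve_udp_packets	(get_exec_env()->_udp_packets)
#else
static unsigned long udp_packets;		/* the original global */
#define ve_udp_packets	udp_packets
#endif

int main(void)
{
	struct ve_struct guest = { 0 };

	ve_udp_packets++;		/* accounted to the host (ve0) */
	exec_env = &guest;		/* "enter" a container */
	ve_udp_packets++;		/* accounted to the guest */
	exec_env = &ve0;

	printf("host=%lu guest=%lu\n", ve0._udp_packets, guest._udp_packets);
	return 0;
}

Built with -DCONFIG_VE the increments land in whichever environment is current; built without it they all go to the single global and the per-environment fields stay zero.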
@@ -292,7 +292,7 @@ config CGROUP_DEBUG config CGROUP_NS bool "Namespace cgroup subsystem" - depends on CGROUPS + depends on CGROUPS && !VE help Provides a simple namespace cgroup subsystem to provide hierarchical naming of sets of namespaces, @@ -308,7 +308,7 @@ config CGROUP_DEVICE config CPUSETS bool "Cpuset support" - depends on SMP && CGROUPS + depends on SMP && CGROUPS && !VE help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -352,17 +352,18 @@ config RT_GROUP_SCHED choice depends on GROUP_SCHED prompt "Basis for grouping tasks" - default USER_SCHED + default VZ_FAIRSCHED config USER_SCHED bool "user id" + depends on !VE help This option will choose userid as the basis for grouping tasks, thus providing equal CPU bandwidth to each user. config CGROUP_SCHED bool "Control groups" - depends on CGROUPS + depends on CGROUPS && !VE help This option allows you to create arbitrary task groups using the "cgroup" pseudo filesystem and control @@ -370,6 +371,12 @@ config CGROUP_SCHED Refer to Documentation/cgroups.txt for more information on "cgroup" pseudo filesystem. +config VZ_FAIRSCHED + bool "OpenVZ groups" + help + This option add customizable task groups with OpenVZ compatible + syscall and procfs interface. + endchoice config CGROUP_CPUACCT diff --git a/init/calibrate.c b/init/calibrate.c index a379c90..0cacb27 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long lpj_fine; unsigned long preset_lpj; @@ -108,6 +109,60 @@ static unsigned long __cpuinit calibrate_delay_direct(void) static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} #endif +unsigned long cycles_per_jiffy, cycles_per_clock; + +static __devinit void calibrate_cycles(void) +{ + unsigned long ticks; + cycles_t time; + + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + time = get_cycles(); + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + + time = get_cycles() - time; + cycles_per_jiffy = time; + if ((time >> 32) != 0) { + printk("CPU too fast! timings are incorrect\n"); + cycles_per_jiffy = -1; + } +} + +EXPORT_SYMBOL(cycles_per_jiffy); +EXPORT_SYMBOL(cycles_per_clock); + +static __devinit void calc_cycles_per_jiffy(void) +{ +#if 0 + extern unsigned long fast_gettimeoffset_quotient; + unsigned long low, high; + + if (fast_gettimeoffset_quotient != 0) { + __asm__("divl %2" + :"=a" (low), "=d" (high) + :"r" (fast_gettimeoffset_quotient), + "0" (0), "1" (1000000/HZ)); + + cycles_per_jiffy = low; + } +#endif + if (cycles_per_jiffy == 0) + calibrate_cycles(); + + if (cycles_per_jiffy == 0) { + printk(KERN_WARNING "Cycles are stuck! " + "Some statistics will not be available."); + /* to prevent division by zero in cycles_to_(clocks|jiffies) */ + cycles_per_jiffy = 1; + cycles_per_clock = 1; + } else + cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); +} + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little @@ -173,4 +228,5 @@ void __cpuinit calibrate_delay(void) printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); + calc_cycles_per_jiffy(); } diff --git a/init/main.c b/init/main.c index 3820323..ad14217 100644 --- a/init/main.c +++ b/init/main.c @@ -60,6 +60,9 @@ #include #include #include +#include + +#include #include #include @@ -105,6 +108,16 @@ extern void tc_init(void); enum system_states system_state; EXPORT_SYMBOL(system_state); +#ifdef CONFIG_VE +extern void init_ve_system(void); +extern void init_ve0(void); +extern void prepare_ve0_process(struct task_struct *tsk); +#else +#define init_ve_system() do { } while (0) +#define init_ve0() do { } while (0) +#define prepare_ve0_process(tsk) do { } while (0) +#endif + /* * Boot command-line arguments */ @@ -543,6 +556,9 @@ asmlinkage void __init start_kernel(void) smp_setup_processor_id(); + prepare_ve0_process(&init_task); + init_ve0(); + /* * Need to run as early as possible, to initialize the * lockdep hash: @@ -561,6 +577,7 @@ asmlinkage void __init start_kernel(void) * enable them */ lock_kernel(); + ub_init_early(); tick_init(); boot_cpu_init(); page_address_init(); @@ -666,6 +683,7 @@ asmlinkage void __init start_kernel(void) thread_info_cache_init(); fork_init(num_physpages); proc_caches_init(); + ub_init_late(); buffer_init(); unnamed_dev_init(); key_init(); @@ -687,6 +705,10 @@ asmlinkage void __init start_kernel(void) acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_BC_RSS_ACCOUNTING + ub_init_pbc(); +#endif + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -766,6 +788,8 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + init_ve_system(); + rcu_init_sched(); /* needed by module_init stage. */ /* drivers will send hotplug events */ init_workqueues(); @@ -857,6 +881,7 @@ static int __init kernel_init(void * unused) do_pre_smp_initcalls(); smp_init(); + fairsched_init_late(); sched_init_smp(); cpuset_init_smp(); diff --git a/init/version.c b/init/version.c index 52a8b98..ccc6262 100644 --- a/init/version.c +++ b/init/version.c @@ -36,6 +36,12 @@ struct uts_namespace init_uts_ns = { }; EXPORT_SYMBOL_GPL(init_uts_ns); +struct new_utsname virt_utsname = { + /* we need only this field */ + .release = UTS_RELEASE, +}; +EXPORT_SYMBOL(virt_utsname); + /* FIXED STRINGS! Don't touch! 
*/ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 69bc859..521f6f6 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -271,19 +271,14 @@ static struct ctl_table ipc_kern_table[] = { {} }; -static struct ctl_table ipc_root_table[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = ipc_kern_table, - }, +static struct ctl_path ipc_path[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", }, {} }; static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1); return 0; } diff --git a/ipc/msg.c b/ipc/msg.c index b4eee1c..4fb6c0f 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -183,6 +183,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) int id, retval; key_t key = params->key; int msgflg = params->flg; + int msqid = params->id; msq = ipc_rcu_alloc(sizeof(*msq)); if (!msq) @@ -201,7 +202,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) /* * ipc_addid() locks msq */ - id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid); if (id < 0) { security_msg_queue_free(msq); ipc_rcu_putref(msq); @@ -323,6 +324,7 @@ asmlinkage long sys_msgget(key_t key, int msgflg) msg_params.key = key; msg_params.flg = msgflg; + msg_params.id = -1; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } @@ -942,3 +944,55 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) msq->q_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_msg(key_t key, int msqid, int msgflg) +{ + struct ipc_namespace *ns; + struct ipc_ops msg_ops; + struct ipc_params msg_params; + + ns = current->nsproxy->ipc_ns; + + msg_ops.getnew = newque; + msg_ops.associate = msg_security; + msg_ops.more_checks = NULL; + + msg_params.key = key; + msg_params.flg = msgflg | IPC_CREAT; + msg_params.id = msqid; + + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); +} +EXPORT_SYMBOL_GPL(sysvipc_setup_msg); + +int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) +{ + int err = 0; + struct msg_queue * msq; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&msg_ids(ns).rw_mutex); + in_use = msg_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + msq = idr_find(&msg_ids(ns).ipcs_idr, next_id); + if (msq == NULL) + continue; + ipc_lock_by_ptr(&msq->q_perm); + err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg); + msg_unlock(msq); + if (err) + break; + total++; + } + up_write(&msg_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_msg); +#endif diff --git a/ipc/msgutil.c b/ipc/msgutil.c index c82c215..d058294 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -8,6 +8,7 @@ * See the file COPYING for more details. 
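sysvipc_walk_msg() above (and the analogous sem/shm walkers later in the patch) iterates the namespace's IPC idr under rw_mutex, locks each live object with ipc_lock_by_ptr(), and hands it to a caller-supplied callback until that callback returns non-zero; this is the enumeration hook the checkpointing modules rely on. A reduced userspace model of that walk, with a plain array standing in for the idr, the locking elided into comments, and all names invented:

#include <stdio.h>

#define MAX_ID 8

struct msg_queue {
	int in_use;	/* stands in for idr_find() returning non-NULL */
	long bytes;
};

static struct msg_queue queues[MAX_ID];

/*
 * Walk every live queue and hand it to the callback; a non-zero return stops
 * the walk early.  The real walker also takes msg_ids(ns).rw_mutex around the
 * loop and ipc_lock_by_ptr()/msg_unlock() around each callback invocation.
 */
static int walk_msg(int (*func)(int id, struct msg_queue *mq, void *arg), void *arg)
{
	int err = 0;

	for (int id = 0; id < MAX_ID && !err; id++) {
		if (!queues[id].in_use)
			continue;
		err = func(id, &queues[id], arg);
	}
	return err;
}

static int sum_bytes(int id, struct msg_queue *mq, void *arg)
{
	(void)id;
	*(long *)arg += mq->bytes;
	return 0;
}

int main(void)
{
	long total = 0;

	queues[2] = (struct msg_queue){ 1, 128 };
	queues[5] = (struct msg_queue){ 1, 64 };

	walk_msg(sum_bytes, &total);
	printf("total=%ld bytes in live queues\n", total);
	return 0;
}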
*/ +#include #include #include #include @@ -17,6 +18,8 @@ #include "util.h" +#include + struct msg_msgseg { struct msg_msgseg* next; /* the next part of the message follows immediately */ @@ -25,52 +28,53 @@ struct msg_msgseg { #define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) -struct msg_msg *load_msg(const void __user *src, int len) +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int len, void * data) { struct msg_msg *msg; struct msg_msgseg **pseg; int err; int alen; + int offset = 0; alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC); if (msg == NULL) return ERR_PTR(-ENOMEM); msg->next = NULL; msg->security = NULL; - if (copy_from_user(msg + 1, src, alen)) { + if (load(msg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } len -= alen; - src = ((char __user *)src) + alen; + offset += alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - seg = kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC); if (seg == NULL) { err = -ENOMEM; goto out_err; } *pseg = seg; seg->next = NULL; - if (copy_from_user(seg + 1, src, alen)) { + if (load(seg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } pseg = &seg->next; len -= alen; - src = ((char __user *)src) + alen; + offset += alen; } err = security_msg_msg_alloc(msg); @@ -83,33 +87,58 @@ out_err: free_msg(msg); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(sysv_msg_load); -int store_msg(void __user *dest, struct msg_msg *msg, int len) +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + return copy_from_user(dst, data + offset, len); +} + +struct msg_msg *load_msg(const void __user *src, int len) +{ + return sysv_msg_load(do_load_msg, len, (void*)src); +} + +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data) { int alen; + int offset = 0; struct msg_msgseg *seg; - + alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - if (copy_to_user(dest, msg + 1, alen)) + if (store(msg + 1, alen, offset, data)) return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = msg->next; while (len > 0) { alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - if (copy_to_user(dest, seg + 1, alen)) + if (store(seg + 1, alen, offset, data)) return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = seg->next; } return 0; } +EXPORT_SYMBOL_GPL(sysv_msg_store); + +static int do_store_msg(void * src, int len, int offset, void * data) +{ + return copy_to_user(data + offset, src, len); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + return sysv_msg_store(msg, do_store_msg, len, dest); +} void free_msg(struct msg_msg *msg) { diff --git a/ipc/sem.c b/ipc/sem.c index bf1bc36..d44231c 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -87,6 +87,8 @@ #include #include "util.h" +#include + #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) @@ -240,6 +242,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; + int semid = params->id; if (!nsems) return -EINVAL; @@ -263,7 +266,7 @@ static int newary(struct ipc_namespace *ns, struct 
ipc_params *params) return retval; } - id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid); if (id < 0) { security_sem_free(sma); ipc_rcu_putref(sma); @@ -326,6 +329,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg) sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; + sem_params.id = -1; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } @@ -941,7 +945,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1006,7 +1010,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) sem_getref_and_unlock(sma); /* step 2: allocate new undo structure */ - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, + GFP_KERNEL_UBC); if (!new) { sem_putref(sma); return ERR_PTR(-ENOMEM); @@ -1068,7 +1073,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, if (nsops > ns->sc_semopm) return -E2BIG; if(nsops > SEMOPM_FAST) { - sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC); if(sops==NULL) return -ENOMEM; } @@ -1371,3 +1376,57 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sma->sem_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) +{ + struct ipc_namespace *ns; + struct ipc_ops sem_ops; + struct ipc_params sem_params; + + ns = current->nsproxy->ipc_ns; + + sem_ops.getnew = newary; + sem_ops.associate = sem_security; + sem_ops.more_checks = sem_more_checks; + + sem_params.key = key; + sem_params.flg = semflg | IPC_CREAT; + sem_params.u.nsems = size; + sem_params.id = semid; + + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); +} +EXPORT_SYMBOL_GPL(sysvipc_setup_sem); + +int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) +{ + int err = 0; + struct sem_array *sma; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&sem_ids(ns).rw_mutex); + in_use = sem_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + sma = idr_find(&sem_ids(ns).ipcs_idr, next_id); + if (sma == NULL) + continue; + ipc_lock_by_ptr(&sma->sem_perm); + err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg); + sem_unlock(sma); + if (err) + break; + total++; + } + up_write(&sem_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_sem); +EXPORT_SYMBOL_GPL(exit_sem); +#endif diff --git a/ipc/shm.c b/ipc/shm.c index e77ec69..e3395af 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,27 +39,17 @@ #include #include #include +#include #include -#include "util.h" - -struct shm_file_data { - int id; - struct ipc_namespace *ns; - struct file *file; - const struct vm_operations_struct *vm_ops; -}; +#include +#include -#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +#include "util.h" -static const struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; -#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) - -#define shm_unlock(shp) \ - ipc_unlock(&(shp)->shm_perm) static int newseg(struct ipc_namespace *, struct 
ipc_params *); static void shm_open(struct vm_area_struct *vma); @@ -111,20 +101,6 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } -/* - * shm_lock_(check_) routines are called in the paths where the rw_mutex - * is not necessarily held. - */ -static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, int id) { @@ -157,6 +133,48 @@ static void shm_open(struct vm_area_struct *vma) shm_unlock(shp); } +static int shmem_lock(struct shmid_kernel *shp, int lock, + struct user_struct *user) +{ + struct file *file = shp->shm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long size; + + size = shp->shm_segsz + PAGE_SIZE - 1; + +#ifdef CONFIG_SHMEM + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (ub_lockedshm_charge(info, size) < 0) + goto out_ch; + + if (!user_shm_lock(inode->i_size, user)) + goto out_user; + info->flags |= VM_LOCKED; + } + if (!lock && (info->flags & VM_LOCKED) && user) { + ub_lockedshm_uncharge(info, size); + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + } + spin_unlock(&info->lock); + return 0; + +out_user: + ub_lockedshm_uncharge(info, size); +out_ch: + spin_unlock(&info->lock); + return -ENOMEM; +#else + if (lock && ub_lockedshm_charge(info, size)) + return -ENOMEM; + if (!lock) + ub_lockedshm_uncharge(info, size); + return 0; +#endif +} + /* * shm_destroy - free the struct shmid_kernel * @@ -172,7 +190,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); else user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, shp->mlock_user); @@ -304,12 +322,13 @@ int is_file_shm_hugepages(struct file *file) return ret; } -static const struct file_operations shm_file_operations = { +const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, }; +EXPORT_SYMBOL_GPL(shm_file_operations); static struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ @@ -334,11 +353,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; + int shmid = params->id; int error; struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; struct file * file; - char name[13]; + char name[64]; int id; if (size < SHMMIN || size > ns->shm_ctlmax) @@ -362,7 +382,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) return error; } - sprintf (name, "SYSV%08x", key); + snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key); if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); @@ -382,7 +402,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (IS_ERR(file)) goto no_file; - id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); + id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid); if 
(id < 0) { error = id; goto no_id; @@ -455,6 +475,7 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; + shm_params.id = -1; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } @@ -764,14 +785,14 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) if(cmd==SHM_LOCK) { struct user_struct * user = current->user; if (!is_file_hugepages(shp->shm_file)) { - err = shmem_lock(shp->shm_file, 1, user); + err = shmem_lock(shp, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } } } else if (!is_file_hugepages(shp->shm_file)) { - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; } @@ -1070,3 +1091,67 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) shp->shm_ctim); } #endif + +#ifdef CONFIG_VE +#include + +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) +{ + struct ipc_namespace *ns; + struct ipc_ops shm_ops; + struct ipc_params shm_params; + struct shmid_kernel *shp; + struct file *file; + int rv; + + ns = current->nsproxy->ipc_ns; + + shm_ops.getnew = newseg; + shm_ops.associate = shm_security; + shm_ops.more_checks = shm_more_checks; + + shm_params.key = key; + shm_params.flg = shmflg | IPC_CREAT; + shm_params.u.size = size; + shm_params.id = shmid; + + rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + if (rv < 0) + return ERR_PTR(rv); + shp = shm_lock(ns, rv); + BUG_ON(IS_ERR(shp)); + file = shp->shm_file; + get_file(file); + shm_unlock(shp); + return file; +} +EXPORT_SYMBOL_GPL(sysvipc_setup_shm); + +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) +{ + int err = 0; + struct shmid_kernel* shp; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&shm_ids(ns).rw_mutex); + in_use = shm_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); + if (shp == NULL) + continue; + ipc_lock_by_ptr(&shp->shm_perm); + err = func(shp, arg); + shm_unlock(shp); + if (err) + break; + total++; + } + up_write(&shm_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_shm); +#endif diff --git a/ipc/util.c b/ipc/util.c index 49b3ea6..59d302e 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -38,6 +38,8 @@ #include +#include + #include "util.h" struct ipc_proc_iface { @@ -247,6 +249,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * @ids: IPC identifier set * @new: new IPC permission set * @size: limit for the number of used ids + * @reqid: if >= 0, get this id exactly. If -1 -- don't care. * * Add an entry 'new' to the IPC ids idr. The permissions object is * initialised and the first free entry is set up and the id assigned @@ -256,10 +259,18 @@ int ipc_get_maxid(struct ipc_ids *ids) * Called with ipc_ids.rw_mutex held as a writer. 
*/ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) { int id, err; + if (reqid >= 0) { + id = reqid % SEQ_MULTIPLIER; + err = idr_get_new_above(&ids->ipcs_idr, new, id, &id); + if (err || id != (reqid % SEQ_MULTIPLIER)) + return -EEXIST; + goto found; + } + if (size > IPCMNI) size = IPCMNI; @@ -270,14 +281,19 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) if (err) return err; +found: ids->in_use++; new->cuid = new->uid = current->euid; new->gid = new->cgid = current->egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; + if (reqid >= 0) { + new->seq = reqid/SEQ_MULTIPLIER; + } else { + new->seq = ids->seq++; + if(ids->seq > ids->seq_max) + ids->seq = 0; + } new->id = ipc_buildid(id, new->seq); spin_lock_init(&new->lock); @@ -445,9 +461,9 @@ void* ipc_alloc(int size) { void* out; if(size > PAGE_SIZE) - out = vmalloc(size); + out = ub_vmalloc(size); else - out = kmalloc(size, GFP_KERNEL); + out = kmalloc(size, GFP_KERNEL_UBC); return out; } @@ -530,14 +546,14 @@ void* ipc_rcu_alloc(int size) * workqueue if necessary (for vmalloc). */ if (rcu_use_vmalloc(size)) { - out = vmalloc(HDRLEN_VMALLOC + size); + out = ub_vmalloc(HDRLEN_VMALLOC + size); if (out) { out += HDRLEN_VMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; } } else { - out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC); if (out) { out += HDRLEN_KMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; @@ -715,6 +731,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) return out; } +EXPORT_SYMBOL_GPL(ipc_lock); struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) { @@ -804,7 +821,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, goto out_unlock; } if (current->euid == ipcp->cuid || - current->euid == ipcp->uid || capable(CAP_SYS_ADMIN)) + current->euid == ipcp->uid || capable(CAP_VE_SYS_ADMIN)) return ipcp; err = -EPERM; diff --git a/ipc/util.h b/ipc/util.h index 3646b45..5b6df8e 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -39,6 +39,7 @@ struct ipc_params { size_t size; /* for shared memories */ int nsems; /* for semaphores */ } u; /* holds the getnew() specific param */ + int id; }; /* @@ -68,14 +69,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) #endif -#define IPC_SEM_IDS 0 -#define IPC_MSG_IDS 1 -#define IPC_SHM_IDS 2 - #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) /* must be called with ids->rw_mutex acquired for writing */ -int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); +int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int); /* must be called with ids->rw_mutex acquired for reading */ int ipc_get_maxid(struct ipc_ids *); @@ -102,7 +99,6 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); -struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); @@ -144,12 +140,6 @@ static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) spin_lock(&perm->lock); } -static inline void ipc_unlock(struct kern_ipc_perm *perm) -{ - spin_unlock(&perm->lock); - 
rcu_read_unlock(); -} - struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz new file mode 100644 index 0000000..8e0a503 --- /dev/null +++ b/kernel/Kconfig.openvz @@ -0,0 +1,91 @@ +# Copyright (C) 2005 SWsoft +# All rights reserved. +# Licensing governed by "linux/COPYING.SWsoft" file. + +menu "OpenVZ" + +config VE + bool "Virtual Environment support" + default y + select NAMESPACES + select PID_NS + select IPC_NS + select UTS_NS + select NET_NS + select USER_NS + select CGROUPS + select CGROUP_DEVICE + select GROUP_SCHED + select FAIR_GROUP_SCHED + help + This option adds support of virtual Linux running on the original box + with fully supported virtual network driver, tty subsystem and + configurable access for hardware and other resources. + +config VE_CALLS + tristate "VE calls interface" + depends on VE + select VZ_DEV + default m + help + This option controls how to build vzmon code containing VE calls. + By default it's build in module vzmon.o + +config VZ_GENCALLS + bool + default y + +config VE_NETDEV + tristate "VE network device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build venet device. This is a + common interface for networking in VE. + +config VE_ETHDEV + tristate "Virtual ethernet device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build virtual ethernet device. + +config VZ_DEV + tristate "VE device" + default m + help + This option adds support of vzdev device, which is used by + user-space applications to control Virtual Environments. + +config VE_IPTABLES + bool "VE netfiltering" + depends on VE && VE_NETDEV && INET && NETFILTER + default y + help + This option controls whether to build VE netfiltering code. + +config VZ_WDOG + tristate "VE watchdog module" + depends on VE_CALLS + default m + help + This option controls building of vzwdog module, which dumps + a lot of useful system info on console periodically. + +config VZ_CHECKPOINT + tristate "Checkpointing & restoring Virtual Environments" + depends on VE_CALLS + select PM + select PM_SLEEP + select TUN + select VE_ETHDEV + select VE_NETDEV + default m + help + This option adds two modules, "cpt" and "rst", which allow + to save a running Virtual Environment and restore it + on another host (live migration) or on the same host (checkpointing). 
+ +endmenu diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df..d8b742c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -28,6 +28,10 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_BEANCOUNTERS) += bc/ +obj-y += ve/ +obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -53,7 +57,11 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o +ifeq ($(CONFIG_VE),n) obj-$(CONFIG_CGROUPS) += cgroup.o +else +obj-$(CONFIG_CGROUPS) += cgroup_lite.o +endif obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -84,6 +92,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ obj-$(CONFIG_TRACING) += trace/ diff --git a/kernel/audit.c b/kernel/audit.c index 4414e93..eb95bca 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -666,6 +666,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) char *ctx = NULL; u32 len; + if (!ve_is_super(skb->owner_env)) + return -ECONNREFUSED; + err = audit_netlink_ok(skb, msg_type); if (err) return err; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b7d354e..278ed36 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -164,8 +164,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) inotify_init_watch(&parent->wdata); /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); + wd = inotify_add_watch_dget(audit_ih, &parent->wdata, + &ndp->path, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); return ERR_PTR(wd); diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig new file mode 100644 index 0000000..2c3de4a --- /dev/null +++ b/kernel/bc/Kconfig @@ -0,0 +1,111 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +menu "User resources" + +config BEANCOUNTERS + bool "Enable user resource accounting" + default y + help + This patch provides accounting and allows to configure + limits for user's consumption of exhaustible system resources. + The most important resource controlled by this patch is unswappable + memory (either mlock'ed or used by internal kernel structures and + buffers). The main goal of this patch is to protect processes + from running short of important resources because of an accidental + misbehavior of processes or malicious activity aiming to ``kill'' + the system. It's worth to mention that resource limits configured + by setrlimit(2) do not give an acceptable level of protection + because they cover only small fraction of resources and work on a + per-process basis. Per-process accounting doesn't prevent malicious + users from spawning a lot of resource-consuming processes. 
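The help text above describes beancounters as per-container accounting of exhaustible resources against configurable limits. A minimal sketch of the charge/uncharge core this implies -- current usage, a high-water mark, and a failure counter bumped when a request would exceed the limit -- is shown below; the field names mirror the familiar UBC values (held, maxheld, barrier, limit, failcnt), but the code is an illustration only, not the kernel/bc implementation, and it omits the spinlock the real charging paths hold.

#include <stdio.h>

struct resource {
	unsigned long held;
	unsigned long maxheld;
	unsigned long barrier;	/* soft limit; its handling is omitted here */
	unsigned long limit;
	unsigned long failcnt;
};

/* Charge 'val' against a resource; refuse and count the failure over limit. */
static int charge(struct resource *r, unsigned long val)
{
	if (r->held + val > r->limit) {
		r->failcnt++;
		return -1;
	}
	r->held += val;
	if (r->held > r->maxheld)
		r->maxheld = r->held;
	return 0;
}

static void uncharge(struct resource *r, unsigned long val)
{
	r->held -= val;
}

int main(void)
{
	struct resource kmemsize = { .barrier = 2048, .limit = 4096 };

	if (charge(&kmemsize, 1024) == 0)
		printf("charged: held=%lu maxheld=%lu\n",
		       kmemsize.held, kmemsize.maxheld);
	if (charge(&kmemsize, 8192) != 0)
		printf("refused: failcnt=%lu\n", kmemsize.failcnt);
	uncharge(&kmemsize, 1024);
	return 0;
}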
+ +config BC_RSS_ACCOUNTING + bool "Account physical memory usage" + default y + depends on BEANCOUNTERS + help + This allows to estimate per beancounter physical memory usage. + Implemented alghorithm accounts shared pages of memory as well, + dividing them by number of beancounter which use the page. + +config BC_IO_ACCOUNTING + bool "Account disk IO" + default y + depends on BC_RSS_ACCOUNTING + help + When on this option allows seeing disk IO activity caused by + tasks from each UB + +config BC_IO_SCHED + bool "UBC I/O priority" + default y + depends on BC_IO_ACCOUNTING && IOSCHED_CFQ + help + This option controls whether to build CFQ I/O scheduler + with support of UBC I/O priority. + +config BC_SWAP_ACCOUNTING + bool "Account swap usage" + default y + depends on BEANCOUNTERS + help + This allows accounting of swap usage. + +config BC_PROC + bool "Report resource usage in /proc" + default y + depends on BEANCOUNTERS + help + Allows a system administrator to inspect resource accounts and limits. + +config BC_DEBUG + bool "User resources debug features" + default n + depends on BEANCOUNTERS + help + Enables to setup debug features for user resource accounting + +config BC_DEBUG_IO + bool "Debug IO accounting" + default y + depends on BC_DEBUG && BC_IO_ACCOUNTING + help + Debugging for IO accointing. + +config BC_DEBUG_KMEM + bool "Debug kmemsize with cache counters" + default n + depends on BC_DEBUG + help + Adds /proc/user_beancounters_debug entry to get statistics + about cache usage of each beancounter + +config BC_KEEP_UNUSED + bool "Keep unused beancounter alive" + default y + depends on BC_DEBUG + help + If on, unused beancounters are kept on the hash and maxheld value + can be looked through. + +config BC_DEBUG_ITEMS + bool "Account resources in items rather than in bytes" + default y + depends on BC_DEBUG + help + When true some of the resources (e.g. kmemsize) are accounted + in items instead of bytes. + +config BC_UNLIMITED + bool "Use unlimited ubc settings" + default y + depends on BC_DEBUG + help + When ON all limits and barriers are set to max values. +endmenu diff --git a/kernel/bc/Makefile b/kernel/bc/Makefile new file mode 100644 index 0000000..e0e6529 --- /dev/null +++ b/kernel/bc/Makefile @@ -0,0 +1,16 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \ + vm_pages.o statd.o oom_kill.o + +obj-$(CONFIG_NET) += net.o +obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o +obj-$(CONFIG_BC_PROC) += proc.o +obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o +obj-$(CONFIG_BC_IO_SCHED) += io_prio.o diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c new file mode 100644 index 0000000..8cd0ef0 --- /dev/null +++ b/kernel/bc/beancounter.c @@ -0,0 +1,673 @@ +/* + * linux/kernel/bc/beancounter.c + * + * Copyright (C) 1998 Alan Cox + * 1998-2000 Andrey V. Savochkin + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - more intelligent limit check in mremap(): currently the new size is + * charged and _then_ old size is uncharged + * (almost done: !move_vma case is completely done, + * move_vma in its current implementation requires too many conditions to + * do things right, because it may be not only expansion, but shrinking + * also, plus do_munmap will require an additional parameter...) 
+ * - problem: bad pmd page handling + * - consider /proc redesign + * - TCP/UDP ports + * + consider whether __charge_beancounter_locked should be inline + * + * Changes: + * 1999/08/17 Marcelo Tosatti + * - Set "barrier" and "limit" parts of limits atomically. + * 1999/10/06 Marcelo Tosatti + * - setublimit system call. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct kmem_cache *ub_cachep; +static struct user_beancounter default_beancounter; +struct user_beancounter ub0; +EXPORT_SYMBOL_GPL(ub0); + +const char *ub_rnames[] = { + "kmemsize", /* 0 */ + "lockedpages", + "privvmpages", + "shmpages", + "dummy", + "numproc", /* 5 */ + "physpages", + "vmguarpages", + "oomguarpages", + "numtcpsock", + "numflock", /* 10 */ + "numpty", + "numsiginfo", + "tcpsndbuf", + "tcprcvbuf", + "othersockbuf", /* 15 */ + "dgramrcvbuf", + "numothersock", + "dcachesize", + "numfile", + "dummy", /* 20 */ + "dummy", + "dummy", + "numiptent", + "unused_privvmpages", /* UB_RESOURCES */ + "tmpfs_respages", + "swap_pages", + "held_pages", +}; + +static void init_beancounter_struct(struct user_beancounter *ub); +static void init_beancounter_store(struct user_beancounter *ub); +static void init_beancounter_nolimits(struct user_beancounter *ub); + +int print_ub_uid(struct user_beancounter *ub, char *buf, int size) +{ + if (ub->parent != NULL) + return snprintf(buf, size, "%u.%u", + ub->parent->ub_uid, ub->ub_uid); + else + return snprintf(buf, size, "%u", ub->ub_uid); +} +EXPORT_SYMBOL(print_ub_uid); + +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) +struct hlist_head ub_hash[UB_HASH_SIZE]; +DEFINE_SPINLOCK(ub_hash_lock); +LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */ +EXPORT_SYMBOL(ub_hash); +EXPORT_SYMBOL(ub_hash_lock); +EXPORT_SYMBOL(ub_list_head); + +/* + * Per user resource beancounting. Resources are tied to their luid. + * The resource structure itself is tagged both to the process and + * the charging resources (a socket doesn't want to have to search for + * things at irq time for example). Reference counters keep things in + * hand. + * + * The case where a user creates resource, kills all his processes and + * then starts new ones is correctly handled this way. The refcounters + * will mean the old entry is still around with resource tied to it. 
+ */ + +static inline void free_ub(struct user_beancounter *ub) +{ + free_percpu(ub->ub_percpu); + kmem_cache_free(ub_cachep, ub); +} + +static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash, + uid_t uid, struct user_beancounter *parent) +{ + struct user_beancounter *ub; + struct hlist_node *ptr; + + hlist_for_each_entry (ub, ptr, hash, ub_hash) + if (ub->ub_uid == uid && ub->parent == parent) + return get_beancounter(ub); + + return NULL; +} + +struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_hash_fun(uid)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, uid, NULL); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) + free_ub(new_ub); + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub); + memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + goto fail_free; + new_ub->ub_uid = uid; + goto retry; + +fail_free: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} +EXPORT_SYMBOL(get_beancounter_byuid); + +struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, + int id, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_subhash_fun(p, id)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, id, p); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) { + put_beancounter(new_ub->parent); + free_ub(new_ub); + } + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating sub %p\n", new_ub); + memset(new_ub, 0, sizeof(*new_ub)); + init_beancounter_nolimits(new_ub); + init_beancounter_store(new_ub); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + goto fail_free; + new_ub->ub_uid = id; + new_ub->parent = get_beancounter(p); + goto retry; + +fail_free: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} +EXPORT_SYMBOL(get_subbeancounter_byid); + +static void put_warn(struct user_beancounter *ub) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n", + atomic_read(&ub->ub_refcount), id, ub); +} + +#ifdef CONFIG_BC_KEEP_UNUSED +#define 
release_beancounter(ub) do { } while (0) +#else +static int verify_res(struct user_beancounter *ub, int resource, + unsigned long held) +{ + char id[64]; + + if (likely(held == 0)) + return 1; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", + id, held, ub_rnames[resource]); + return 0; +} + +static inline void bc_verify_held(struct user_beancounter *ub) +{ + int i, clean; + + clean = 1; + for (i = 0; i < UB_RESOURCES; i++) + clean &= verify_res(ub, i, ub->ub_parms[i].held); + + clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); + clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); + clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); + clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); + + ub_debug_trace(!clean, 5, 60*HZ); +} + +static void bc_free_rcu(struct rcu_head *rcu) +{ + struct user_beancounter *ub; + + ub = container_of(rcu, struct user_beancounter, rcu); + free_ub(ub); +} + +static void delayed_release_beancounter(struct work_struct *w) +{ + struct user_beancounter *ub, *parent; + unsigned long flags; + + ub = container_of(w, struct user_beancounter, cleanup.work); +again: + local_irq_save(flags); + if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) { + /* raced with get_beancounter_byuid */ + local_irq_restore(flags); + return; + } + + hlist_del(&ub->ub_hash); + list_del_rcu(&ub->ub_list); + spin_unlock_irqrestore(&ub_hash_lock, flags); + + bc_verify_held(ub); + ub_free_counters(ub); + bc_fini_ioprio(&ub->iopriv); + parent = ub->parent; + + call_rcu(&ub->rcu, bc_free_rcu); + if (parent) { + ub = parent; + goto again; + } +} + +static inline void release_beancounter(struct user_beancounter *ub) +{ + struct execute_work *ew; + + ew = &ub->cleanup; + INIT_WORK(&ew->work, delayed_release_beancounter); + schedule_work(&ew->work); +} +#endif + +void __put_beancounter(struct user_beancounter *ub) +{ + unsigned long flags; + + /* equevalent to atomic_dec_and_lock_irqsave() */ + local_irq_save(flags); + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { + if (unlikely(atomic_read(&ub->ub_refcount) < 0)) + put_warn(ub); + local_irq_restore(flags); + return; + } + + if (unlikely(ub == get_ub0())) { + printk(KERN_ERR "Trying to put ub0\n"); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return; + } + + /* prevent get_beancounter_byuid + put_beancounter() reentrance */ + atomic_inc(&ub->ub_refcount); + spin_unlock_irqrestore(&ub_hash_lock, flags); + + release_beancounter(ub); +} +EXPORT_SYMBOL(__put_beancounter); + +void put_beancounter_safe(struct user_beancounter *ub) +{ + synchronize_rcu(); + __put_beancounter(ub); +} +EXPORT_SYMBOL(put_beancounter_safe); + +/* + * Generic resource charging stuff + */ + +int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + /* + * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition + * at the moment is possible so an overflow is impossible. 
+ */ + ub->ub_parms[resource].held += val; + + switch (strict) { + case UB_HARD: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].barrier) + break; + case UB_SOFT: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].limit) + break; + case UB_FORCE: + ub_adjust_maxheld(ub, resource); + return 0; + default: + BUG(); + } + + if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) + printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", + ub_rnames[resource], ub->ub_uid); + ub->ub_parms[resource].failcnt++; + ub->ub_parms[resource].held -= val; + return -ENOMEM; +} + +int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + int retval; + struct user_beancounter *p, *q; + unsigned long flags; + + retval = -EINVAL; + if (val > UB_MAXVALUE) + goto out; + + local_irq_save(flags); + for (p = ub; p != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + retval = __charge_beancounter_locked(p, resource, val, strict); + spin_unlock(&p->ub_lock); + if (retval) + goto unroll; + } +out_restore: + local_irq_restore(flags); +out: + return retval; + +unroll: + for (q = ub; q != p; q = q->parent) { + spin_lock(&q->ub_lock); + __uncharge_beancounter_locked(q, resource, val); + spin_unlock(&q->ub_lock); + } + goto out_restore; +} + +EXPORT_SYMBOL(charge_beancounter); + +void __charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __charge_beancounter_locked(p, resource, val, UB_FORCE); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__charge_beancounter_notop); + +void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", + val, held, ub_rnames[resource], id); + ub_debug_trace(1, 10, 10*HZ); +} + +void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val) +{ + ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + if (ub->ub_parms[resource].held < val) { + uncharge_warn(ub, resource, + val, ub->ub_parms[resource].held); + val = ub->ub_parms[resource].held; + } + ub->ub_parms[resource].held -= val; +} + +void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) +{ + unsigned long flags; + struct user_beancounter *p; + + for (p = ub; p != NULL; p = p->parent) { + spin_lock_irqsave(&p->ub_lock, flags); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock_irqrestore(&p->ub_lock, flags); + } +} + +EXPORT_SYMBOL(uncharge_beancounter); + +void __uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__uncharge_beancounter_notop); + + +/* + * Rate limiting stuff. 
+ */ +int ub_ratelimit(struct ub_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} +EXPORT_SYMBOL(ub_ratelimit); + + +/* + * Initialization + * + * struct user_beancounter contains + * - limits and other configuration settings, + * with a copy stored for accounting purposes, + * - structural fields: lists, spinlocks and so on. + * + * Before these parts are initialized, the structure should be memset + * to 0 or copied from a known clean structure. That takes care of a lot + * of fields not initialized explicitly. + */ + +static void init_beancounter_struct(struct user_beancounter *ub) +{ + ub->ub_magic = UB_MAGIC; + atomic_set(&ub->ub_refcount, 1); + spin_lock_init(&ub->ub_lock); + INIT_LIST_HEAD(&ub->ub_tcp_sk_list); + INIT_LIST_HEAD(&ub->ub_other_sk_list); +#ifdef CONFIG_BC_DEBUG_KMEM + INIT_LIST_HEAD(&ub->ub_cclist); +#endif + bc_init_ioprio(&ub->iopriv); +} + +static void init_beancounter_store(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + memcpy(&ub->ub_store[k], &ub->ub_parms[k], + sizeof(struct ubparm)); + } +} + +static void init_beancounter_nolimits(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + ub->ub_parms[k].limit = UB_MAXVALUE; + /* FIXME: whether this is right for physpages and guarantees? */ + ub->ub_parms[k].barrier = UB_MAXVALUE; + } + + /* FIXME: set unlimited rate? */ + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +static void init_beancounter_syslimits(struct user_beancounter *ub) +{ + unsigned long mp; + extern int max_threads; + int k; + + mp = num_physpages; + ub->ub_parms[UB_KMEMSIZE].limit = + mp > (192*1024*1024 >> PAGE_SHIFT) ? 
+ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; + ub->ub_parms[UB_LOCKEDPAGES].limit = 8; + ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; + ub->ub_parms[UB_SHMPAGES].limit = 64; + ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; + ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; + ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ + ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ + ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; + ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ + ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ + ub->ub_parms[UB_NUMFLOCK].limit = 1024; + ub->ub_parms[UB_NUMPTY].limit = 16; + ub->ub_parms[UB_NUMSIGINFO].limit = 1024; + ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; + ub->ub_parms[UB_NUMFILE].limit = 1024; + + for (k = 0; k < UB_RESOURCES; k++) + ub->ub_parms[k].barrier = ub->ub_parms[k].limit; + + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +DEFINE_PER_CPU_STATIC(struct ub_percpu_struct, ub0_percpu); + +void __init ub_init_early(void) +{ + struct user_beancounter *ub; + + init_cache_counters(); + ub = get_ub0(); + memset(ub, 0, sizeof(*ub)); + ub->ub_uid = 0; + init_beancounter_nolimits(ub); + init_beancounter_store(ub); + init_beancounter_struct(ub); + ub->ub_percpu = percpu_static_init(ub0_percpu); + + memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); + (void)set_exec_ub(ub); + current->task_bc.task_ub = get_beancounter(ub); + __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE); + current->task_bc.fork_sub = get_beancounter(ub); + ub_init_task_bc(¤t->task_bc); + init_mm.mm_ub = get_beancounter(ub); + + hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); + list_add(&ub->ub_list, &ub_list_head); +} + +void __init ub_init_late(void) +{ + ub_cachep = kmem_cache_create("user_beancounters", + sizeof(struct user_beancounter), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + memset(&default_beancounter, 0, sizeof(default_beancounter)); +#ifdef CONFIG_BC_UNLIMITED + init_beancounter_nolimits(&default_beancounter); +#else + init_beancounter_syslimits(&default_beancounter); +#endif + init_beancounter_store(&default_beancounter); + init_beancounter_struct(&default_beancounter); +} diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c new file mode 100644 index 0000000..2242d64 --- /dev/null +++ b/kernel/bc/dcache.c @@ -0,0 +1,399 @@ +/* + * kernel/bc/dcache.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Locking + * traverse dcache_lock d_lock + * ub_dentry_charge + - + + * ub_dentry_uncharge + + - + * ub_dentry_charge_nofail + + - + * + * d_inuse changes are atomic, with special handling of "not in use" <-> + * "in use" (-1 <-> 0) transitions. We have two sources of non-atomicity + * here: (1) in many operations we need to change d_inuse of both dentry and + * its parent, and (2) on state transitions we need to adjust the account. + * + * Regarding (1): we do not have (and do not want) a single lock covering all + * operations, so in general it's impossible to get a consistent view of + * a tree with respect to d_inuse counters (except by swsuspend). 
It also
+ * means that if a dentry with d_inuse of 0 gets one new in-use child and loses
+ * one, its d_inuse counter will go either the 0 -> 1 -> 0 path or 0 -> -1 -> 0,
+ * and we can't say which way.
+ * Note that the path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since
+ * uncharge can be done only after return from charge (with d_genocide being
+ * the only apparent exception).
+ * Regarding (2): there is a similar uncertainty with the dcache account.
+ * If the account is equal to the limit, one more dentry starts to be
+ * used and one is put, the account will either hit the limit (and an error
+ * will be returned), or the decrement will happen before the increment.
+ *
+ * These races do not really matter.
+ * The only things we want are:
+ *  - if a system is suspended with no in-use dentries, all d_inuse counters
+ *    should be correct (-1);
+ *  - d_inuse counters should always be >= -1.
+ * This holds if ->parent references are accessed and maintained properly.
+ * In subtle moments (like d_move) dentries exchanging their parents should
+ * both be in-use.  At d_genocide time, lookups and charges are assumed to be
+ * impossible.
+ */
+
+/*
+ * Hierarchical accounting
+ * UB argument must NOT be NULL
+ */
+
+static int do_charge_dcache(struct user_beancounter *ub, unsigned long size,
+		enum ub_severity sv)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv))
+		goto out_mem;
+	if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv))
+		goto out_dcache;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	return 0;
+
+out_dcache:
+	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
+out_mem:
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	return -ENOMEM;
+}
+
+static void do_uncharge_dcache(struct user_beancounter *ub,
+		unsigned long size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
+	__uncharge_beancounter_locked(ub, UB_DCACHESIZE, size);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static int charge_dcache(struct user_beancounter *ub, unsigned long size,
+		enum ub_severity sv)
+{
+	struct user_beancounter *p, *q;
+
+	for (p = ub; p != NULL; p = p->parent) {
+		if (do_charge_dcache(p, size, sv))
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	for (q = ub; q != p; q = q->parent)
+		do_uncharge_dcache(q, size);
+	return -ENOMEM;
+}
+
+void uncharge_dcache(struct user_beancounter *ub, unsigned long size)
+{
+	for (; ub != NULL; ub = ub->parent)
+		do_uncharge_dcache(ub, size);
+}
+
+/*
+ * Simple helpers to maintain the account and the d_ub field.
+ */ + +static inline int d_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) { + put_beancounter(ub); + return -1; + } + d_bc->d_ub = ub; + return 0; +} + +static inline void d_forced_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + charge_dcache(ub, d_bc->d_ubsize, UB_FORCE); + d_bc->d_ub = ub; +} + +/* + * Minor helpers + */ + +extern struct kmem_cache *dentry_cache; +extern struct kmem_cache *inode_cachep; +static struct rw_semaphore ub_dentry_alloc_sem; + +static inline unsigned long d_charge_size(struct dentry *dentry) +{ + /* dentry's d_name is already set to appropriate value (see d_alloc) */ + return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) + + (dname_external(dentry) ? + kmem_dname_objuse((void *)dentry->d_name.name) : 0); +} + +/* + * Entry points from dcache.c + */ + +/* + * Set initial d_inuse on d_alloc. + * Called with no locks, preemption disabled. + */ +int __ub_dentry_alloc(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + + d_bc = &dentry->dentry_bc; + d_bc->d_ub = get_beancounter(get_exec_ub()); + atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */ + d_bc->d_ubsize = d_charge_size(dentry); + + if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) + goto failure; + return 0; + +failure: + put_beancounter(d_bc->d_ub); + d_bc->d_ub = NULL; + return -ENOMEM; +} +void __ub_dentry_alloc_start(void) +{ + down_read(&ub_dentry_alloc_sem); + current->task_bc.dentry_alloc = 1; +} + +void __ub_dentry_alloc_end(void) +{ + current->task_bc.dentry_alloc = 0; + up_read(&ub_dentry_alloc_sem); +} + +/* + * It is assumed that parent is already in use, so traverse upwards is + * limited to one ancestor only. + * Called under d_lock and rcu_read_lock. + */ +int __ub_dentry_charge(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + struct dentry *parent; + int ret; + + if (ub_dget_testone(dentry)) { + d_bc = &dentry->dentry_bc; + /* state transition -1 => 0 */ + if (d_charge(d_bc)) + goto failure; + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + } + return 0; + +failure: + /* + * Here we would like to fail the lookup. + * It is not easy: if d_lookup fails, callers expect that a dentry + * with the given name doesn't exist, and create a new one. + * So, first we forcedly charge for this dentry. + * Then try to remove it from cache safely. If it turns out to be + * possible, we can return error. + */ + d_forced_charge(d_bc); + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + + ret = 0; + if (spin_trylock(&dcache_lock)) { + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + rcu_read_unlock(); + shrink_dcache_parent(dentry); + rcu_read_lock(); + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + } + if (atomic_read(&dentry->d_count) == 1) { + __d_drop(dentry); + ret = -1; + } + spin_unlock(&dcache_lock); + } + + return ret; +} + +/* + * Go up in the tree decreasing d_inuse. + * Called under dcache_lock. 
+ */ +void __ub_dentry_uncharge(struct dentry *dentry) +{ + struct dentry *parent; + struct user_beancounter *ub; + unsigned long size; + + /* go up until state doesn't change or and root is reached */ + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + while (ub_dput_testzero(dentry)) { + /* state transition 0 => -1 */ + uncharge_dcache(ub, size); + put_beancounter(ub); + + parent = dentry->d_parent; + if (dentry == parent) + break; + + dentry = parent; + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + } +} + +/* + * Forced charge for __dget_locked, where API doesn't allow to return error. + * Called under dcache_lock. + */ +void __ub_dentry_charge_nofail(struct dentry *dentry) +{ + struct dentry *parent; + + while (ub_dget_testone(dentry)) { + /* state transition -1 => 0 */ + d_forced_charge(&dentry->dentry_bc); + + parent = dentry->d_parent; + if (dentry == parent) + break; + dentry = parent; + } +} + +/* + * Adaptive accounting + */ + +int ub_dentry_on = 1; +int ub_dentry_alloc_barrier; +EXPORT_SYMBOL(ub_dentry_on); + +static unsigned long checklowat = 0; +static unsigned long checkhiwat = ULONG_MAX; + +static int sysctl_ub_dentry_chk = 10; +#define sysctl_ub_lowat sysctl_ub_watermark[0] +#define sysctl_ub_hiwat sysctl_ub_watermark[1] +static DECLARE_RWSEM(ub_dentry_alloc_sem); +/* 1024th of lowmem size */ +static unsigned int sysctl_ub_watermark[2] = {0, 100}; + +static void ub_dentry_set_limits(unsigned long pages, unsigned long cap) +{ + down_write(&ub_dentry_alloc_sem); + preempt_disable(); + checklowat = (pages >> 10) * sysctl_ub_lowat; + checkhiwat = (pages >> 10) * sysctl_ub_hiwat; + if (checkhiwat > cap) { + checkhiwat = cap; + checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat; + } + preempt_enable(); + up_write(&ub_dentry_alloc_sem); +} + +static int ub_dentry_proc_handler(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + r = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!r && write) + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + ULONG_MAX); + return r; +} + +static ctl_table ub_dentry_sysctl_table[] = { + { + .procname = "dentry_check", + .data = &sysctl_ub_dentry_chk, + .maxlen = sizeof(sysctl_ub_dentry_chk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dentry_watermark", + .data = &sysctl_ub_lowat, + .maxlen = sizeof(sysctl_ub_lowat) * 2, + .mode = 0644, + .proc_handler = ub_dentry_proc_handler, + }, + { .ctl_name = 0 } +}; +static ctl_table ub_dentry_sysctl_root[] = { + { + .procname = "ubc", + .mode = 0555, + .child = ub_dentry_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static int __init ub_dentry_init(void) +{ + /* + * Initial watermarks are limited, to limit walk time. + * 384MB translates into 0.8 sec on PIII 866MHz. + */ + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + 384 * 1024 * 1024 / PAGE_SIZE); + if (register_sysctl_table(ub_dentry_sysctl_root) == NULL) + return -ENOMEM; + return 0; +} +__initcall(ub_dentry_init); diff --git a/kernel/bc/io_acct.c b/kernel/bc/io_acct.c new file mode 100644 index 0000000..e8d6c38 --- /dev/null +++ b/kernel/bc/io_acct.c @@ -0,0 +1,500 @@ +/* + * kernel/bc/io_acct.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct mempool_s *pb_pool; + +#define PB_MIN_IO (1024) + +static inline struct page_beancounter *io_pb_alloc(void) +{ + return mempool_alloc(pb_pool, GFP_ATOMIC); +} + +static inline void io_pb_free(struct page_beancounter *pb) +{ + mempool_free(pb, pb_pool); +} + +struct page_beancounter **page_pblist(struct page *page) +{ + struct page_beancounter **pb, *iopb; + + pb = &page_pbc(page); + iopb = iopb_to_pb(*pb); + + return iopb == NULL ? pb : &iopb->page_pb_list; +} + +/* + * We save the context page was set dirty to use it later + * when the real write starts. If the page is mapped then + * IO pb is stores like this: + * + * Before saving: + * + * +- page -------+ + * | ... | + * | page_pb +---+ + * +--------------+ | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * After saving: + * + * +- page -------+ +- io pb ------+ + * | ... | | ... | + * | page_pb +----> | page_pb_list +-+ + * +--------------+ +--------------+ | + * | + * +-------------------+ + * | + * | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * And the page_pblist(...) function returns pointer to the place that + * points to this pbX ring. + */ + +#ifdef CONFIG_BC_DEBUG_IO +static LIST_HEAD(pb_io_list); +static unsigned long anon_pages, not_released; + +static inline void io_debug_save(struct page_beancounter *pb, + struct page_beancounter *mpb) +{ + pb->io_debug = (mpb == NULL); + list_add(&pb->io_list, &pb_io_list); +} + +static inline void io_debug_release(struct page_beancounter *pb) +{ + list_del(&pb->io_list); +} + +void ub_io_release_debug(struct page *page) +{ + struct page_beancounter *pb; + static int once = 0; + + pb = page_pbc(page); + if (likely(iopb_to_pb(pb) == NULL)) + return; + + if (!once) { + printk("BUG: Page has an IO bc but is not expectd to\n"); + dump_stack(); + once = 1; + } + + spin_lock(&pb_lock); + not_released++; + pb = iopb_to_pb(pb); + page_pbc(page) = NULL; + io_debug_release(pb); + pb->ub->io_pb_held--; + spin_unlock(&pb_lock); + + put_beancounter(pb->ub); + io_pb_free(pb); +} + +static inline int io_debug_precheck_save(struct page *page) +{ + if (unlikely(PageAnon(page))) { + anon_pages++; + return 1; + } + + return 0; +} + +static inline int io_debug_precheck_release(struct page *page) +{ + return 0; +} +#else +#define io_debug_save(pb, mpb) do { } while (0) +#define io_debug_release(pb) do { } while (0) +#define io_debug_precheck_save(page) (0) +#define io_debug_precheck_release(p) (0) +#endif + +static inline void set_page_io(struct page *page, struct page_beancounter *pb, + struct page_beancounter *mapped_pb) +{ + unsigned long val; + + val = (unsigned long)pb | PAGE_IO_MARK; + pb->page = page; + + page_pbc(page) = (struct page_beancounter *)val; + io_debug_save(pb, mapped_pb); + pb->ub->io_pb_held++; +} + +static inline void put_page_io(struct page *page, struct page_beancounter *pb) +{ + pb->ub->io_pb_held--; + io_debug_release(pb); + page_pbc(page) = pb->page_pb_list; +} + +void ub_io_save_context(struct page *page, size_t bytes_dirtied) +{ + struct user_beancounter *ub; + struct page_beancounter *pb, *mapped_pb, *io_pb; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + /* + * FIXME - this can happen 
from atomic context and + * it's probably not that good to loose some requests + */ + + pb = io_pb_alloc(); + io_pb = NULL; + + spin_lock(&pb_lock); + if (io_debug_precheck_save(page)) + goto out_unlock; + + mapped_pb = page_pbc(page); + io_pb = iopb_to_pb(mapped_pb); + if (io_pb != NULL) { + /* + * this page has an IO - release it and force a new one + * We could also race with page cleaning - see below + */ + mapped_pb = io_pb->page_pb_list; + put_page_io(page, io_pb); + } + + /* + * If the page is mapped we must save the context + * it maps to. If the page isn't mapped we use current + * context as this is a regular write. + */ + + if (mapped_pb != NULL) + ub = top_beancounter(mapped_pb->ub); + else + ub = get_io_ub(); + + if (!PageDirty(page)) { + /* + * race with clear_page_dirty(_for_io) - account + * writes for ub_io_release_context() + */ + if (io_pb != NULL) + io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE; + if (pb != NULL) + io_pb_free(pb); + goto out_unlock; + } + + if (pb == NULL) { + ub->bytes_dirty_missed += bytes_dirtied; + goto out_unlock; + } + + /* + * the page may become clean here, but the context will be seen + * in ub_io_release_context() + */ + + pb->ub = get_beancounter(ub); + pb->page_pb_list = mapped_pb; + ub->bytes_dirtied += bytes_dirtied; + + set_page_io(page, pb, mapped_pb); + +out_unlock: + spin_unlock(&pb_lock); + + if (io_pb != NULL) { + put_beancounter(io_pb->ub); + io_pb_free(io_pb); + } +} + +void ub_io_release_context(struct page *page, size_t wrote) +{ + struct page_beancounter *pb; + + if (io_debug_precheck_release(page)) + return; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + spin_lock(&pb_lock); + pb = iopb_to_pb(page_pbc(page)); + if (unlikely(pb == NULL)) + /* + * this may happen if we failed to allocate + * context in ub_io_save_context or raced with it + */ + goto out_unlock; + + if (wrote) + pb->ub->bytes_wrote += wrote; + + put_page_io(page, pb); +out_unlock: + spin_unlock(&pb_lock); + + if (pb != NULL) { + put_beancounter(pb->ub); + io_pb_free(pb); + } +} + +void __init ub_init_io(struct kmem_cache *pb_cachep) +{ + pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep); + if (pb_pool == NULL) + panic("Can't create pb_pool"); +} + +#ifdef CONFIG_PROC_FS +#define in_flight(var) (var > var##_done ? 
var - var##_done : 0) + +static int bc_ioacct_show(struct seq_file *f, void *v) +{ + int i; + unsigned long long read, write, cancel; + unsigned long sync, sync_done; + unsigned long fsync, fsync_done; + unsigned long fdsync, fdsync_done; + unsigned long frsync, frsync_done; + unsigned long reads, writes; + unsigned long long rchar, wchar; + struct user_beancounter *ub; + + ub = seq_beancounter(f); + + read = write = cancel = 0; + sync = sync_done = fsync = fsync_done = + fdsync = fdsync_done = frsync = frsync_done = 0; + reads = writes = 0; + rchar = wchar = 0; + for_each_online_cpu(i) { + struct ub_percpu_struct *ub_percpu; + ub_percpu = per_cpu_ptr(ub->ub_percpu, i); + + read += ub_percpu->bytes_read; + write += ub_percpu->bytes_wrote; + cancel += ub_percpu->bytes_cancelled; + + sync += ub_percpu->sync; + fsync += ub_percpu->fsync; + fdsync += ub_percpu->fdsync; + frsync += ub_percpu->frsync; + sync_done += ub_percpu->sync_done; + fsync_done += ub_percpu->fsync_done; + fdsync_done += ub_percpu->fdsync_done; + frsync_done += ub_percpu->frsync_done; + + reads += ub_percpu->read; + writes += ub_percpu->write; + rchar += ub_percpu->rchar; + wchar += ub_percpu->wchar; + } + + seq_printf(f, bc_proc_llu_fmt, "read", read); + seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write); + seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied); + seq_printf(f, bc_proc_llu_fmt, "cancel", cancel); + seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync)); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync)); + + seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads); + seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar); + seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes); + seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar); + + seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held); + return 0; +} + +static struct bc_proc_entry bc_ioacct_entry = { + .name = "ioacct", + .u.show = bc_ioacct_show, +}; + +#ifdef CONFIG_BC_DEBUG_IO +#define PTR_SIZE (int)(sizeof(void *) * 2) +#define INT_SIZE (int)(sizeof(int) * 2) + +static int bc_io_show(struct seq_file *f, void *v) +{ + struct list_head *lh; + struct page_beancounter *pb; + struct page *pg; + + lh = (struct list_head *)v; + if (lh == &pb_io_list) { + seq_printf(f, "Races: anon %lu missed %lu\n", + anon_pages, not_released); + + seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s " + "%-*s %-*s %-1s %-*s %-*s\n", + PTR_SIZE, "pb", "", + PTR_SIZE, "page", "flg", + INT_SIZE, "cnt", INT_SIZE, "mcnt", + PTR_SIZE, "pb_list", + PTR_SIZE, "page_pb", "", + PTR_SIZE, "mapping", + INT_SIZE, "ub"); + return 0; + } + + pb = list_entry(lh, struct page_beancounter, io_list); + pg = pb->page; + seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n", + pb, pb->io_debug ? 'e' : 'm', pg, + PageDirty(pg) ? 'D' : 'd', + PageAnon(pg) ? 'A' : 'a', + PageWriteback(pg) ? 'W' : 'w', + PageLocked(pg) ? 'L' : 'l', + INT_SIZE, page_count(pg), + INT_SIZE, page_mapcount(pg), + pb->page_pb_list, page_pbc(pg), + iopb_to_pb(page_pbc(pg)) == pb ? 
' ' : '!', + pg->mapping, pb->ub->ub_uid); + return 0; +} + +static void *bc_io_start(struct seq_file *f, loff_t *ppos) +{ + spin_lock(&pb_lock); + return seq_list_start_head(&pb_io_list, *ppos); +} + +static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos) +{ + return seq_list_next(v, &pb_io_list, ppos); +} + +static void bc_io_stop(struct seq_file *f, void *v) +{ + spin_unlock(&pb_lock); +} + +static struct seq_operations bc_io_seq_ops = { + .start = bc_io_start, + .next = bc_io_next, + .stop = bc_io_stop, + .show = bc_io_show, +}; + +static int bc_io_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &bc_io_seq_ops); +} +static struct file_operations bc_io_debug_ops = { + .open = bc_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_ioacct_debug_entry = { + .name = "ioacct_debug", + .u.fops = &bc_io_debug_ops, +}; +#endif + +static int bc_ioacct_notify(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + unsigned long *vm_events; + unsigned long long bin, bout; + int i; + + if (event != VIRTINFO_VMSTAT) + return old_ret; + + ub = top_beancounter(get_exec_ub()); + if (ub == get_ub0()) + return old_ret; + + /* Think over: do we need to account here bytes_dirty_missed? */ + bout = ub->bytes_wrote; + bin = 0; + for_each_online_cpu(i) { + bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote; + bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read; + } + + /* convert to Kbytes */ + bout >>= 10; + bin >>= 10; + + vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS; + vm_events[PGPGOUT] = (unsigned long)bout; + vm_events[PGPGIN] = (unsigned long)bin; + return NOTIFY_OK; +} + +static struct vnotifier_block bc_ioacct_nb = { + .notifier_call = bc_ioacct_notify, +}; + +static int __init bc_ioacct_init(void) +{ +#ifdef CONFIG_BC_DEBUG_IO + bc_register_proc_root_entry(&bc_ioacct_debug_entry); +#endif + bc_register_proc_entry(&bc_ioacct_entry); + + virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb); + return 0; +} + +late_initcall(bc_ioacct_init); +#endif diff --git a/kernel/bc/io_prio.c b/kernel/bc/io_prio.c new file mode 100644 index 0000000..20aa133 --- /dev/null +++ b/kernel/bc/io_prio.c @@ -0,0 +1,288 @@ +/* + * kernel/bc/io_prio.c + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Vasily Tarasov + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct cfq_bc_data *__find_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + + list_for_each_entry(cfq_bc, &iopriv->cfq_bc_head, cfq_bc_list) + if (cfq_bc->cfqd == cfqd) + return cfq_bc; + + return NULL; +} + +struct cfq_bc_data *bc_find_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + unsigned long flags; + + read_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + read_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + return cfq_bc; +} +struct cfq_bc_data *bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t gfp_mask) +{ + struct cfq_bc_data *cfq_bc_new; + struct cfq_bc_data *cfq_bc; + unsigned long flags; + + cfq_bc = bc_find_cfq_bc(iopriv, cfqd); + if (cfq_bc) + return cfq_bc; + + cfq_bc_new = kzalloc(sizeof(*cfq_bc_new), gfp_mask); + if (!cfq_bc_new) + return NULL; + + cfq_init_cfq_bc(cfq_bc_new); + cfq_bc_new->cfqd = cfqd; + cfq_bc_new->ub_iopriv = iopriv; + + write_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + if (cfq_bc) + kfree(cfq_bc_new); + else { + list_add_tail(&cfq_bc_new->cfq_bc_list, + &iopriv->cfq_bc_head); + cfq_bc = cfq_bc_new; + } + write_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + + return cfq_bc; +} + +void bc_init_ioprio(struct ub_iopriv *iopriv) +{ + INIT_LIST_HEAD(&iopriv->cfq_bc_head); + rwlock_init(&iopriv->cfq_bc_list_lock); + iopriv->ioprio = UB_IOPRIO_BASE; +} + +static void inline bc_cfq_bc_check_empty(struct cfq_bc_data *cfq_bc) +{ + BUG_ON(!RB_EMPTY_ROOT(&cfq_bc->service_tree.rb)); +} + +static void bc_release_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + struct cfq_data *cfqd; + elevator_t *eq; + int i; + + cfqd = cfq_bc->cfqd; + eq = cfqd->queue->elevator; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) { + if (cfq_bc->async_cfqq[0][i]) { + eq->ops->put_queue(cfq_bc->async_cfqq[0][i]); + cfq_bc->async_cfqq[0][i] = NULL; + } + if (cfq_bc->async_cfqq[1][i]) { + eq->ops->put_queue(cfq_bc->async_cfqq[1][i]); + cfq_bc->async_cfqq[1][i] = NULL; + } + } + if (cfq_bc->async_idle_cfqq) { + eq->ops->put_queue(cfq_bc->async_idle_cfqq); + cfq_bc->async_idle_cfqq = NULL; + } + /* + * Note: this cfq_bc is already not in active list, + * but can be still pointed from cfqd as active. + */ + cfqd->active_cfq_bc = NULL; + + bc_cfq_bc_check_empty(cfq_bc); + list_del(&cfq_bc->cfq_bc_list); + kfree(cfq_bc); +} + +void bc_fini_ioprio(struct ub_iopriv *iopriv) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + unsigned long flags; + spinlock_t *queue_lock; + + /* + * Don't get cfq_bc_list_lock since ub is already dead, + * but async cfqqs are still in hash list, consequently + * queue_lock should be hold. 
+ */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &iopriv->cfq_bc_head, cfq_bc_list) { + queue_lock = cfq_bc->cfqd->queue->queue_lock; + spin_lock_irqsave(queue_lock, flags); + bc_release_cfq_bc(cfq_bc); + spin_unlock_irqrestore(queue_lock, flags); + } +} + +void bc_cfq_exit_queue(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct user_beancounter *ub; + + local_irq_disable(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + bc_release_cfq_bc(cfq_bc); + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + local_irq_enable(); +} + +int bc_expired(struct cfq_data *cfqd) +{ + return time_after(jiffies, cfqd->slice_end) ? 1 : 0; +} + +static inline int bc_empty(struct cfq_bc_data *cfq_bc) +{ + /* + * consider BC as empty only if there is no requests + * in elevator _and_ in driver + */ + if (!cfq_bc->rqnum && !cfq_bc->on_dispatch) + return 1; + + return 0; +} + +static inline unsigned long bc_time_slice_by_ioprio(unsigned int ioprio, + unsigned int base_slice) +{ + return base_slice + + (base_slice * (ioprio - UB_IOPRIO_MIN)) + / (UB_IOPRIO_MAX - UB_IOPRIO_MIN - 1); +} + +static inline void bc_set_active(struct cfq_data *cfqd) +{ + if (list_empty(&cfqd->act_cfq_bc_head)) { + cfqd->active_cfq_bc = NULL; + return; + } + + cfqd->active_cfq_bc = list_first_entry(&cfqd->act_cfq_bc_head, + struct cfq_bc_data, act_cfq_bc_list); + list_move_tail(&cfqd->active_cfq_bc->act_cfq_bc_list, + &cfqd->act_cfq_bc_head); + cfqd->slice_end = jiffies + + bc_time_slice_by_ioprio(cfqd->active_cfq_bc->ub_iopriv->ioprio, + cfqd->cfq_ub_slice); +} + +void bc_schedule_active(struct cfq_data *cfqd) +{ + if (bc_expired(cfqd) || !cfqd->active_cfq_bc || + bc_empty(cfqd->active_cfq_bc)) + bc_set_active(cfqd); +} + +void bc_inc_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + if (!cfq_bc->rqnum) + list_add_tail(&cfq_bc->act_cfq_bc_list, + &cfqq->cfqd->act_cfq_bc_head); + + cfq_bc->rqnum++; +} + +void bc_dec_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + cfq_bc->rqnum--; + + if (!cfq_bc->rqnum) + list_del(&cfq_bc->act_cfq_bc_list); +} + +unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + struct user_beancounter *ub; + + if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX) + return -ERANGE; + + ub = get_beancounter_byuid(ubid, 0); + if (!ub) + return -ESRCH; + + ub->iopriv.ioprio = ioprio; + put_beancounter(ub); + + return 0; +} + +struct user_beancounter *bc_io_switch_context(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *old_ub = NULL; + + pb = page_iopb(page); + pb = iopb_to_pb(pb); + if (pb) { + get_beancounter(pb->ub); + old_ub = set_exec_ub(pb->ub); + } + + return old_ub; +} + +void bc_io_restore_context(struct user_beancounter *ub) +{ + struct user_beancounter *old_ub; + + if (ub) { + old_ub = set_exec_ub(ub); + put_beancounter(old_ub); + } +} + +EXPORT_SYMBOL(bc_io_switch_context); +EXPORT_SYMBOL(bc_io_restore_context); +EXPORT_SYMBOL(__find_cfq_bc); +EXPORT_SYMBOL(bc_fini_ioprio); +EXPORT_SYMBOL(bc_init_ioprio); +EXPORT_SYMBOL(bc_findcreate_cfq_bc); +EXPORT_SYMBOL(bc_cfq_exit_queue); +EXPORT_SYMBOL(bc_expired); +EXPORT_SYMBOL(bc_schedule_active); +EXPORT_SYMBOL(bc_inc_rqnum); +EXPORT_SYMBOL(bc_dec_rqnum); diff --git a/kernel/bc/kmem.c b/kernel/bc/kmem.c new file mode 100644 index 0000000..74c4179 --- /dev/null +++ 
b/kernel/bc/kmem.c @@ -0,0 +1,406 @@ +/* + * kernel/bc/kmem.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Initialization + */ + +/* + * Slab accounting + */ + +#ifdef CONFIG_BC_DEBUG_KMEM + +#define CC_HASH_SIZE 1024 +static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; +spinlock_t cc_lock; + +static void __free_cache_counters(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + struct ub_cache_counter *cc, **pprev, *del; + int i; + unsigned long flags; + + del = NULL; + spin_lock_irqsave(&cc_lock, flags); + for (i = 0; i < CC_HASH_SIZE; i++) { + pprev = &cc_hash[i]; + cc = cc_hash[i]; + while (cc != NULL) { + if (cc->ub != ub && cc->cachep != cachep) { + pprev = &cc->next; + cc = cc->next; + continue; + } + + list_del(&cc->ulist); + *pprev = cc->next; + cc->next = del; + del = cc; + cc = *pprev; + } + } + spin_unlock_irqrestore(&cc_lock, flags); + + while (del != NULL) { + cc = del->next; + kfree(del); + del = cc; + } +} + +void ub_free_counters(struct user_beancounter *ub) +{ + __free_cache_counters(ub, NULL); +} + +void ub_kmemcache_free(struct kmem_cache *cachep) +{ + __free_cache_counters(NULL, cachep); +} + +void __init init_cache_counters(void) +{ + memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); + spin_lock_init(&cc_lock); +} + +#define cc_hash_fun(ub, cachep) ( \ + (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ + ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ + ) & (CC_HASH_SIZE - 1)) + +static int change_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep, long val) +{ + struct ub_cache_counter *cc, *new_cnt, **pprev; + unsigned long flags; + + new_cnt = NULL; +again: + spin_lock_irqsave(&cc_lock, flags); + cc = cc_hash[cc_hash_fun(ub, cachep)]; + while (cc) { + if (cc->ub == ub && cc->cachep == cachep) + goto found; + cc = cc->next; + } + + if (new_cnt != NULL) + goto insert; + + spin_unlock_irqrestore(&cc_lock, flags); + + new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC); + if (new_cnt == NULL) + return -ENOMEM; + + new_cnt->counter = 0; + new_cnt->ub = ub; + new_cnt->cachep = cachep; + goto again; + +insert: + pprev = &cc_hash[cc_hash_fun(ub, cachep)]; + new_cnt->next = *pprev; + *pprev = new_cnt; + list_add(&new_cnt->ulist, &ub->ub_cclist); + cc = new_cnt; + new_cnt = NULL; + +found: + cc->counter += val; + spin_unlock_irqrestore(&cc_lock, flags); + if (new_cnt) + kfree(new_cnt); + return 0; +} + +static inline int inc_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + return change_slab_charged(ub, cachep, 1); +} + +static inline void dec_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + if (change_slab_charged(ub, cachep, -1) < 0) + BUG(); +} + +#include + +#define inc_pages_charged(ub, order) ub_percpu_add(ub, \ + pages_charged, 1 << order) +#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \ + pages_charged, 1 << order) + +#ifdef CONFIG_PROC_FS +static int bc_kmem_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + struct ub_cache_counter *cc; + long pages, vmpages, pbc; + int i; + + ub = seq_beancounter(f); + + pages = vmpages = pbc = 0; + for_each_online_cpu(i) { + pages += per_cpu_ptr(ub->ub_percpu, 
i)->pages_charged; + vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged; + pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs; + } + if (pages < 0) + pages = 0; + if (vmpages < 0) + vmpages = 0; + + seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc, + sizeof(struct page_beancounter)); + + spin_lock_irq(&cc_lock); + list_for_each_entry (cc, &ub->ub_cclist, ulist) { + struct kmem_cache *cachep; + + cachep = cc->cachep; + seq_printf(f, bc_proc_lu_lu_fmt, + kmem_cache_name(cachep), + cc->counter, + kmem_cache_objuse(cachep)); + } + spin_unlock_irq(&cc_lock); + return 0; +} + +static struct bc_proc_entry bc_kmem_debug_entry = { + .name = "kmem_debug", + .u.show = bc_kmem_debug_show, +}; + +static int __init bc_kmem_debug_init(void) +{ + bc_register_proc_entry(&bc_kmem_debug_entry); + return 0; +} + +late_initcall(bc_kmem_debug_init); +#endif + +#else +#define inc_slab_charged(ub, cache) (0) +#define dec_slab_charged(ub, cache) do { } while (0) +#define inc_pages_charged(ub, cache) do { } while (0) +#define dec_pages_charged(ub, cache) do { } while (0) +#endif + +#define UB_KMEM_QUANT (PAGE_SIZE * 4) + +/* called with IRQ disabled */ +int ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, + enum ub_severity strict) +{ + struct task_beancounter *tbc; + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub || size > UB_KMEM_QUANT) + goto just_charge; + if (tbc->kmem_precharged >= size) { + tbc->kmem_precharged -= size; + return 0; + } + + if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) { + tbc->kmem_precharged += UB_KMEM_QUANT - size; + return 0; + } + +just_charge: + return charge_beancounter(ub, UB_KMEMSIZE, size, strict); +} + +/* called with IRQ disabled */ +void ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + struct task_beancounter *tbc; + + if (size > UB_MAXVALUE) { + printk("ub_kmemsize_uncharge: size %lu\n", size); + dump_stack(); + } + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub) + goto just_uncharge; + + tbc->kmem_precharged += size; + if (tbc->kmem_precharged < UB_KMEM_QUANT * 2) + return; + size = tbc->kmem_precharged - UB_KMEM_QUANT; + tbc->kmem_precharged -= size; + +just_uncharge: + uncharge_beancounter(ub, UB_KMEMSIZE, size); +} + +/* called with IRQ disabled */ +int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags) +{ + unsigned int size; + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + return 0; + + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + if (ub_kmemsize_charge(ub, size, + (flags & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) + goto out_err; + + if (inc_slab_charged(ub, cachep) < 0) { + ub_kmemsize_uncharge(ub, size); + goto out_err; + } + *ub_slab_ptr(cachep, objp) = ub; + return 0; + +out_err: + put_beancounter(ub); + return -ENOMEM; +} + +/* called with IRQ disabled */ +void ub_slab_uncharge(struct kmem_cache *cachep, void *objp) +{ + unsigned int size; + struct user_beancounter **ub_ref; + + ub_ref = ub_slab_ptr(cachep, objp); + if (*ub_ref == NULL) + return; + + dec_slab_charged(*ub_ref, cachep); + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + ub_kmemsize_uncharge(*ub_ref, size); + put_beancounter(*ub_ref); + *ub_ref = NULL; +} + +/* + * Pages accounting + */ + +int ub_page_charge(struct page *page, int order, gfp_t mask) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = NULL; + if (!(mask & __GFP_UBC)) + goto out; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + goto out; + + local_irq_save(flags); + if (ub_kmemsize_charge(ub, CHARGE_ORDER(order), + (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) + goto err; + + inc_pages_charged(ub, order); + local_irq_restore(flags); +out: + BUG_ON(page_ub(page) != NULL); + page_ub(page) = ub; + return 0; + +err: + local_irq_restore(flags); + BUG_ON(page_ub(page) != NULL); + put_beancounter(ub); + return -ENOMEM; +} + +void ub_page_uncharge(struct page *page, int order) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = page_ub(page); + if (ub == NULL) + return; + + BUG_ON(ub->ub_magic != UB_MAGIC); + dec_pages_charged(ub, order); + local_irq_save(flags); + ub_kmemsize_uncharge(ub, CHARGE_ORDER(order)); + local_irq_restore(flags); + put_beancounter(ub); + page_ub(page) = NULL; +} + +/* + * takes init_mm.page_table_lock + * some outer lock to protect pages from vmalloced area must be held + */ +struct user_beancounter *vmalloc_ub(void *obj) +{ + struct page *pg; + + pg = vmalloc_to_page(obj); + if (pg == NULL) + return NULL; + + return page_ub(pg); +} + +EXPORT_SYMBOL(vmalloc_ub); + +struct user_beancounter *mem_ub(void *obj) +{ + struct user_beancounter *ub; + + if ((unsigned long)obj >= VMALLOC_START && + (unsigned long)obj < VMALLOC_END) + ub = vmalloc_ub(obj); + else + ub = slab_ub(obj); + + return ub; +} + +EXPORT_SYMBOL(mem_ub); diff --git a/kernel/bc/misc.c b/kernel/bc/misc.c new file mode 100644 index 0000000..20c28a7 --- /dev/null +++ b/kernel/bc/misc.c @@ -0,0 +1,455 @@ +/* + * kernel/bc/misc.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_FILE_MINQUANT 3 +#define UB_FILE_MAXQUANT 10 +#define UB_FILE_INIQUANT 4 + +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize); + +extern struct kmem_cache *filp_cachep; + +static inline unsigned long ub_file_kmemsize(unsigned long nr) +{ + return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr; +} + +/* + * Task staff + */ + +static void init_task_sub(struct task_struct *parent, + struct task_struct *tsk, + struct task_beancounter *old_bc) +{ + struct task_beancounter *new_bc; + struct user_beancounter *sub; + + new_bc = &tsk->task_bc; + sub = old_bc->fork_sub; + new_bc->fork_sub = get_beancounter(sub); + new_bc->task_fnode = NULL; + new_bc->task_freserv = old_bc->task_freserv; + old_bc->task_freserv = NULL; + memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); + new_bc->pgfault_handle = 0; + new_bc->pgfault_allot = 0; +} + +void ub_init_task_bc(struct task_beancounter *tbc) +{ + tbc->file_precharged = 0; + tbc->file_quant = UB_FILE_INIQUANT; + tbc->file_count = 0; + + tbc->kmem_precharged = 0; + tbc->dentry_alloc = 0; +} + +int ub_task_charge(struct task_struct *parent, struct task_struct *task) +{ + struct task_beancounter *old_bc; + struct task_beancounter *new_bc; + struct user_beancounter *ub, *pub; + unsigned long file_nr, kmemsize; + unsigned long flags; + + old_bc = &parent->task_bc; + ub = old_bc->fork_sub; + new_bc = &task->task_bc; + new_bc->task_ub = get_beancounter(ub); + new_bc->exec_ub = get_beancounter(ub); + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC, + 1, UB_HARD) < 0)) + goto out_numproc; + + ub_init_task_bc(new_bc); + file_nr = ub_file_precharge(new_bc, pub, &kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + charge_beancounter_notop(ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize); + } + + init_task_sub(parent, task, old_bc); + return 0; + +out_numproc: + spin_unlock_irqrestore(&pub->ub_lock, flags); + __put_beancounter_batch(ub, 2); + return -ENOMEM; +} + +extern atomic_t dbgpre; + +void ub_task_uncharge(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long file_nr, file_kmemsize; + unsigned long flags; + + task_bc = &task->task_bc; + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + __uncharge_beancounter_locked(pub, UB_NUMPROC, 1); + file_nr = task_bc->file_precharged; + if (likely(file_nr)) + __uncharge_beancounter_locked(pub, + UB_NUMFILE, file_nr); + + /* see comment in ub_file_charge */ + task_bc->file_precharged = 0; + file_kmemsize = ub_file_kmemsize(file_nr); + if (likely(file_kmemsize)) + __uncharge_beancounter_locked(pub, + UB_KMEMSIZE, file_kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + uncharge_beancounter_notop(task_bc->task_ub, + UB_NUMFILE, file_nr); + __put_beancounter_batch(task_bc->task_ub, file_nr); + } + if (likely(file_kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, + UB_KMEMSIZE, file_kmemsize); +} + +void ub_task_put(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long 
kmemsize, flags; + + task_bc = &task->task_bc; + + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + kmemsize = task_bc->kmem_precharged; + task_bc->kmem_precharged = 0; + if (likely(kmemsize)) + __uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + if (likely(kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize); + + put_beancounter(task_bc->exec_ub); + put_beancounter(task_bc->task_ub); + put_beancounter(task_bc->fork_sub); + /* can't be freed elsewhere, failures possible in the middle of fork */ + if (task_bc->task_freserv != NULL) + kfree(task_bc->task_freserv); + + task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; + task_bc->task_ub = (struct user_beancounter *)0xdead100c; + BUG_ON(task_bc->kmem_precharged != 0); +} + +/* + * Files and file locks. + */ +/* + * For NUMFILE, we do not take a lock and call charge function + * for every file. We try to charge in batches, keeping local reserve on + * task. For experimental purposes, batch size is adaptive and depends + * on numfile barrier, number of processes, and the history of successes and + * failures of batch charges. + * + * Per-task fields have the following meaning + * file_precharged number of files charged to beancounter in advance, + * file_quant logarithm of batch size + * file_count counter of charge successes, to reduce batch size + * fluctuations. + */ +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize) +{ + unsigned long n, kmem; + + n = 1UL << task_bc->file_quant; + if (ub->ub_parms[UB_NUMPROC].held > + (ub->ub_parms[UB_NUMFILE].barrier >> + task_bc->file_quant)) + goto nopre; + if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD))) + goto nopre; + kmem = ub_file_kmemsize(n); + if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE, + kmem, UB_HARD))) + goto nopre_kmem; + + task_bc->file_precharged += n; + get_beancounter_batch(task_bc->task_ub, n); + task_bc->file_count++; + if (task_bc->file_quant < UB_FILE_MAXQUANT && + task_bc->file_count >= task_bc->file_quant) { + task_bc->file_quant++; + task_bc->file_count = 0; + } + *kmemsize = kmem; + return n; + +nopre_kmem: + __uncharge_beancounter_locked(ub, UB_NUMFILE, n); +nopre: + if (task_bc->file_quant > UB_FILE_MINQUANT) + task_bc->file_quant--; + task_bc->file_count = 0; + return 0; +} + +int ub_file_charge(struct file *f) +{ + struct user_beancounter *ub, *pub; + struct task_beancounter *task_bc; + unsigned long file_nr, kmem; + unsigned long flags; + int err; + + task_bc = ¤t->task_bc; + ub = get_exec_ub(); + if (unlikely(ub != task_bc->task_ub)) + goto just_charge; + + if (likely(task_bc->file_precharged > 0)) { + /* + * files are put via RCU in 2.6.16 so during + * this decrement an IRQ can happen and called + * ub_files_uncharge() will mess file_precharged + * + * ub_task_uncharge() is called via RCU also so no + * protection is needed there + * + * Xemul + */ + + local_irq_save(flags); + task_bc->file_precharged--; + local_irq_restore(flags); + + f->f_ub = ub; + return 0; + } + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + file_nr = ub_file_precharge(task_bc, pub, &kmem); + if (unlikely(!file_nr)) + goto last_try; + spin_unlock(&pub->ub_lock); + task_bc->file_precharged--; + local_irq_restore(flags); + + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); + f->f_ub = 
ub; + return 0; + +just_charge: + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); +last_try: + kmem = ub_file_kmemsize(1); + err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD); + if (likely(!err)) { + err = __charge_beancounter_locked(pub, UB_KMEMSIZE, + kmem, UB_HARD); + if (unlikely(err)) + __uncharge_beancounter_locked(pub, UB_NUMFILE, 1); + } + spin_unlock_irqrestore(&pub->ub_lock, flags); + if (likely(!err)) { + charge_beancounter_notop(ub, UB_NUMFILE, 1); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); + f->f_ub = get_beancounter(ub); + } + return err; +} + +void ub_file_uncharge(struct file *f) +{ + struct user_beancounter *ub, *pub; + struct task_beancounter *task_bc; + unsigned long nr; + + ub = f->f_ub; + task_bc = ¤t->task_bc; + if (likely(ub == task_bc->task_ub)) { + task_bc->file_precharged++; + pub = top_beancounter(ub); + if (ub_barrier_farnr(pub, UB_NUMFILE) && + ub_barrier_farsz(pub, UB_KMEMSIZE)) + return; + if (task_bc->file_precharged < (1UL << task_bc->file_quant)) + return; + nr = task_bc->file_precharged + - (1UL << (task_bc->file_quant - 1)); + task_bc->file_precharged -= nr; + __put_beancounter_batch(ub, nr); + uncharge_beancounter(ub, UB_NUMFILE, nr); + uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr)); + } else { + uncharge_beancounter(ub, UB_NUMFILE, 1); + uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1)); + put_beancounter(ub); + } +} + +int ub_flock_charge(struct file_lock *fl, int hard) +{ + struct user_beancounter *ub; + int err; + + /* No need to get_beancounter here since it's already got in slab */ + ub = slab_ub(fl); + if (ub == NULL) + return 0; + + err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT); + if (!err) + fl->fl_charged = 1; + return err; +} + +void ub_flock_uncharge(struct file_lock *fl) +{ + struct user_beancounter *ub; + + /* Ub will be put in slab */ + ub = slab_ub(fl); + if (ub == NULL || !fl->fl_charged) + return; + + uncharge_beancounter(ub, UB_NUMFLOCK, 1); + fl->fl_charged = 0; +} + +/* + * Signal handling + */ + +static int do_ub_siginfo_charge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) + goto out_kmem; + + if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) + goto out_num; + + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_num: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); +out_kmem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_ub_siginfo_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) +{ + unsigned long size; + struct user_beancounter *p, *q; + + size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (p = ub; p != NULL; p = p->parent) { + if (do_ub_siginfo_charge(p, size)) + goto unroll; + } + + sq->sig_ub = get_beancounter(ub); + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_ub_siginfo_uncharge(q, size); + return -ENOMEM; +} +EXPORT_SYMBOL(ub_siginfo_charge); + +void ub_siginfo_uncharge(struct sigqueue *sq) +{ + unsigned long size; + struct user_beancounter *ub, *p; + + p = ub = sq->sig_ub; + sq->sig_ub = NULL; + 
size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (; ub != NULL; ub = ub->parent) + do_ub_siginfo_uncharge(ub, size); + put_beancounter(p); +} + +/* + * PTYs + */ + +int ub_pty_charge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + int retval; + + ub = slab_ub(tty); + retval = 0; + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + !test_bit(TTY_CHARGED, &tty->flags)) { + retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); + if (!retval) + set_bit(TTY_CHARGED, &tty->flags); + } + return retval; +} + +void ub_pty_uncharge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + + ub = slab_ub(tty); + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + test_bit(TTY_CHARGED, &tty->flags)) { + uncharge_beancounter(ub, UB_NUMPTY, 1); + clear_bit(TTY_CHARGED, &tty->flags); + } +} diff --git a/kernel/bc/net.c b/kernel/bc/net.c new file mode 100644 index 0000000..ace2cb6 --- /dev/null +++ b/kernel/bc/net.c @@ -0,0 +1,1152 @@ +/* + * linux/kernel/bc/net.c + * + * Copyright (C) 1998-2004 Andrey V. Savochkin + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - sizeof(struct inode) charge + * = tcp_mem_schedule() feedback based on ub limits + * + measures so that one socket won't exhaust all send buffers, + * see bug in bugzilla + * = sk->socket check for NULL in snd_wakeups + * (tcp_write_space checks for NULL itself) + * + in tcp_close(), orphaned socket abortion should be based on ubc + * resources (same in tcp_out_of_resources) + * Beancounter should also have separate orphaned socket counter... + * + for rcv, in-order segment should be accepted + * if only barrier is exceeded + * = tcp_rmem_schedule() feedback based on ub limits + * - repair forward_alloc mechanism for receive buffers + * It's idea is that some buffer space is pre-charged so that receive fast + * path doesn't need to take spinlocks and do other heavy stuff + * + tcp_prune_queue actions based on ub limits + * + window adjustments depending on available buffers for receive + * - window adjustments depending on available buffers for send + * + race around usewreserv + * + avoid allocating new page for each tiny-gram, see letter from ANK + * + rename ub_sock_lock + * + sk->sleep wait queue probably can be used for all wakeups, and + * sk->ub_wait is unnecessary + * + for UNIX sockets, the current algorithm will lead to + * UB_UNIX_MINBUF-sized messages only for non-blocking case + * - charge for af_packet sockets + * + all datagram sockets should be charged to NUMUNIXSOCK + * - we do not charge for skb copies and clones staying in device queues + * + live-lock if number of sockets is big and buffer limits are small + * [diff-ubc-dbllim3] + * - check that multiple readers/writers on the same socket won't cause fatal + * consequences + * - check allocation/charge orders + * + There is potential problem with callback_lock. In *snd_wakeup we take + * beancounter first, in sock_def_error_report - callback_lock first. + * then beancounter. This is not a problem if callback_lock taken + * readonly, but anyway... 
+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator + * General kernel problems: + * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC + * notification won't get signals + * - datagram_poll looks racy + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* by some reason it is not used currently */ +#define UB_SOCK_MAINTAIN_WMEMPRESSURE 0 + + +/* Skb truesize definition. Bad place. Den */ + +static inline int skb_chargesize_head(struct sk_buff *skb) +{ + return skb_charge_size(skb_end_pointer(skb) - skb->head + + sizeof(struct skb_shared_info)); +} + +int skb_charge_fullsize(struct sk_buff *skb) +{ + int chargesize; + struct sk_buff *skbfrag; + + chargesize = skb_chargesize_head(skb) + + PAGE_SIZE * skb_shinfo(skb)->nr_frags; + if (likely(skb_shinfo(skb)->frag_list == NULL)) + return chargesize; + for (skbfrag = skb_shinfo(skb)->frag_list; + skbfrag != NULL; + skbfrag = skbfrag->next) { + chargesize += skb_charge_fullsize(skbfrag); + } + return chargesize; +} +EXPORT_SYMBOL(skb_charge_fullsize); + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size); + +int __ub_too_many_orphans(struct sock *sk, int count) +{ + struct user_beancounter *ub; + + if (sock_has_ubc(sk)) { + ub = top_beancounter(sock_bc(sk)->ub); + if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) + return 1; + } + return 0; +} + +/* + * Queueing + */ + +static void ub_sock_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_other_sk_list)) { + p = ub->ub_other_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * See comments in ub_tcp_snd_wakeup. + * Locking note: both unix_write_space and + * sock_def_write_space take callback_lock themselves. + * We take it here just to be on the safe side and to + * act the same way as ub_tcp_snd_wakeup does. 
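The added = -skbc->poll_reserv ... added += skbc->poll_reserv pattern used in this loop (and again in ub_tcp_snd_wakeup and ub_sock_snd_queue_add) extracts how much the reserve grew, so that only the growth is propagated to the non-top beancounters. A minimal worked sketch of the idiom, with hypothetical byte counts:

	unsigned long added;

	added  = -skbc->poll_reserv;	/* reserve was 4096: added = -4096  */
	/* ub_sock_makewreserv_locked() succeeds, poll_reserv becomes 16384 */
	added += skbc->poll_reserv;	/* added = 16384 - 4096 = 12288     */

	if (skbc->ub != ub && added)	/* charge only the 12288 new bytes  */
		charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added);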
+ */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, + UB_OTHERSOCKBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +static void ub_tcp_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_tcp_sk_list)) { + p = ub->ub_tcp_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * Send async notifications and wake up. + * Locking note: we get callback_lock here because + * tcp_write_space is over-optimistic about calling context + * (socket lock is presumed). So we get the lock here although + * it belongs to the callback. + */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) +{ + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long added_reserv; + + if (!sock_has_ubc(sk)) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); + added_reserv = -skbc->poll_reserv; + if (!ub_sock_makewreserv_locked(sk, res, size)) { + /* + * It looks a bit hackish, but it is compatible with both + * wait_for_xx_ubspace and poll. + * This __set_current_state is equivalent to a wakeup event + * right after spin_unlock_irqrestore. 
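Every charged skb records who paid for it and against which resource. The sketch below summarizes the lifecycle implemented by the helpers defined later in this file; the function names are real, the flow itself is a summary rather than new code:

	charge:    ub_sock_tcp_chargesend() / ub_sock_tcp_chargerecv() /
	           ub_sockrcvbuf_charge() / ub_nlrcvbuf_charge()
	               -> ub_skb_set_charge(skb, sk, size, resource)
	                  skb_bc(skb) remembers { ub, charged, resource }

	uncharge:  ub_skb_uncharge(skb), reached for instance via skb_orphan()
	           (see the EXPORT_SYMBOL note below), dispatches on
	           skb_bc(skb)->resource to the matching *_uncharge() helper,
	           which returns the charge and clears the record.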
+ */ + __set_current_state(TASK_RUNNING); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (added_reserv) + charge_beancounter_notop(skbc->ub, res, added_reserv); + return; + } + + ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); + skbc->ub_waitspc = size; + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "re-adding socket to beancounter %p.\n", ub); + goto out; + } + + switch (res) { + case UB_TCPSNDBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_tcp_sk_list); + break; + case UB_OTHERSOCKBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_other_sk_list); + break; + default: + BUG(); + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +EXPORT_SYMBOL(ub_sock_snd_queue_add); + +long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + set_current_state(TASK_INTERRUPTIBLE); + if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) + break; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); + timeo = schedule_timeout(timeo); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return timeo; +} + +void ub_sock_sndqueuedel(struct sock *sk) +{ + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long flags; + + if (!sock_has_ubc(sk)) + return; + skbc = sock_bc(sk); + + /* race with write_space callback of other socket */ + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + list_del_init(&skbc->ub_sock_list); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Helpers + */ + +static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + WARN_ON_ONCE(skb_bc(skb)->ub != NULL); + + skb_bc(skb)->ub = sock_bc(sk)->ub; + skb_bc(skb)->charged = size; + skb_bc(skb)->resource = resource; +} + +void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + if (!sock_has_ubc(sk)) + return; + + if (sock_bc(sk)->ub == NULL) + BUG(); + + __ub_skb_set_charge(skb, sk, size, resource); + + /* Ugly. Ugly. Skb in sk writequeue can live without ref to sk */ + if (skb->sk == NULL) + skb->sk = sk; +} + +EXPORT_SYMBOL(ub_skb_set_charge); + +static inline void ub_skb_set_uncharge(struct sk_buff *skb) +{ + skb_bc(skb)->ub = NULL; + skb_bc(skb)->charged = 0; + skb_bc(skb)->resource = 0; +} + +static void ub_update_rmem_thres(struct sock_beancounter *skub) +{ + struct user_beancounter *ub; + + if (skub && skub->ub) { + ub = top_beancounter(skub->ub); + ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / + (ub->ub_parms[UB_NUMTCPSOCK].held + 1); + } +} + +static inline void ub_sock_wcharge_dec(struct sock *sk, + unsigned long chargesize) +{ + /* The check sk->sk_family != PF_NETLINK is made as the skb is + * queued to the kernel end of socket while changed to the user one. 
+ * Den */ + if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) { + if (sock_bc(sk)->ub_wcharged > chargesize) + sock_bc(sk)->ub_wcharged -= chargesize; + else + sock_bc(sk)->ub_wcharged = 0; + } +} + +/* + * Charge socket number + */ + +static inline void sk_alloc_beancounter(struct sock *sk) +{ + struct sock_beancounter *skbc; + + skbc = sock_bc(sk); + memset(skbc, 0, sizeof(struct sock_beancounter)); +} + +static inline void sk_free_beancounter(struct sock *sk) +{ +} + +static int __sock_charge(struct sock *sk, int res) +{ + struct sock_beancounter *skbc; + struct user_beancounter *cub, *ub; + unsigned long added_reserv, added_forw; + unsigned long flags; + + cub = get_exec_ub(); + if (unlikely(cub == NULL)) + return 0; + + sk_alloc_beancounter(sk); + skbc = sock_bc(sk); + INIT_LIST_HEAD(&skbc->ub_sock_list); + + ub = top_beancounter(cub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0)) + goto out_limit; + + added_reserv = 0; + added_forw = 0; + if (res == UB_NUMTCPSOCK) { + added_reserv = skb_charge_size(MAX_TCP_HEADER + + 1500 - sizeof(struct iphdr) - + sizeof(struct tcphdr)); + added_reserv *= 4; + ub->ub_parms[UB_TCPSNDBUF].held += added_reserv; + if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) { + ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv; + added_reserv = 0; + } + skbc->poll_reserv = added_reserv; + + added_forw = SK_MEM_QUANTUM * 4; + ub->ub_parms[UB_TCPRCVBUF].held += added_forw; + if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + ub->ub_parms[UB_TCPRCVBUF].held -= added_forw; + added_forw = 0; + } + skbc->forw_space = added_forw; + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + charge_beancounter_notop(cub, res, 1); + if (added_reserv) + charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv); + if (added_forw) + charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw); + + skbc->ub = get_beancounter(cub); + return 0; + +out_limit: + spin_unlock_irqrestore(&ub->ub_lock, flags); + sk_free_beancounter(sk); + return -ENOMEM; +} + +int ub_tcp_sock_charge(struct sock *sk) +{ + int ret; + + ret = __sock_charge(sk, UB_NUMTCPSOCK); + ub_update_rmem_thres(sock_bc(sk)); + + return ret; +} + +int ub_other_sock_charge(struct sock *sk) +{ + return __sock_charge(sk, UB_NUMOTHERSOCK); +} + +EXPORT_SYMBOL(ub_other_sock_charge); + +int ub_sock_charge(struct sock *sk, int family, int type) +{ + return (IS_TCP_SOCK(family, type) ? + ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); +} + +EXPORT_SYMBOL(ub_sock_charge); + +/* + * Uncharge socket number + */ + +void ub_sock_uncharge(struct sock *sk) +{ + int is_tcp_sock; + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long reserv, forw; + + if (unlikely(!sock_has_ubc(sk))) + return; + + is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); + skbc = sock_bc(sk); + ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); + + ub = top_beancounter(skbc->ub); + + spin_lock_irqsave(&ub->ub_lock, flags); + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "ub_sock_uncharge: removing from ub(%p) queue.\n", + skbc); + list_del_init(&skbc->ub_sock_list); + } + + reserv = skbc->poll_reserv; + forw = skbc->forw_space; + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? 
UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + ub_sock_wcharge_dec(sk, reserv); + if (unlikely(skbc->ub_wcharged)) + printk(KERN_WARNING + "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", + skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); + skbc->poll_reserv = 0; + skbc->forw_space = 0; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + put_beancounter(skbc->ub); + sk_free_beancounter(sk); +} + +/* + * Special case for netlink_dump - (un)charges precalculated size + */ + +int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) +{ + int ret; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + ret = charge_beancounter(sock_bc(sk)->ub, + UB_OTHERSOCKBUF, chargesize, UB_HARD); + if (ret < 0) + return ret; + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); + return ret; +} + +/* + * Poll reserve accounting + * + * This is the core of socket buffer management (along with queueing/wakeup + * functions. The rest of buffer accounting either call these functions, or + * repeat parts of their logic for some simpler cases. + */ + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size) +{ + unsigned long wcharge_added; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + skbc = sock_bc(sk); + if (skbc->poll_reserv >= size) /* no work to be done */ + goto out; + + ub = top_beancounter(skbc->ub); + ub->ub_parms[bufid].held += size - skbc->poll_reserv; + + wcharge_added = 0; + /* + * Logic: + * 1) when used memory hits barrier, we set wmem_pressure; + * wmem_pressure is reset under barrier/2; + * between barrier/2 and barrier we limit per-socket buffer growth; + * 2) each socket is guaranteed to get (limit-barrier)/maxsockets + * calculated on the base of memory eaten after the barrier is hit + */ + skbc = sock_bc(sk); +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub_hfbarrier_hit(ub, bufid)) { + if (ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 0; + } +#endif + if (ub_barrier_hit(ub, bufid)) { +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 1; +#endif + if (sk->sk_family == PF_NETLINK) + goto unroll; + wcharge_added = size - skbc->poll_reserv; + skbc->ub_wcharged += wcharge_added; + if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit + + ub->ub_parms[bufid].barrier > + ub->ub_parms[bufid].limit) + goto unroll_wch; + } + if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) + goto unroll; + + ub_adjust_maxheld(ub, bufid); + skbc->poll_reserv = size; +out: + return 0; + +unroll_wch: + skbc->ub_wcharged -= wcharge_added; +unroll: + ub_debug(UBD_NET_SEND, + "makewres: deny " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_parms[bufid].failcnt++; + 
ub->ub_parms[bufid].held -= size - skbc->poll_reserv; + + if (sk->sk_socket != NULL) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + return -ENOMEM; +} + +int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + skbc = sock_bc(sk); + + /* + * This function provides that there is sufficient reserve upon return + * only if sk has only one user. We can check poll_reserv without + * serialization and avoid locking if the reserve already exists. + */ + if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size)) + return 0; + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, bufid, size); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, bufid, added_reserv); + + return err; +} + +EXPORT_SYMBOL(ub_sock_make_wreserv); + +int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* optimize for the case if socket has sufficient reserve */ + ub_sock_make_wreserv(sk, bufid, size); + skbc = sock_bc(sk); + if (likely(skbc->poll_reserv >= size)) { + skbc->poll_reserv -= size; + return 0; + } + return -ENOMEM; +} + +EXPORT_SYMBOL(ub_sock_get_wreserv); + +static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long extra; + unsigned long flags; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + + extra = 0; + spin_lock_irqsave(&ub->ub_lock, flags); + skbc->poll_reserv += size; + if (skbc->poll_reserv > ressize) { + extra = skbc->poll_reserv - ressize; + ub_sock_wcharge_dec(sk, extra); + skbc->poll_reserv = ressize; + + __uncharge_beancounter_locked(ub, bufid, extra); + if (bufid == UB_TCPSNDBUF) + ub_tcp_snd_wakeup(ub); + else + ub_sock_snd_wakeup(ub); + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (extra) + uncharge_beancounter_notop(skbc->ub, bufid, extra); +} + +void ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + if (unlikely(!sock_has_ubc(sk))) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + /* check if the reserve can be kept */ + if (ub_barrier_farsz(ub, bufid)) { + skbc->poll_reserv += size; + return; + } + ub_sock_do_ret_wreserv(sk, bufid, size, ressize); +} + +/* + * UB_DGRAMRCVBUF + */ + +static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + unsigned long chargesize; + + chargesize = skb_charge_fullsize(skb); + if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, + chargesize, UB_HARD)) + return -ENOMEM; + + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return 0; +} + +int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + if (unlikely(!sock_has_ubc(sk))) + return 0; + + if (IS_TCP_SOCK(sk->sk_family, sk->sk_type)) + return ub_tcprcvbuf_charge(sk, skb); + else + return ub_dgramrcvbuf_charge(sk, skb); +} + +EXPORT_SYMBOL(ub_sockrcvbuf_charge); + +static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) +{ + uncharge_beancounter(skb_bc(skb)->ub, 
UB_DGRAMRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + +/* + * UB_TCPRCVBUF + */ + +int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int retval; + unsigned long flags; + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + skbc = sock_bc(sk); + + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->forw_space >= chargesize)) { + skbc->forw_space -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + return 0; + } + + /* + * Memory pressure reactions: + * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) + * 2) set UB_RMEM_SHRINK and tcp_clamp_window() + * tcp_collapse_queues() if rmem_alloc > rcvbuf + * 3) drop OFO, tcp_purge_ofo() + * 4) drop all. + * Currently, we do #2 and #3 at once (which means that current + * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, + * for example...) + * On memory pressure we jump from #0 to #3, and when the pressure + * subsides, to #1. + */ + retval = 0; + ub = top_beancounter(sock_bc(sk)->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[UB_TCPRCVBUF].held += chargesize; + if (ub->ub_parms[UB_TCPRCVBUF].held > + ub->ub_parms[UB_TCPRCVBUF].barrier && + strict != UB_FORCE) + goto excess; + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + +out: + if (retval == 0) { + charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, + chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + } + return retval; + +excess: + ub->ub_rmem_pressure = UB_RMEM_SHRINK; + if (strict == UB_HARD) + retval = -ENOMEM; + if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) + retval = -ENOMEM; + /* + * We try to leave numsock*maxadvmss as a reserve for sockets not + * queueing any data yet (if the difference between the barrier and the + * limit is enough for this reserve). 
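A worked example of the check performed just below, with purely hypothetical limits:

	UB_TCPRCVBUF.limit  = 4194304 (4 MB)
	UB_NUMTCPSOCK.limit = 128
	ub_maxadvmss        = 1500
	reserve             = 128 * 1500 = 192000 bytes

	A socket that already has data queued (sk_rmem_alloc != 0) starts
	getting -ENOMEM once held would exceed 4194304 - 192000 = 4002304
	bytes, keeping roughly one advertised MSS available for each socket
	that has not queued anything yet.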
+ */ + if (ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss + > ub->ub_parms[UB_TCPRCVBUF].limit && + atomic_read(&sk->sk_rmem_alloc)) + retval = -ENOMEM; + if (retval) { + ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; + ub->ub_parms[UB_TCPRCVBUF].failcnt++; + } + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + goto out; +} +EXPORT_SYMBOL(ub_sock_tcp_chargerecv); + +static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + unsigned long held, bar; + int prev_pres; + struct user_beancounter *ub; + + ub = top_beancounter(skb_bc(skb)->ub); + if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged; + ub_skb_set_uncharge(skb); + return; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { + printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", + skb_bc(skb)->charged, + ub, ub->ub_parms[UB_TCPRCVBUF].held); + /* ass-saving bung */ + skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; + } + ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; + held = ub->ub_parms[UB_TCPRCVBUF].held; + bar = ub->ub_parms[UB_TCPRCVBUF].barrier; + prev_pres = ub->ub_rmem_pressure; + if (held <= bar - (bar >> 2)) + ub->ub_rmem_pressure = UB_RMEM_EXPAND; + else if (held <= bar) + ub->ub_rmem_pressure = UB_RMEM_KEEP; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + + +/* + * UB_OTHERSOCKBUF and UB_TCPSNDBUF + */ + +static void ub_socksndbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + struct user_beancounter *ub, *cub; + unsigned long chargesize; + + cub = skb_bc(skb)->ub; + ub = top_beancounter(cub); + chargesize = skb_bc(skb)->charged; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize); + if (skb->sk != NULL && sock_has_ubc(skb->sk)) + ub_sock_wcharge_dec(skb->sk, chargesize); + ub_sock_snd_wakeup(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize); + ub_skb_set_uncharge(skb); +} + +/* expected to be called under socket lock */ +static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) +{ + /* + * ub_sock_ret_wreserv call is abused here, we just want to uncharge + * skb size. However, to reduce duplication of the code doing + * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call + * a function that already does all of this. 2006/04/27 SAW + */ + ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged, + sock_bc(skb->sk)->poll_reserv); + ub_skb_set_uncharge(skb); +} + +void ub_skb_uncharge(struct sk_buff *skb) +{ + switch (skb_bc(skb)->resource) { + case UB_TCPSNDBUF: + ub_tcpsndbuf_uncharge(skb); + break; + case UB_TCPRCVBUF: + ub_tcprcvbuf_uncharge(skb); + break; + case UB_DGRAMRCVBUF: + ub_sockrcvbuf_uncharge(skb); + break; + case UB_OTHERSOCKBUF: + ub_socksndbuf_uncharge(skb); + break; + } +} + +EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ + +/* + * Other sock reserve managment + */ + +int ub_sock_getwres_other(struct sock *sk, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* + * Nothing except beancounter lock protects skbc->poll_reserv. 
+ * So, take the lock and do the job. + * Dances with added_reserv repeat ub_sock_make_wreserv. + */ + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size); + added_reserv += skbc->poll_reserv; + if (!err) + skbc->poll_reserv -= size; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv); + + return err; +} +EXPORT_SYMBOL(ub_sock_getwres_other); + +void ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + + ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); +} + +/* + * TCP send buffers accouting. Paged part + */ + +int ub_sock_tcp_chargepage(struct sock *sk) +{ + struct sock_beancounter *skbc; + unsigned long extra; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE); + if (likely(skbc->poll_reserv >= PAGE_SIZE)) { + skbc->poll_reserv -= PAGE_SIZE; + return 0; + } + + /* + * Ok, full page is not available. + * However, this function must succeed if poll previously indicated + * that write is possible. We better make a forced charge here + * than reserve a whole page in poll. + */ + err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE); + if (unlikely(err < 0)) + goto out; + if (skbc->poll_reserv < PAGE_SIZE) { + extra = PAGE_SIZE - skbc->poll_reserv; + err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra, + UB_FORCE); + if (err < 0) + goto out; + skbc->poll_reserv += extra; + } + skbc->poll_reserv -= PAGE_SIZE; + return 0; + +out: + return err; +} + +void ub_sock_tcp_detachpage(struct sock *sk) +{ + struct sk_buff *skb; + + if (unlikely(!sock_has_ubc(sk))) + return; + + /* The page is just detached from socket. The last skb in queue + with paged part holds referrence to it */ + skb = skb_peek_tail(&sk->sk_write_queue); + if (skb == NULL) { + /* If the queue is empty - all data is sent and page is about + to be freed */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE, + sock_bc(sk)->poll_reserv); + } else { + /* Last skb is a good aproximation for a last skb with + paged part */ + skb_bc(skb)->charged += PAGE_SIZE; + } +} + +/* + * TCPSNDBUF charge functions below are called in the following cases: + * - sending of SYN, SYN-ACK, FIN, the latter charge is forced by + * some technical reasons in TCP code; + * - fragmentation of TCP packets. + * These functions are allowed but not required to use poll_reserv. + * Originally, these functions didn't do that, since it didn't make + * any sense. Now, since poll_reserv now has a function of general reserve, + * they use it. 
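For illustration of the severity argument, here is a hypothetical caller (not a call site from this patch; must_send is an illustrative flag): packets that have to go out regardless, such as the forced FIN mentioned above, are charged with UB_FORCE, while ordinary data fragments use UB_HARD and must cope with failure.

	if (ub_sock_tcp_chargesend(sk, skb, must_send ? UB_FORCE : UB_HARD)) {
		/* only the UB_HARD charge can be refused */
		kfree_skb(skb);
		return -ENOBUFS;
	}
	/* skb is now accounted to UB_TCPSNDBUF and carries its charge record */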
+ */ +int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int ret; + unsigned long chargesize; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->poll_reserv >= chargesize)) { + skbc->poll_reserv -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + /* XXX hack, see ub_skb_set_charge */ + skb->sk = sk; + return 0; + } + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF, + chargesize, strict); + /* + * Note: this check is not equivalent of the corresponding check + * in makewreserv. It's similar in spirit, but an equivalent check + * would be too long and complicated here. + */ + if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF)) + skbc->ub_wcharged += chargesize; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (likely(!ret)) { + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + } + return ret; +} +EXPORT_SYMBOL(ub_sock_tcp_chargesend); + +/* + * Initialization + */ + +int __init skbc_cache_init(void) +{ + return 0; +} diff --git a/kernel/bc/oom_kill.c b/kernel/bc/oom_kill.c new file mode 100644 index 0000000..c79e826 --- /dev/null +++ b/kernel/bc/oom_kill.c @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_OOM_TIMEOUT (5 * HZ) + +int oom_generation; +int oom_kill_counter; +static DEFINE_SPINLOCK(oom_lock); +static DECLARE_WAIT_QUEUE_HEAD(oom_wq); + +static inline int ub_oom_completed(struct task_struct *tsk) +{ + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + /* we were oom killed - just die */ + return 1; + if (tsk->task_bc.oom_generation != oom_generation) + /* some task was succesfully killed */ + return 1; + return 0; +} + +static void ub_clear_oom(void) +{ + struct user_beancounter *ub; + + rcu_read_lock(); + for_each_beancounter(ub) + ub->ub_oom_noproc = 0; + rcu_read_unlock(); +} + +/* Called with cpuset_lock held */ +int ub_oom_lock(void) +{ + int timeout; + DEFINE_WAIT(oom_w); + struct task_struct *tsk; + + tsk = current; + + spin_lock(&oom_lock); + if (!oom_kill_counter) + goto out_do_oom; + + timeout = UB_OOM_TIMEOUT; + while (1) { + if (ub_oom_completed(tsk)) { + spin_unlock(&oom_lock); + return -EINVAL; + } + + if (timeout == 0) + break; + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&oom_wq, &oom_w); + spin_unlock(&oom_lock); + cpuset_unlock(); + + timeout = schedule_timeout(timeout); + + cpuset_lock(); + spin_lock(&oom_lock); + remove_wait_queue(&oom_wq, &oom_w); + } + +out_do_oom: + ub_clear_oom(); + return 0; +} + +static inline long ub_current_overdraft(struct user_beancounter *ub) +{ + return ub->ub_parms[UB_OOMGUARPAGES].held + + ((ub->ub_parms[UB_KMEMSIZE].held + + ub->ub_parms[UB_TCPSNDBUF].held + + ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_OTHERSOCKBUF].held + + ub->ub_parms[UB_DGRAMRCVBUF].held) + >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier; +} + +int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk) +{ + struct user_beancounter *mm_ub; + + if (ub == NULL) + return 0; + + task_lock(tsk); + if (tsk->mm == NULL) + mm_ub = NULL; + else + mm_ub = tsk->mm->mm_ub; + + while (mm_ub != NULL && mm_ub != ub) + mm_ub = mm_ub->parent; + task_unlock(tsk); + + return mm_ub != 
ub; +} + +struct user_beancounter *ub_oom_select_worst(void) +{ + struct user_beancounter *ub, *walkp; + long ub_maxover; + + ub_maxover = 0; + ub = NULL; + + rcu_read_lock(); + for_each_beancounter (walkp) { + long ub_overdraft; + + if (walkp->parent != NULL) + continue; + if (walkp->ub_oom_noproc) + continue; + + ub_overdraft = ub_current_overdraft(walkp); + if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) { + put_beancounter(ub); + ub = walkp; + ub_maxover = ub_overdraft; + } + } + + if (ub) + ub->ub_oom_noproc = 1; + rcu_read_unlock(); + + return ub; +} + +void ub_oom_mm_killed(struct user_beancounter *ub) +{ + static struct ub_rate_info ri = { 5, 60*HZ }; + + /* increment is serialized with oom_lock */ + ub->ub_parms[UB_OOMGUARPAGES].failcnt++; + + if (ub_ratelimit(&ri)) + show_mem(); +} + +void ub_oom_unlock(void) +{ + spin_unlock(&oom_lock); +} + +void ub_oom_task_dead(struct task_struct *tsk) +{ + spin_lock(&oom_lock); + oom_kill_counter = 0; + oom_generation++; + + printk("OOM killed process %s (pid=%d, ve=%d) exited, " + "free=%lu gen=%d.\n", + tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env), + nr_free_pages(), oom_generation); + /* if there is time to sleep in ub_oom_lock -> sleep will continue */ + wake_up_all(&oom_wq); + spin_unlock(&oom_lock); +} + +void ub_out_of_memory(struct user_beancounter *scope) +{ + struct user_beancounter *ub; + struct task_struct *p; + + cpuset_lock(); + spin_lock(&oom_lock); + ub_clear_oom(); + ub = get_beancounter(scope); + + read_lock(&tasklist_lock); +retry: + p = select_bad_process(ub, NULL); + if (p == NULL || PTR_ERR(p) == -1UL) + goto unlock; + + if (oom_kill_process(p, (gfp_t)-1, -1, NULL, "UB Out of memory")) + goto retry; + + put_beancounter(ub); + +unlock: + read_unlock(&tasklist_lock); + spin_unlock(&oom_lock); + cpuset_unlock(); +} +EXPORT_SYMBOL(ub_out_of_memory); diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c new file mode 100644 index 0000000..5b1ae4b --- /dev/null +++ b/kernel/bc/proc.c @@ -0,0 +1,682 @@ +/* + * kernel/bc/proc.c + * + * Copyright (C) 2006 OpenVZ. SWsoft Inc. 
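The code below builds the user-visible reporting interface. For orientation, this is the layout it creates and the table format it prints; the sample values are hypothetical and the column widths differ between 32-bit and 64-bit kernels:

	/proc/user_beancounters     compat table of all top-level beancounters
	/proc/bc/resources          full resource table, all top-level beancounters
	/proc/bc/<uid>/resources    one beancounter's resources
	/proc/bc/<uid>/debug        refcounts and pointers (CONFIG_UBC_DEBUG only)

	Version: 2.5
	       uid  resource     held   maxheld   barrier     limit   failcnt
	       101: kmemsize  1097482   1529317   2457600   2621440         0
	            numproc        28        37       128       128         0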
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Generic output formats */ +#if BITS_PER_LONG == 32 +const char *bc_proc_lu_fmt = "\t%-20s %10lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n"; +#else +const char *bc_proc_lu_fmt = "\t%-20s %21lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n"; +#endif + +#if BITS_PER_LONG == 32 +static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n"; +static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n"; +#else +static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n"; +static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n"; +#endif + +static void ub_show_res(struct seq_file *f, struct user_beancounter *ub, + int r, int show_uid) +{ + int len; + char ub_uid[64]; + + if (show_uid && r == 0) { + len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2); + ub_uid[len] = ':'; + ub_uid[len + 1] = '\0'; + } else + strcpy(ub_uid, ""); + + seq_printf(f, res_fmt, ub_uid, ub_rnames[r], + ub->ub_parms[r].held, + ub->ub_parms[r].maxheld, + ub->ub_parms[r].barrier, + ub->ub_parms[r].limit, + ub->ub_parms[r].failcnt); +} + +static void __show_resources(struct seq_file *f, struct user_beancounter *ub, + int show_uid) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + if (strcmp(ub_rnames[i], "dummy") != 0) + ub_show_res(f, ub, i, show_uid); + + for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++) + ub_show_res(f, ub, i, show_uid); +} + +static int bc_resources_show(struct seq_file *f, void *v) +{ + __show_resources(f, seq_beancounter(f), 0); + return 0; +} + +static struct bc_proc_entry bc_resources_entry = { + .name = "resources", + .u.show = bc_resources_show, +}; + +#ifdef CONFIG_UBC_DEBUG +static int bc_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + char buf[64]; + + ub = seq_beancounter(f); + print_ub_uid(ub, buf, sizeof(buf)); + seq_printf(f, "uid: %s\n", buf); + seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount)); + + seq_printf(f, "bc: %p\n", ub); + seq_printf(f, "par: %p\n", ub->parent); + seq_printf(f, "priv: %p\n", ub->private_data); + return 0; +} + +static struct bc_proc_entry bc_debug_entry = { + .name = "debug", + .u.show = bc_debug_show, +}; +#endif + +static int ub_show(struct seq_file *f, void *v) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + ub_show_res(f, (struct user_beancounter *)v, i, 1); + return 0; +} + +static int res_show(struct seq_file *f, void *v) +{ + __show_resources(f, (struct user_beancounter *)v, 1); + return 0; +} + +static int ub_accessible(struct user_beancounter *exec, + struct user_beancounter *target) +{ + struct user_beancounter *p, *q; + + p = top_beancounter(exec); + q = top_beancounter(target); + + return (p == get_ub0() || p == q); +} + +static void ub_show_header(struct seq_file *f) +{ + seq_printf(f, "Version: 2.5\n"); + seq_printf(f, head_fmt, "uid", "resource", + "held", "maxheld", "barrier", "limit", "failcnt"); +} + +static void *ub_start(struct seq_file *f, loff_t *ppos) +{ + struct user_beancounter *ub; + struct user_beancounter *exec_ub; + unsigned long pos; + + pos = *ppos; + if (pos == 0) + ub_show_header(f); + + exec_ub = get_exec_ub(); + + rcu_read_lock(); + for_each_beancounter(ub) { + if (ub->parent != 
NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + if (pos-- == 0) + return ub; + } + return NULL; +} + +static void *ub_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct user_beancounter *ub; + struct list_head *entry; + struct user_beancounter *exec_ub; + + exec_ub = get_exec_ub(); + ub = (struct user_beancounter *)v; + + entry = &ub->ub_list; + + list_for_each_continue_rcu(entry, &ub_list_head) { + ub = list_entry(entry, struct user_beancounter, ub_list); + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + + (*ppos)++; + return ub; + } + return NULL; +} + +static void ub_stop(struct seq_file *f, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations ub_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = ub_show, +}; + +static int ub_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &ub_seq_ops); +} + +static struct file_operations ub_file_operations = { + .open = ub_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations res_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = res_show, +}; + +static int res_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &res_seq_ops); +} + +static struct file_operations resources_operations = { + .open = res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_all_resources_entry = { + .name = "resources", + .u.fops = &resources_operations, +}; + +/* + * Generic showing stuff + */ + +static int cookies, num_entries; +static struct bc_proc_entry *bc_entries __read_mostly; +static struct bc_proc_entry *bc_root_entries __read_mostly; +static DEFINE_SPINLOCK(bc_entries_lock); +static struct proc_dir_entry *bc_proc_root; + +void bc_register_proc_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_entries; + bc_entries = e; + num_entries++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_entry); + +void bc_register_proc_root_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_root_entries; + bc_root_entries = e; + bc_proc_root->nlink++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_root_entry); + +/* + * small helpers + */ + +static inline unsigned long bc_make_ino(struct user_beancounter *ub) +{ + unsigned long ret; + + ret = 0xbc000000; + if (ub->parent) + ret |= ((ub->parent->ub_uid) << 4); + ret |= (ub->ub_uid + 1); + return ret; +} + +static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de) +{ + return 0xbe000000 + de->cookie; +} + +static int bc_d_delete(struct dentry *d) +{ + return 1; +} + +static void bc_d_release(struct dentry *d) +{ + put_beancounter((struct user_beancounter *)d->d_fsdata); +} + +static struct inode_operations bc_entry_iops; +static struct file_operations bc_entry_fops; +static struct dentry_operations bc_dentry_ops = { + .d_delete = bc_d_delete, + .d_release = bc_d_release, +}; + +/* + * common directory operations' helpers + */ + +static int bc_readdir(struct file *file, filldir_t filler, void *data, + struct user_beancounter *parent) +{ + int err = 0; + loff_t pos, filled; + struct user_beancounter *ub, 
*prev; + struct bc_proc_entry *pde; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + pos = file->f_pos; + if (pos == 0) { + err = (*filler)(data, ".", 1, pos, + file->f_dentry->d_inode->i_ino, DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + if (pos == 1) { + err = (*filler)(data, "..", 2, pos, + parent_ino(file->f_dentry), DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + filled = 2; + for (pde = (parent == NULL ? bc_root_entries : bc_entries); + pde != NULL; pde = pde->next) { + if (filled++ < pos) + continue; + + err = (*filler)(data, pde->name, strlen(pde->name), pos, + bc_make_file_ino(pde), DT_REG); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + rcu_read_lock(); + prev = NULL; + ub = list_entry(&ub_list_head, struct user_beancounter, ub_list); + while (1) { + int len; + unsigned long ino; + char buf[64]; + + ub = list_entry(rcu_dereference(ub->ub_list.next), + struct user_beancounter, ub_list); + if (&ub->ub_list == &ub_list_head) + break; + + if (ub->parent != parent) + continue; + + if (filled++ < pos) + continue; + + if (!get_beancounter_rcu(ub)) + continue; + + rcu_read_unlock(); + put_beancounter(prev); + + len = print_ub_uid(ub, buf, sizeof(buf)); + ino = bc_make_ino(ub); + + err = (*filler)(data, buf, len, pos, ino, DT_DIR); + if (err < 0) { + err = 0; + put_beancounter(ub); + goto out; + } + + rcu_read_lock(); + prev = ub; + pos++; + } + rcu_read_unlock(); + put_beancounter(prev); +out: + file->f_pos = pos; + return err; +} + +static int bc_looktest(struct inode *ino, void *data) +{ + return ino->i_op == &bc_entry_iops && ino->i_private == data; +} + +static int bc_lookset(struct inode *ino, void *data) +{ + struct user_beancounter *ub; + + ub = (struct user_beancounter *)data; + ino->i_private = data; + ino->i_ino = bc_make_ino(ub); + ino->i_fop = &bc_entry_fops; + ino->i_op = &bc_entry_iops; + ino->i_mode = S_IFDIR | S_IRUSR | S_IXUGO; + /* subbeancounters are not included, but who cares? 
*/ + ino->i_nlink = num_entries + 2; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir, + struct dentry *dentry) +{ + struct inode *ino; + + ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub); + if (ino == NULL) + goto out_put; + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + dentry->d_fsdata = ub; + d_add(dentry, ino); + return NULL; + +out_put: + put_beancounter(ub); + return ERR_PTR(-ENOENT); +} + +/* + * files (bc_proc_entry) manipulations + */ + +static struct dentry *bc_lookup_file(struct inode *dir, + struct dentry *dentry, struct bc_proc_entry *root, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *)) +{ + struct bc_proc_entry *pde; + struct inode *ino; + + for (pde = root; pde != NULL; pde = pde->next) + if (strcmp(pde->name, dentry->d_name.name) == 0) + break; + + if (pde == NULL) + return ERR_PTR(-ESRCH); + + ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde); + if (ino == NULL) + return ERR_PTR(-ENOENT); + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + d_add(dentry, ino); + return NULL; +} + +static int bc_file_open(struct inode *ino, struct file *filp) +{ + struct bc_proc_entry *de; + struct user_beancounter *ub; + + de = (struct bc_proc_entry *)ino->i_private; + ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata; + BUG_ON(ub->ub_magic != UB_MAGIC); + + /* + * ub can't disappear: we hold d_parent, he holds the beancounter + */ + return single_open(filp, de->u.show, ub); +} + +static struct file_operations bc_file_ops = { + .open = bc_file_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int bc_looktest_entry(struct inode *ino, void *data) +{ + return ino->i_fop == &bc_file_ops && ino->i_private == data; +} + +static int bc_lookset_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->i_private = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = &bc_file_ops, + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_entries, + bc_looktest_entry, bc_lookset_entry); +} + +static int bc_looktest_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + return ino->i_fop == de->u.fops && ino->i_private == data; +} + +static int bc_lookset_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->i_private = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = de->u.fops; + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_root_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_root_entries, + bc_looktest_root_entry, bc_lookset_root_entry); +} + +/* + * /proc/bc/.../ directory operations + */ + +static int bc_entry_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, + (struct user_beancounter *)file->f_dentry->d_fsdata); +} + +static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *par, *ub; + struct dentry *de; + + if 
(!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '.') + return ERR_PTR(-ENOENT); + + par = (struct user_beancounter *)dir->i_private; + if (par->ub_uid != id) + return ERR_PTR(-ENOENT); + + id = simple_strtol(end + 1, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_subbeancounter_byid(par, id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_entry_fops = { + .read = generic_read_dir, + .readdir = bc_entry_readdir, +}; + +static struct inode_operations bc_entry_iops = { + .lookup = bc_entry_lookup, +}; + +/* + * /proc/bc directory operations + */ + +static int bc_root_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, NULL); +} + +static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_root_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_beancounter_byuid(id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_root_fops = { + .read = generic_read_dir, + .readdir = bc_root_readdir, +}; + +static struct inode_operations bc_root_iops = { + .lookup = bc_root_lookup, +}; + +static int __init ub_init_proc(void) +{ + struct proc_dir_entry *entry; + + bc_proc_root = create_proc_entry("bc", + S_IFDIR | S_IRUGO | S_IXUGO, NULL); + if (bc_proc_root == NULL) + panic("Can't create /proc/bc entry"); + + bc_proc_root->proc_fops = &bc_root_fops; + bc_proc_root->proc_iops = &bc_root_iops; + + bc_register_proc_entry(&bc_resources_entry); +#ifdef CONFIG_UBC_DEBUG + bc_register_proc_entry(&bc_debug_entry); +#endif + bc_register_proc_root_entry(&bc_all_resources_entry); + + entry = proc_create("user_beancounters", + S_IRUGO, &glob_proc_root, &ub_file_operations); + return 0; +} + +core_initcall(ub_init_proc); diff --git a/kernel/bc/rss_pages.c b/kernel/bc/rss_pages.c new file mode 100644 index 0000000..391585e --- /dev/null +++ b/kernel/bc/rss_pages.c @@ -0,0 +1,437 @@ +/* + * kernel/bc/rss_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
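One point worth stating before the code: page charges in this file are not confined to a single beancounter, they walk the whole parent chain. A sketch of what inc_held_pages(), defined below, does (a paraphrase of the existing behaviour, not new code):

	inc_held_pages(sub, n):
		for (ub = sub; ub != NULL; ub = ub->parent) {
			spin_lock_irqsave(&ub->ub_lock, flags);
			ub->ub_held_pages += n;
			set_held_pages(ub);	/* re-derives UB_PHYSPAGES,
						   UB_OOMGUARPAGES and
						   UB_PRIVVMPAGES */
			spin_unlock_irqrestore(&ub->ub_lock, flags);
		}

	so charging one page to a sub-beancounter is visible in every ancestor
	up to the top of the tree, and dec_held_pages() undoes it symmetrically.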
+ * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct kmem_cache *pb_cachep; +spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; +static struct page_beancounter **pb_hash_table; +static unsigned int pb_hash_mask; + +/* + * Auxiliary staff + */ + +static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.next, struct page_beancounter, + page_list); +} + +static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.prev, struct page_beancounter, + page_list); +} + +/* + * Held pages manipulation + */ +static inline void set_held_pages(struct user_beancounter *bc) +{ + /* all three depend on ub_held_pages */ + __ub_update_physpages(bc); + __ub_update_oomguarpages(bc); + __ub_update_privvm(bc); +} + +static inline void do_dec_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages -= value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void dec_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_dec_held_pages(ub, value); +} + +static inline void do_inc_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages += value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void inc_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_inc_held_pages(ub, value); +} + +/* + * Alloc - free + */ + +inline int pb_alloc(struct page_beancounter **pbc) +{ + *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL); + if (*pbc != NULL) { + (*pbc)->next_hash = NULL; + (*pbc)->pb_magic = PB_MAGIC; + } + return (*pbc == NULL); +} + +inline void pb_free(struct page_beancounter **pb) +{ + if (*pb != NULL) { + kmem_cache_free(pb_cachep, *pb); + *pb = NULL; + } +} + +void pb_free_list(struct page_beancounter **p_pb) +{ + struct page_beancounter *list, *pb; + + list = *p_pb; + if (list == PBC_COPY_SAME) + return; + + while (list) { + pb = list; + list = list->next_hash; + pb_free(&pb); + } + *p_pb = NULL; +} + +/* + * head -> -> -> ... + */ +static int __alloc_list(struct page_beancounter **head, int num) +{ + struct page_beancounter *pb; + + while (num > 0) { + if (pb_alloc(&pb)) + return -1; + pb->next_hash = *head; + *head = pb; + num--; + } + + return num; +} + +/* + * Ensure that the list contains at least num elements. + * p_pb points to an initialized list, may be of the zero length. + * + * mm->page_table_lock should be held + */ +int pb_alloc_list(struct page_beancounter **p_pb, int num) +{ + struct page_beancounter *list; + + for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); + if (!num) + return 0; + + /* + * *p_pb(after) *p_pb (before) + * \ \ + * -...-> -> ... 
+ */ + if (__alloc_list(p_pb, num) < 0) + goto nomem; + return 0; + +nomem: + pb_free_list(p_pb); + return -ENOMEM; +} + +/* + * Allocates a page_beancounter for each + * user_beancounter in a hash + */ +int pb_alloc_all(struct page_beancounter **pbs) +{ + int need_alloc; + struct user_beancounter *ub; + + need_alloc = 0; + rcu_read_lock(); + for_each_beancounter(ub) + need_alloc++; + rcu_read_unlock(); + + if (!__alloc_list(pbs, need_alloc)) + return 0; + + pb_free_list(pbs); + return -ENOMEM; +} + +/* + * Hash routines + */ + +static inline int pb_hash(struct user_beancounter *ub, struct page *page) +{ + return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; +} + +/* pb_lock should be held */ +static inline void insert_pb(struct page_beancounter *p, struct page *page, + struct user_beancounter *ub, int hash) +{ + p->page = page; + p->ub = get_beancounter(ub); + p->next_hash = pb_hash_table[hash]; + pb_hash_table[hash] = p; + inc_pbc_count(ub); +} + +/* + * Heart + */ + +static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, + int hash) +{ + struct page_beancounter *p; + + for (p = pb_hash_table[hash]; + p != NULL && (p->page != page || p->ub != bc); + p = p->next_hash); + if (p == NULL) + return -1; + + PB_COUNT_INC(p->refcount); + return 0; +} + +static void __pb_add_ref(struct page *page, struct user_beancounter *bc, + struct page_beancounter **ppb, int hash) +{ + struct page_beancounter *head, *p, **hp; + int shift; + + p = *ppb; + *ppb = p->next_hash; + + insert_pb(p, page, bc, hash); + hp = page_pblist(page); + head = *hp; + + if (head != NULL) { + /* + * Move the first element to the end of the list. + * List head (pb_head) is set to the next entry. + * Note that this code works even if head is the only element + * on the list (because it's cyclic). + */ + BUG_ON(head->pb_magic != PB_MAGIC); + *hp = next_page_pb(head); + PB_SHIFT_INC(head->refcount); + shift = PB_SHIFT_GET(head->refcount); + /* + * Update user beancounter, the share of head has been changed. + * Note that the shift counter is taken after increment. 
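+		 *
+		 * In effect head loses exactly half of its previous share
+		 * (i.e. UB_PAGE_WEIGHT >> shift), and the same amount is
+		 * charged to the new mapper below, so the shares of all
+		 * mappers of a page always sum to UB_PAGE_WEIGHT: e.g. for
+		 * three mappers the shares are 1/2 + 1/4 + 1/4 of a page.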
+ */ + dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); + /* add the new page beancounter to the end of the list */ + head = *hp; + list_add_tail(&p->page_list, &head->page_list); + } else { + *hp = p; + shift = 0; + INIT_LIST_HEAD(&p->page_list); + } + + p->refcount = PB_REFCOUNT_MAKE(shift, 1); + /* update user beancounter for the new page beancounter */ + inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); +} + +void pb_add_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (__pb_dup_ref(page, bc, hash)) + __pb_add_ref(page, bc, p_pb, hash); + spin_unlock(&pb_lock); +} + +void pb_dup_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (*page_pblist(page) == NULL) + /* + * pages like ZERO_PAGE must not be accounted in pbc + * so on fork we just skip them + */ + goto out_unlock; + + if (unlikely(*p_pb != PBC_COPY_SAME)) + __pb_add_ref(page, bc, p_pb, hash); + else if (unlikely(__pb_dup_ref(page, bc, hash))) + WARN_ON(1); +out_unlock: + spin_unlock(&pb_lock); +} + +void pb_remove_ref(struct page *page, struct mm_struct *mm) +{ + int hash; + struct user_beancounter *bc; + struct page_beancounter *p, **q, *f; + int shift, shiftt; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + for (q = pb_hash_table + hash, p = *q; + p != NULL && (p->page != page || p->ub != bc); + q = &p->next_hash, p = *q); + if (p == NULL) + goto out_unlock; + + PB_COUNT_DEC(p->refcount); + if (PB_COUNT_GET(p->refcount)) + /* + * More references from the same user beancounter exist. + * Nothing needs to be done. + */ + goto out_unlock; + + /* remove from the hash list */ + f = p; + *q = p->next_hash; + + shift = PB_SHIFT_GET(p->refcount); + + dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); + + q = page_pblist(page); + if (*q == p) { + if (list_empty(&p->page_list)) { + *q = NULL; + goto out_free; + } + + *q = next_page_pb(p); + } + list_del(&p->page_list); + + /* Now balance the list. Move the tail and adjust its shift counter. */ + p = prev_page_pb(*q); + shiftt = PB_SHIFT_GET(p->refcount); + *q = p; + PB_SHIFT_DEC(p->refcount); + + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + + /* + * If the shift counter of the moved beancounter is different from the + * removed one's, repeat the procedure for one more tail beancounter + */ + if (shiftt > shift) { + p = prev_page_pb(*q); + *q = p; + PB_SHIFT_DEC(p->refcount); + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + } +out_free: + dec_pbc_count(f->ub); + spin_unlock(&pb_lock); + + put_beancounter(f->ub); + pb_free(&f); + return; + +out_unlock: + spin_unlock(&pb_lock); +} + +struct user_beancounter *pb_grab_page_ub(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *ub; + + spin_lock(&pb_lock); + pb = *page_pblist(page); + ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : + get_beancounter(pb->ub)); + spin_unlock(&pb_lock); + return ub; +} + +void __init ub_init_pbc(void) +{ + unsigned long hash_size; + + pb_cachep = kmem_cache_create("page_beancounter", + sizeof(struct page_beancounter), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + hash_size = num_physpages >> 2; + for (pb_hash_mask = 1; + (hash_size & pb_hash_mask) != hash_size; + pb_hash_mask = (pb_hash_mask << 1) + 1); + hash_size = pb_hash_mask + 1; + printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); + pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); + memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); + + ub_init_io(pb_cachep); +} diff --git a/kernel/bc/statd.c b/kernel/bc/statd.c new file mode 100644 index 0000000..bf6354b --- /dev/null +++ b/kernel/bc/statd.c @@ -0,0 +1,453 @@ +/* + * kernel/bc/statd.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(ubs_notify_list); +static long ubs_min_interval; +static ubstattime_t ubs_start_time, ubs_end_time; +static struct timer_list ubs_timer; + +static int ubstat_get_list(void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub, *ubp; + long *page, *ptr, *end; + int len; + + page = (long *)__get_free_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + retval = 0; + ubp = NULL; + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + *ptr++ = ub->ub_uid; + if (ptr != end) + continue; + + get_beancounter(ub); + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + ubp = ub; + + len = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (copy_to_user(buf, page, len)) { + retval = -EFAULT; + goto out_put; + } + retval += len; + if (len < PAGE_SIZE) + goto out_put; + buf += len; + size -= len; + + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + } + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + size = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (size > 0 && copy_to_user(buf, page, size)) { + retval = -EFAULT; + goto out_put; + } + retval += size; + +out_put: + put_beancounter(ubp); + free_page((unsigned long)page); + return retval; +} + +static int ubstat_gettime(void __user *buf, long size) +{ + ubgettime_t data; + int retval; + + spin_lock(&ubs_notify_lock); + data.start_time = ubs_start_time; + data.end_time = ubs_end_time; + data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; + spin_unlock(&ubs_notify_lock); + + retval = min_t(long, sizeof(data), size); + if (copy_to_user(buf, &data, retval)) + retval = -EFAULT; + return retval; +} + +static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) +{ + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[1]; + } *data; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + + data->param[0].maxheld = ub->ub_store[res].maxheld; + data->param[0].failcnt = ub->ub_store[res].failcnt; + + return sizeof(*data); +} + +static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size) +{ + int wrote; + struct { + ubstattime_t start_time; + 
ubstattime_t end_time; + ubstatparm_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + data->param[resource].maxheld = ub->ub_store[resource].maxheld; + data->param[resource].failcnt = ub->ub_store[resource].failcnt; + wrote += sizeof(data->param[resource]); + } + + return wrote; +} + +static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, + int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + /* The beginning of ubstatparmf_t matches struct ubparm. */ + memcpy(&data->param[resource], &ub->ub_store[resource], + sizeof(ub->ub_store[resource])); + data->param[resource].__unused1 = 0; + data->param[resource].__unused2 = 0; + wrote += sizeof(data->param[resource]); + } + return wrote; +} + +static int ubstat_get_stat(struct user_beancounter *ub, long cmd, + void __user *buf, long size) +{ + void *kbuf; + int retval; + + kbuf = (void *)__get_free_page(GFP_KERNEL); + if (kbuf == NULL) + return -ENOMEM; + + spin_lock(&ubs_notify_lock); + switch (UBSTAT_CMD(cmd)) { + case UBSTAT_READ_ONE: + retval = -EINVAL; + if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) + break; + retval = ubstat_do_read_one(ub, + UBSTAT_PARMID(cmd), kbuf); + break; + case UBSTAT_READ_ALL: + retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); + break; + case UBSTAT_READ_FULL: + retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); + break; + default: + retval = -EINVAL; + } + spin_unlock(&ubs_notify_lock); + + if (retval > 0) { + retval = min_t(long, retval, size); + if (copy_to_user(buf, kbuf, retval)) + retval = -EFAULT; + } + + free_page((unsigned long)kbuf); + return retval; +} + +static int ubstat_handle_notifrq(ubnotifrq_t *req) +{ + int retval; + struct ub_stat_notify *new_notify; + struct list_head *entry; + struct task_struct *tsk_to_free; + + new_notify = kmalloc(sizeof(new_notify), GFP_KERNEL); + if (new_notify == NULL) + return -ENOMEM; + + tsk_to_free = NULL; + INIT_LIST_HEAD(&new_notify->list); + + spin_lock(&ubs_notify_lock); + list_for_each(entry, &ubs_notify_list) { + struct ub_stat_notify *notify; + + notify = list_entry(entry, struct ub_stat_notify, list); + if (notify->task == current) { + kfree(new_notify); + new_notify = notify; + break; + } + } + + retval = -EINVAL; + if (req->maxinterval < 1) + goto out_unlock; + if (req->maxinterval > TIME_MAX_SEC) + req->maxinterval = TIME_MAX_SEC; + if (req->maxinterval < ubs_min_interval) { + unsigned long dif; + + ubs_min_interval = req->maxinterval; + dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ; + if (dif > req->maxinterval) + mod_timer(&ubs_timer, + ubs_timer.expires - + (dif - req->maxinterval) * HZ); + } + + if (entry != &ubs_notify_list) { + list_del(&new_notify->list); + tsk_to_free = new_notify->task; + } + if (req->signum) { + new_notify->task = current; + get_task_struct(new_notify->task); + new_notify->signum = req->signum; + list_add(&new_notify->list, &ubs_notify_list); + } else + kfree(new_notify); + retval = 
0; +out_unlock: + spin_unlock(&ubs_notify_lock); + if (tsk_to_free != NULL) + put_task_struct(tsk_to_free); + return retval; +} + +/* + * former sys_ubstat + */ +long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub; + + if (func == UBSTAT_UBPARMNUM) + return UB_RESOURCES; + if (func == UBSTAT_UBLIST) + return ubstat_get_list(buf, size); + if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + if (func == UBSTAT_GETTIME) { + retval = ubstat_gettime(buf, size); + goto notify; + } + + ub = get_exec_ub(); + if (ub != NULL && ub->ub_uid == arg1) + get_beancounter(ub); + else /* FIXME must be if (ve_is_super) */ + ub = get_beancounter_byuid(arg1, 0); + + if (ub == NULL) + return -ESRCH; + + retval = ubstat_get_stat(ub, func, buf, size); + put_beancounter(ub); +notify: + /* Handle request for notification */ + if (retval >= 0) { + ubnotifrq_t notifrq; + int err; + + err = -EFAULT; + if (!copy_from_user(¬ifrq, (void __user *)arg2, + sizeof(notifrq))) + err = ubstat_handle_notifrq(¬ifrq); + if (err) + retval = err; + } + + return retval; +} + +static void ubstat_save_onestat(struct user_beancounter *ub) +{ + int resource; + + /* called with local irq disabled */ + spin_lock(&ub->ub_lock); + for (resource = 0; resource < UB_RESOURCES; resource++) { + memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], + sizeof(struct ubparm)); + ub->ub_parms[resource].minheld = + ub->ub_parms[resource].maxheld = + ub->ub_parms[resource].held; + } + spin_unlock(&ub->ub_lock); +} + +static void ubstat_save_statistics(void) +{ + unsigned long flags; + struct user_beancounter *ub; + + local_irq_save(flags); + for_each_beancounter (ub) + ubstat_save_onestat(ub); + local_irq_restore(flags); +} + +static void ubstatd_timeout(unsigned long __data) +{ + struct task_struct *p; + + p = (struct task_struct *) __data; + wake_up_process(p); +} + +/* + * Safe wrapper for send_sig. It prevents a race with release_task + * for sighand. + * Should be called under tasklist_lock. 
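+ * (holding tasklist_lock keeps release_task() from freeing ->sighand
+ * under us, which is what makes the check below sufficient)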
+ */ +static void task_send_sig(struct ub_stat_notify *notify) +{ + if (likely(notify->task->sighand != NULL)) + send_sig(notify->signum, notify->task, 1); +} + +static inline void do_notifies(void) +{ + LIST_HEAD(notif_free_list); + struct ub_stat_notify *notify; + struct ub_stat_notify *tmp; + + spin_lock(&ubs_notify_lock); + ubs_start_time = ubs_end_time; + /* + * the expression below relies on time being unsigned long and + * arithmetic promotion rules + */ + ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; + mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); + ubs_min_interval = TIME_MAX_SEC; + /* save statistics accumulated for the interval */ + ubstat_save_statistics(); + /* send signals */ + read_lock(&tasklist_lock); + while (!list_empty(&ubs_notify_list)) { + notify = list_entry(ubs_notify_list.next, + struct ub_stat_notify, list); + task_send_sig(notify); + list_del(¬ify->list); + list_add(¬ify->list, ¬if_free_list); + } + read_unlock(&tasklist_lock); + spin_unlock(&ubs_notify_lock); + + list_for_each_entry_safe(notify, tmp, ¬if_free_list, list) { + put_task_struct(notify->task); + kfree(notify); + } +} + +/* + * Kernel thread + */ +static int ubstatd(void *unused) +{ + /* daemonize call will take care of signals */ + daemonize("ubstatd"); + + ubs_timer.data = (unsigned long)current; + ubs_timer.function = ubstatd_timeout; + add_timer(&ubs_timer); + + while (1) { + set_task_state(current, TASK_INTERRUPTIBLE); + if (time_after(ubs_timer.expires, jiffies)) { + schedule(); + try_to_freeze(); + continue; + } + + __set_task_state(current, TASK_RUNNING); + do_notifies(); + } + return 0; +} + +static int __init ubstatd_init(void) +{ + init_timer(&ubs_timer); + ubs_timer.expires = TIME_MAX_JIF; + ubs_min_interval = TIME_MAX_SEC; + ubs_start_time = ubs_end_time = 0; + + kernel_thread(ubstatd, NULL, 0); + return 0; +} + +module_init(ubstatd_init); diff --git a/kernel/bc/sys.c b/kernel/bc/sys.c new file mode 100644 index 0000000..798166b --- /dev/null +++ b/kernel/bc/sys.c @@ -0,0 +1,173 @@ +/* + * kernel/bc/sys.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include + +#include + +/* + * The (rather boring) getluid syscall + */ +asmlinkage long sys_getluid(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (ub == NULL) + return -EINVAL; + + return ub->ub_uid; +} + +/* + * The setluid syscall + */ +asmlinkage long sys_setluid(uid_t uid) +{ + struct user_beancounter *ub; + struct task_beancounter *task_bc; + int error; + + task_bc = ¤t->task_bc; + + /* You may not disown a setluid */ + error = -EINVAL; + if (uid == (uid_t)-1) + goto out; + + /* You may only set an ub as root */ + error = -EPERM; + if (!capable(CAP_SETUID)) + goto out; + /* + * The ub once set is irrevocable to all + * unless it's set from ve0. 
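+	 * In other words, a task already running inside a container
+	 * cannot re-assign itself to another beancounter; only the
+	 * host (VE0) environment may do that.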
+ */ + if (!ve_is_super(get_exec_env())) + goto out; + + /* Ok - set up a beancounter entry for this user */ + error = -ENOBUFS; + ub = get_beancounter_byuid(uid, 1); + if (ub == NULL) + goto out; + + ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " + "for %.20s pid %d\n", + ub, atomic_read(&ub->ub_refcount), + current->comm, current->pid); + /* install bc */ + error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub); + if (!(error & NOTIFY_FAIL)) { + put_beancounter(task_bc->exec_ub); + task_bc->exec_ub = ub; + if (!(error & NOTIFY_OK)) { + put_beancounter(task_bc->fork_sub); + task_bc->fork_sub = get_beancounter(ub); + } + error = 0; + } else { + put_beancounter(ub); + error = -ENOBUFS; + } +out: + return error; +} + +long do_setublimit(uid_t uid, unsigned long resource, + unsigned long *new_limits) +{ + int error; + unsigned long flags; + struct user_beancounter *ub; + + error = -EPERM; + if(!capable(CAP_SYS_RESOURCE)) + goto out; + + if (!ve_is_super(get_exec_env())) + goto out; + + error = -EINVAL; + if (resource >= UB_RESOURCES) + goto out; + + error = -EINVAL; + if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) + goto out; + + error = -ENOENT; + ub = get_beancounter_byuid(uid, 0); + if (ub == NULL) { + ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); + goto out; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[resource].barrier = new_limits[0]; + ub->ub_parms[resource].limit = new_limits[1]; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + put_beancounter(ub); + + error = 0; +out: + return error; +} + +/* + * The setbeanlimit syscall + */ +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, + unsigned long __user *limits) +{ + unsigned long new_limits[2]; + + if (copy_from_user(&new_limits, limits, sizeof(new_limits))) + return -EFAULT; + + return do_setublimit(uid, resource, new_limits); +} + +extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size); +asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + return do_ubstat(func, arg1, arg2, buf, size); +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_setublimit(uid_t uid, int resource, + unsigned int __user *limits) +{ + unsigned int u_new_limits[2]; + unsigned long new_limits[2]; + + if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) + return -EFAULT; + + new_limits[0] = u_new_limits[0]; + new_limits[1] = u_new_limits[1]; + + return do_setublimit(uid, resource, new_limits); +} + +asmlinkage long compat_sys_ubstat(int func, unsigned int arg1, + unsigned int arg2, compat_uptr_t *buf, long size) +{ + return sys_ubstat(func, arg1, arg2, buf, size); +} +#endif diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c new file mode 100644 index 0000000..e98134b --- /dev/null +++ b/kernel/bc/vm_pages.c @@ -0,0 +1,549 @@ +/* + * kernel/bc/vm_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
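+ *
+ * (editorial note, inferred from the code below) Helpers used by the
+ * core mm code to charge and uncharge privvmpages, physpages,
+ * shmpages, lockedpages and swap pages against user beancounters.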
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + if (!pte_none(*pte) && pte_present(*pte)) + (*ret)++; + } while (pte++, addr += PAGE_SIZE, (addr != end)); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + next = pages_in_pte_range(vma, pmd, addr, next, ret); + } while (pmd++, addr = next, (addr != end)); + + return addr; +} + +static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + next = pages_in_pmd_range(vma, pud, addr, next, ret); + } while (pud++, addr = next, (addr != end)); + + return addr; +} + +unsigned long pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + unsigned long ret; + + ret = 0; + BUG_ON(addr >= end); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = pages_in_pud_range(vma, pgd, addr, next, &ret); + } while (pgd++, addr = next, (addr != end)); + return ret; +} + +void __ub_update_physpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages + + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); + ub_adjust_maxheld(ub, UB_PHYSPAGES); +} + +void __ub_update_oomguarpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_OOMGUARPAGES].held = + ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; + ub_adjust_maxheld(ub, UB_OOMGUARPAGES); +} + +void __ub_update_privvm(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PRIVVMPAGES].held = + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) + + ub->ub_unused_privvmpages + + ub->ub_parms[UB_SHMPAGES].held; + ub_adjust_maxheld(ub, UB_PRIVVMPAGES); +} + +static inline int __charge_privvm_locked(struct user_beancounter *ub, + unsigned long s, enum ub_severity strict) +{ + if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) + return -ENOMEM; + + ub->ub_unused_privvmpages += s; + return 0; +} + +static void __unused_privvm_dec_locked(struct user_beancounter *ub, + long size) +{ + /* catch possible overflow */ + if (ub->ub_unused_privvmpages < size) { + uncharge_warn(ub, UB_UNUSEDPRIVVM, + size, ub->ub_unused_privvmpages); + size = ub->ub_unused_privvmpages; + } + ub->ub_unused_privvmpages -= size; + __ub_update_privvm(ub); +} + +void __ub_unused_privvm_dec(struct mm_struct *mm, long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, 
unsigned long count) +{ + if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + __ub_unused_privvm_dec(mm, count); +} + +void ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_unused_privvmpages += size; + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_protected_charge(struct mm_struct *mm, unsigned long size, + unsigned long newflags, struct vm_area_struct *vma) +{ + unsigned long flags; + struct file *file; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return PRIVVM_NO_CHARGE; + + flags = vma->vm_flags; + if (!((newflags ^ flags) & VM_WRITE)) + return PRIVVM_NO_CHARGE; + + file = vma->vm_file; + if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) + return PRIVVM_NO_CHARGE; + + if (flags & VM_WRITE) + return PRIVVM_TO_SHARED; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) + goto err; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_TO_PRIVATE; + +err: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_ERROR; +} + +int ub_memory_charge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file, int sv) +{ + struct user_beancounter *ub, *ubl; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + size >>= PAGE_SHIFT; + if (size > UB_MAXVALUE) + return -EINVAL; + + BUG_ON(sv != UB_SOFT && sv != UB_HARD); + + if (vm_flags & VM_LOCKED) { + if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) + goto out_err; + } + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ubl = top_beancounter(ub); + spin_lock_irqsave(&ubl->ub_lock, flags); + if (__charge_privvm_locked(ubl, size, sv)) + goto out_private; + spin_unlock_irqrestore(&ubl->ub_lock, flags); + } + return 0; + +out_private: + spin_unlock_irqrestore(&ubl->ub_lock, flags); + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); +out_err: + return -ENOMEM; +} + +void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + size >>= PAGE_SHIFT; + + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); + } +} + +int ub_locked_charge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + +int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = 
shi->shmi_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + + +static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_tmpfs_respages++; + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_inc(ub); +} + +static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + /* catch possible overflow */ + if (ub->ub_tmpfs_respages < size) { + uncharge_warn(ub, UB_TMPFSPAGES, + size, ub->ub_tmpfs_respages); + size = ub->ub_tmpfs_respages; + } + ub->ub_tmpfs_respages -= size; + /* update values what is the most interesting */ + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_sub(ub, size); +} + +int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) +{ + int ret; + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); + if (ret == 0) + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + return ret; +} + +void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +#ifdef CONFIG_BC_SWAP_ACCOUNTING +static inline void do_ub_swapentry_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_swap_pages++; + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, + struct user_beancounter *ub) +{ + si->swap_ubs[num] = get_beancounter(ub); + for (; ub != NULL; ub = ub->parent) + do_ub_swapentry_inc(ub); +} +EXPORT_SYMBOL(ub_swapentry_inc); + +static inline void do_ub_swapentry_dec(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_swap_pages <= 0) + uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); + else + ub->ub_swap_pages--; + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter *ub, *ubp; + + ub = si->swap_ubs[num]; + si->swap_ubs[num] = NULL; + for (ubp = ub; ubp != NULL; ubp = ubp->parent) + do_ub_swapentry_dec(ubp); + put_beancounter(ub); +} +EXPORT_SYMBOL(ub_swapentry_dec); + +int ub_swap_init(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter **ubs; + + ubs = vmalloc(num * sizeof(struct user_beancounter *)); + if (ubs == NULL) + return -ENOMEM; + + memset(ubs, 0, num * 
sizeof(struct user_beancounter *)); + si->swap_ubs = ubs; + return 0; +} + +void ub_swap_fini(struct swap_info_struct *si) +{ + if (si->swap_ubs) { + vfree(si->swap_ubs); + si->swap_ubs = NULL; + } +} +#endif + +static int vmguar_enough_memory(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + + if (event != VIRTINFO_ENOUGHMEM) + return old_ret; + /* + * If it's a kernel thread, don't care about it. + * Added in order aufsd to run smoothly over ramfs. + */ + if (!current->mm) + return NOTIFY_DONE; + + ub = top_beancounter(current->mm->mm_ub); + if (ub->ub_parms[UB_PRIVVMPAGES].held > + ub->ub_parms[UB_VMGUARPAGES].barrier) + return old_ret; + + return NOTIFY_OK; +} + +static struct vnotifier_block vmguar_notifier_block = { + .notifier_call = vmguar_enough_memory +}; + +static int __init init_vmguar_notifier(void) +{ + virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); + return 0; +} + +static void __exit fini_vmguar_notifier(void) +{ + virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); +} + +module_init(init_vmguar_notifier); +module_exit(fini_vmguar_notifier); + +#ifdef CONFIG_PROC_FS +static int bc_vmaux_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + unsigned long swap, unmap; + int i; + + ub = seq_beancounter(f); + + swap = unmap = 0; + for_each_online_cpu(i) { + swap += per_cpu_ptr(ub->ub_percpu, i)->swapin; + unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap; + } + + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM], + ub->ub_unused_privvmpages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], + ub->ub_tmpfs_respages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES], + ub->ub_swap_pages); + + seq_printf(f, bc_proc_lu_fmt, "swapin", swap); + seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); + return 0; +} +static struct bc_proc_entry bc_vmaux_entry = { + .name = "vmaux", + .u.show = bc_vmaux_show, +}; + +static int __init bc_vmaux_init(void) +{ + bc_register_proc_entry(&bc_vmaux_entry); + return 0; +} + +late_initcall(bc_vmaux_init); +#endif diff --git a/kernel/capability.c b/kernel/capability.c index 33e51e7..9c6a925 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -19,7 +19,8 @@ * This lock protects task->cap_* for all tasks including current. * Locking rule: acquire this prior to tasklist_lock. 
*/ -static DEFINE_SPINLOCK(task_capability_lock); +DEFINE_SPINLOCK(task_capability_lock); +EXPORT_SYMBOL(task_capability_lock); /* * Leveraged for setting/resetting capabilities @@ -167,7 +168,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, pgrp = find_vpid(pgrp_nr); do_each_pid_task(pgrp, PIDTYPE_PGID, g) { target = g; - while_each_thread(g, target) { + while_each_thread_ve(g, target) { if (!security_capset_check(target, effective, inheritable, permitted)) { security_capset_set(target, effective, @@ -201,7 +202,7 @@ static inline int cap_set_all(kernel_cap_t *effective, spin_lock(&task_capability_lock); read_lock(&tasklist_lock); - do_each_thread(g, target) { + do_each_thread_ve(g, target) { if (target == current || is_container_init(target->group_leader)) continue; @@ -211,7 +212,7 @@ static inline int cap_set_all(kernel_cap_t *effective, continue; ret = 0; security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); + } while_each_thread_ve(g, target); read_unlock(&tasklist_lock); spin_unlock(&task_capability_lock); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0123d7..8412865 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1773,7 +1773,7 @@ static void cgroup_enable_task_cg_lists(void) struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); /* * We should check if the process is exiting, otherwise @@ -1783,7 +1783,7 @@ static void cgroup_enable_task_cg_lists(void) if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); write_unlock(&css_set_lock); } @@ -2873,9 +2873,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, again: root = subsys->root; if (root == &rootnode) { - printk(KERN_INFO - "Not cloning cgroup for unused subsystem %s\n", - subsys->name); mutex_unlock(&cgroup_mutex); return 0; } diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c new file mode 100644 index 0000000..dd89b3b --- /dev/null +++ b/kernel/cgroup_lite.c @@ -0,0 +1,226 @@ +/* + * lite cgroups engine + */ + +#include +#include +#include +#include +#include +#include + +#define SUBSYS(_x) &_x ## _subsys, + +static struct cgroup_subsys *subsys[] = { +#include +}; + +static struct css_set init_css_set; +static struct cgroup init_cgroup; +static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT]; + +static int init_css_set_subsystems(struct cgroup *g, struct css_set *set) +{ + int i; + struct cgroup_subsys_state *ss; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + + ss = cs->create(cs, g); + if (IS_ERR(ss)) + goto destroy; + + g->subsys[i] = ss; + set->subsys[i] = ss; + atomic_set(&ss->refcnt, 0); + ss->cgroup = g; + } + return 0; + +destroy: + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + + if (g->subsys[i]) + cs->destroy(cs, g); + } + return PTR_ERR(ss); +} + +int init_ve_cgroups(struct ve_struct *ve) +{ + int err = -ENOMEM; + struct cgroup *g; + struct css_set *cs; + + g = kzalloc(sizeof(struct cgroup), GFP_KERNEL); + if (g == NULL) + goto err_galloc; + + cs = kzalloc(sizeof(struct css_set), GFP_KERNEL); + if (cs == NULL) + goto err_calloc; + + g->parent = &init_cgroup; + err = init_css_set_subsystems(g, cs); + if (err) + goto err_subsys; + + g->parent = &init_cgroup; + ve->ve_cgroup = g; + ve->ve_css_set = cs; + 
return 0; + +err_subsys: + kfree(cs); +err_calloc: + kfree(g); +err_galloc: + return err; +} +EXPORT_SYMBOL(init_ve_cgroups); + +void fini_ve_cgroups(struct ve_struct *ve) +{ + int i; + struct cgroup *g = ve->ve_cgroup; + struct css_set *css = ve->ve_css_set; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + struct cgroup_subsys_state *ss = css->subsys[i]; + + BUG_ON(ss != g->subsys[i]); + + if (cs->pre_destroy) + cs->pre_destroy(cs, g); + + if (atomic_read(&ss->refcnt)) + printk(KERN_ERR "CG: leaking %d/%s subsys\n", + ve->veid, subsys[i]->name); + else + cs->destroy(cs, g); + } + + kfree(g); + kfree(css); + ve->ve_cgroup = NULL; + ve->ve_css_set = NULL; +} +EXPORT_SYMBOL(fini_ve_cgroups); + +/* + * task lifecycle + */ + +void cgroup_fork(struct task_struct *child) +{ + child->cgroups = current->cgroups; +} + +void cgroup_fork_callbacks(struct task_struct *child) +{ +} + +void cgroup_post_fork(struct task_struct *child) +{ +} + +void cgroup_exit(struct task_struct *tsk, int dummy) +{ + tsk->cgroups = &init_css_set; +} + +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + return -ENODATA; +} + +/* + * proc struts + */ + +static int proc_cgroup_show(struct seq_file *m, void *v) +{ + struct task_struct *tsk; + + tsk = pid_task((struct pid *)m->private, PIDTYPE_PID); + seq_printf(m, "%p\n", tsk->cgroups); + return 0; +} + +static int cgroup_open(struct inode *inode, struct file *file) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return single_open(file, proc_cgroup_show, PROC_I(inode)->pid); +} + +struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * cgroups misc struts + */ + +int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + const struct cftype cft[], int count) +{ + int idx = subsys->subsys_id; + static DEFINE_SPINLOCK(add_files_lock); + + if (unlikely(subsys_cftypes[idx] == NULL)) { + spin_lock(&add_files_lock); + if (subsys_cftypes[idx] == NULL) + subsys_cftypes[idx] = (struct cftype *)cft; + spin_unlock(&add_files_lock); + } + + BUG_ON(subsys_cftypes[idx] != cft); + return 0; +} + +void cgroup_lock(void) +{ +} + +void cgroup_unlock(void) +{ +} + +bool cgroup_lock_live_group(struct cgroup *cg) +{ + return 1; +} + + +int cgroup_is_removed(const struct cgroup *cgrp) +{ + return 0; +} + +int __init cgroup_init_early(void) +{ + int i; + + init_task.cgroups = &init_css_set; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) + BUG_ON(subsys[i]->early_init); + + return 0; +} + +int __init cgroup_init(void) +{ + get_ve0()->ve_cgroup = &init_cgroup; + get_ve0()->ve_css_set = &init_css_set; + if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0) + panic("CG: Can't init initial set\n"); + return 0; +} diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a..58506ef 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_nsec, &cts->tv_nsec)) ? 
-EFAULT : 0; } -static long compat_nanosleep_restart(struct restart_block *restart) +long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; struct timespec rmt; @@ -62,6 +63,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) return ret; } +EXPORT_SYMBOL_GPL(compat_nanosleep_restart); asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) diff --git a/kernel/cpt/Makefile b/kernel/cpt/Makefile new file mode 100644 index 0000000..d97cc31 --- /dev/null +++ b/kernel/cpt/Makefile @@ -0,0 +1,53 @@ +# +# +# kernel/cpt/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o + +vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ + cpt_mm.o cpt_files.o cpt_kernel.o \ + cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ + cpt_conntrack.o cpt_epoll.o + +vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ + rst_mm.o rst_files.o \ + rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ + rst_conntrack.o rst_epoll.o + +ifeq ($(CONFIG_BEANCOUNTERS), y) +vzcpt-objs += cpt_ubc.o +vzrst-objs += rst_ubc.o +endif + +ifeq ($(CONFIG_INOTIFY_USER), y) +vzcpt-objs += cpt_inotify.o +vzrst-objs += rst_inotify.o +endif + +vzrst-objs += cpt_exports.o + +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_obj.o cpt_kernel.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y) +vzcpt-objs += cpt_iterative.o +vzrst-objs += rst_iterative.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) +vzcpt-objs += cpt_pagein.o +vzrst-objs += rst_pagein.o +endif + +ifeq ($(CONFIG_X86_64), y) +vzcpt-objs += cpt_x8664.o +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_x8664.o +endif +endif diff --git a/kernel/cpt/cpt_conntrack.c b/kernel/cpt/cpt_conntrack.c new file mode 100644 index 0000000..19dcf32 --- /dev/null +++ b/kernel/cpt/cpt_conntrack.c @@ -0,0 +1,365 @@ +/* + * + * kernel/cpt/cpt_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +/* How does it work? + * + * Network is disabled, so new conntrack entries will not appear. + * However, some of them can disappear because of timeouts. + * + * So, we take read_lock, collect all required information atomically, + * essentially, creating parallel "refcount" structures holding pointers. + * We delete conntrack timers as well, so the structures cannot disappear + * after releasing the lock. Now, after releasing lock we can dump everything + * safely. And on exit we restore timers to their original values. + * + * Note, this approach is not going to work in VE0. 
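+ * (presumably because VE0 uses the host's own conntrack table and its
+ * network cannot be stopped, so entries would keep appearing while we
+ * dump)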
+ */ + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack_tuple_hash *cth; + int index; +}; + +static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple) +{ + v->cpt_dst = tuple->dst.ip; + v->cpt_dstport = tuple->dst.u.all; + v->cpt_protonum = tuple->dst.protonum; + v->cpt_dir = tuple->dst.dir; + + v->cpt_src = tuple->src.ip; + v->cpt_srcport = tuple->src.u.all; +} + +static int dump_one_expect(struct cpt_ip_connexpect_image *v, + struct ip_conntrack_expect *exp, + int sibling, cpt_context_t *ctx) +{ + int err = 0; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + encode_tuple(&v->cpt_tuple, &exp->tuple); + encode_tuple(&v->cpt_mask, &exp->mask); + v->cpt_sibling_conntrack = sibling; + v->cpt_flags = exp->flags; + v->cpt_seq = exp->id; + v->cpt_dir = 0; + v->cpt_manip_proto = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED + v->cpt_manip_proto = exp->saved_proto.all; + v->cpt_dir = exp->dir; +#endif + v->cpt_timeout = 0; + if (exp->master->helper->timeout) + v->cpt_timeout = exp->timeout.expires - jiffies; + return err; +} + +/* NOTE. We use one page to dump list of expectations. This may be not enough + * in theory. In practice there is only one expectation per conntrack record. + * Moreover, taking into account that _ALL_ of expecations are saved in one + * global list, which is looked up each incoming/outpging packet, the system + * would be severely dead when even one conntrack would have so much of + * expectations. Shortly, I am not going to repair this. + */ + +static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list, + cpt_context_t *ctx) +{ + int err = 0; + unsigned long pg; + struct cpt_ip_connexpect_image *v; + struct ip_conntrack_expect *exp; + + if (ct->expecting == 0) + return err; + if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE) + return -ENOBUFS; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return -ENOMEM; + v = (struct cpt_ip_connexpect_image *)pg; + + read_lock_bh(&ip_conntrack_lock); + list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) { + int sibling; + + if (exp->master != ct) + continue; + + if (ct->helper == NULL) { + eprintk_ctx("conntrack: no helper and non-trivial expectation\n"); + err = -EINVAL; + break; + } + + sibling = 0; +#if 0 + /* That's all? No need to calculate sibling? */ + if (exp->sibling) { + struct ct_holder *c; + for (c = list; c; c = c->next) { + if (tuplehash_to_ctrack(c->cth) == exp->sibling) { + sibling = c->index; + break; + } + } + /* NOTE: exp->sibling could be not "confirmed" and, hence, + * out of hash table. We should just ignore such a sibling, + * the connection is going to be retried, the packet + * apparently was lost somewhere. + */ + if (sibling == 0) + dprintk_ctx("sibling conntrack is not found\n"); + } +#endif + + /* If the expectation still does not have exp->sibling + * and timer is not running, it is about to die on another + * cpu. Skip it. 
*/ + if (!sibling && + ct->helper->timeout && + !timer_pending(&exp->timeout)) { + dprintk_ctx("conntrack: expectation: no timer\n"); + continue; + } + + err = dump_one_expect(v, exp, sibling, ctx); + if (err) + break; + + v++; + } + read_unlock_bh(&ip_conntrack_lock); + + if (err == 0 && (unsigned long)v != pg) + ctx->write((void*)pg, (unsigned long)v - pg, ctx); + + free_page(pg); + return err; +} + +static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, + cpt_context_t *ctx) +{ + struct ip_conntrack_tuple_hash *h = c->cth; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + struct cpt_ip_conntrack_image v; + int err = 0; + + if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_CONNTRACK; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + read_lock_bh(&ip_conntrack_lock); + v.cpt_status = ct->status; + v.cpt_timeout = ct->timeout.expires - jiffies; + v.cpt_ct_helper = (ct->helper != NULL); + v.cpt_index = c->index; + v.cpt_id = ct->id; + v.cpt_mark = 0; +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + v.cpt_mark = ct->mark; +#endif + encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); + encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); + memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); + memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); + + v.cpt_masq_index = 0; + v.cpt_initialized = 0; + v.cpt_num_manips = 0; + v.cpt_nat_helper = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + v.cpt_masq_index = ct->nat.masq_index; +#endif + /* "help" data is used by pptp, difficult to support */ + v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; + v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; + v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; + v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; + v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; + v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; +#endif + read_unlock_bh(&ip_conntrack_lock); + + ctx->write(&v, sizeof(v), ctx); + + err = dump_expect_list(ct, list, ctx); + + cpt_close_object(ctx); + return err; +} + +int cpt_dump_ip_conntrack(cpt_context_t * ctx) +{ + struct ct_holder *ct_list = NULL; + struct ct_holder *c, **cp; + int err = 0; + int index = 0; + int idx; + + if (get_exec_env()->_ip_conntrack == NULL) + return 0; + + for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) { + err = -ENOMEM; + goto done; + } + memset(c, 0, sizeof(struct ct_holder)); + c->next = ct_list; + ct_list = c; + } + + c = ct_list; + + read_lock_bh(&ip_conntrack_lock); + for (idx = 0; idx < ip_conntrack_htable_size; idx++) { + struct ip_conntrack_tuple_hash *h; + list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { + /* Skip reply tuples, they are covered by original + * direction. */ + if (DIRECTION(h)) + continue; + + /* Oops, we have not enough of holders... + * It is impossible. 
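+			 * (the list above was preallocated with one holder more
+			 * than _ip_conntrack_count and, with the network
+			 * stopped, the count can only go down, so running out
+			 * here would indicate a bug)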
*/ + if (unlikely(c == NULL)) { + read_unlock_bh(&ip_conntrack_lock); + eprintk_ctx("unexpected conntrack appeared\n"); + err = -ENOMEM; + goto done; + } + + /* If timer is not running, it means that it + * has just been scheduled on another cpu. + * We should skip this conntrack, it is about to be + * destroyed. */ + if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { + dprintk_ctx("conntrack: no timer\n"); + continue; + } + + /* Timer is deleted. refcnt is _not_ decreased. + * We are going to restore the timer on exit + * from this function. */ + c->cth = h; + c->index = ++index; + c = c->next; + } + } + read_unlock_bh(&ip_conntrack_lock); + + /* No conntracks? Good. */ + if (index == 0) + goto done; + + /* Comb the list a little. */ + cp = &ct_list; + while ((c = *cp) != NULL) { + /* Discard unused entries; they can appear, if some + * entries were timed out since we preallocated the list. + */ + if (c->cth == NULL) { + *cp = c->next; + kfree(c); + continue; + } + + /* Move conntracks attached to expectations to the beginning + * of the list. */ + if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { + *cp = c->next; + c->next = ct_list; + ct_list = c; + dprintk_ctx("conntrack: %d moved in list\n", c->index); + continue; + } + cp = &c->next; + } + + cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); + + for (c = ct_list; c; c = c->next) { + err = dump_one_ct(c, ct_list, ctx); + if (err) + goto done; + } + + cpt_close_section(ctx); + +done: + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->cth) { + /* Restore timer. refcnt is preserved. */ + add_timer(&tuplehash_to_ctrack(c->cth)->timeout); + } + kfree(c); + } + return err; +} + +#endif diff --git a/kernel/cpt/cpt_context.c b/kernel/cpt/cpt_context.c new file mode 100644 index 0000000..88c403d --- /dev/null +++ b/kernel/cpt/cpt_context.c @@ -0,0 +1,257 @@ +/* + * + * kernel/cpt/cpt_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +static void file_write(const void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? err : -EIO; +} + +static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? 
err : -EIO; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +void cpt_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->write = file_write; + ctx->pwrite = file_pwrite; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +int cpt_open_dumpfile(struct cpt_context *ctx) +{ + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + return -ENOMEM; + __cpt_release_buf(ctx); + return 0; +} + +int cpt_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + if (ctx->write_error) + eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); + return ctx->write_error; +} + +int cpt_major_hdr_out(struct cpt_context *ctx) +{ + struct cpt_major_hdr hdr; + + if (ctx->file == NULL) + return 0; + + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_image_version = CPT_VERSION_27; +#ifdef CONFIG_X86_64 + hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; +#elif defined(CONFIG_X86_32) + hdr.cpt_os_arch = CPT_OS_ARCH_I386; +#elif defined(CONFIG_IA64) + hdr.cpt_os_arch = CPT_OS_ARCH_IA64; +#else +#error Arch is not supported +#endif + hdr.cpt_ve_features = (__u32)ctx->features; + hdr.cpt_ve_features2 = (__u32)(ctx->features>>32); + hdr.cpt_pagesize = (__u16)PAGE_SIZE; + hdr.cpt_hz = HZ; + hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; + hdr.cpt_start_sec = ctx->start_time.tv_sec; + hdr.cpt_start_nsec = ctx->start_time.tv_nsec; + hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; + hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; + hdr.cpt_iptables_mask = ctx->iptables_mask; + + ctx->write(&hdr, sizeof(hdr), ctx); + return 0; +} + +int cpt_close_section(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_section >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_section; + ctx->pwrite(&next, 8, ctx, ctx->current_section); + ctx->current_section = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_section); + +int cpt_open_section(struct cpt_context *ctx, __u32 type) +{ + struct cpt_section_hdr hdr; + + if (ctx->file == NULL) + return 0; + + cpt_close_section(ctx); + + ctx->current_section = ctx->file->f_pos; + ctx->sections[type] = ctx->current_section; + + hdr.cpt_next = 0; + hdr.cpt_section = type; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_align = 0; + ctx->write(&hdr, sizeof(hdr), ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_section); + + +int cpt_close_object(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_object >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_object; + ctx->pwrite(&next, 8, ctx, ctx->current_object); + ctx->current_object = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_object); + +int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) +{ + if (ctx->file == NULL) + return 0; + + cpt_close_object(ctx); + + ctx->current_object = ctx->file->f_pos; + if (obj) + cpt_obj_setpos(obj, 
ctx->current_object, ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_object); + +int cpt_push_object(loff_t *saved, struct cpt_context *ctx) +{ + if (ctx->file) { + *saved = ctx->current_object; + ctx->current_object = ctx->file->f_pos; + } + return 0; +} +EXPORT_SYMBOL(cpt_push_object); + +int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) +{ + ctx->current_object = *saved; + return 0; +} +EXPORT_SYMBOL(cpt_pop_object); + +int cpt_dump_tail(struct cpt_context *ctx) +{ + struct cpt_major_tail hdr; + int i; + + if (ctx->file == NULL) + return 0; + + cpt_open_section(ctx, CPT_SECT_TRAILER); + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_next = sizeof(hdr); + hdr.cpt_object = CPT_OBJ_TRAILER; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_VOID; + hdr.cpt_lazypages = 0; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + hdr.cpt_lazypages = ctx->lazypages; +#endif + hdr.cpt_64bit = ctx->tasks64; + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_nsect = CPT_SECT_MAX_INDEX; + for (i = 0; i < CPT_SECT_MAX_INDEX; i++) + hdr.cpt_sections[i] = ctx->sections[i]; + + ctx->write(&hdr, sizeof(hdr), ctx); + cpt_close_section(ctx); + return 0; +} diff --git a/kernel/cpt/cpt_context.h b/kernel/cpt/cpt_context.h new file mode 100644 index 0000000..e4f82f9 --- /dev/null +++ b/kernel/cpt/cpt_context.h @@ -0,0 +1,215 @@ +#include +#include +#include + +#define CPT_CTX_ERROR -1 +#define CPT_CTX_IDLE 0 +#define CPT_CTX_SUSPENDING 1 +#define CPT_CTX_SUSPENDED 2 +#define CPT_CTX_DUMPING 3 +#define CPT_CTX_UNDUMPING 4 +#define CPT_CTX_UNDUMPED 5 + +#define CPT_TID(tsk) task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm +#define CPT_FID "%d,%d(%s)" + + +typedef struct cpt_context +{ + struct list_head ctx_list; + int refcount; + int ctx_state; + int objcount; + int sticky; + struct semaphore main_sem; + + struct file *errorfile; + struct file *statusfile; + struct file *lockfile; + + int errno; + char *error_msg; + loff_t err_offset; + + struct file *file; + char *tmpbuf; + int pagesize; +#ifdef CONFIG_VZ_CHECKPOINT_ITER + int iter_done; + void *iter_dir; + struct user_beancounter *iter_ub; +#endif + loff_t current_section; + loff_t current_object; + + loff_t sections[CPT_SECT_MAX]; + + __u32 errormask; + __u32 write_error; + + struct list_head object_array[CPT_OBJ_MAX]; + + void (*write)(const void *addr, size_t count, struct cpt_context *ctx); + void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); + ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + void (*align)(struct cpt_context *ctx); + int ve_id; + int contextid; + struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst + * corresponging to start_time */ + __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when + * VE did not migrate. */ + struct timespec start_time; + struct timespec delta_time; + __s64 delta_nsec; + int image_version; + __u16 image_arch; + __u64 iptables_mask; + __u64 features; + +#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) +#define CPT_ANONVMA_HSIZE (1<ve_id, ##arg) + +#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) +#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) + +#define eprintk(a...) cpt_printk(1, "CPT ERR: " a) +#define eprintk_ctx(f, arg...) 
\ +do { \ + eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ + if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ + ctx->err_offset += snprintf((char*)(ctx->error_msg + \ + ctx->err_offset), \ + PAGE_SIZE - ctx->err_offset, \ + "Error: " f, ##arg); \ +} while(0) + +#define CPT_TMPBUF_FREE 0x789adf12 +#define CPT_TMPBUF_BUSY 0xabcd9876 + +static inline void *cpt_get_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; + return buf; +} + +static inline void __cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_flush_error(cpt_context_t *ctx) +{ + mm_segment_t oldfs; + + if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { + if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { + oldfs = get_fs(); + set_fs(KERNEL_DS); + ctx->errorfile->f_op->write(ctx->errorfile, + ctx->error_msg, ctx->err_offset, + &ctx->errorfile->f_pos); + set_fs(oldfs); + } + ctx->error_msg[0] = 0; + ctx->err_offset = 0; + } +} diff --git a/kernel/cpt/cpt_dump.c b/kernel/cpt/cpt_dump.c new file mode 100644 index 0000000..f329506 --- /dev/null +++ b/kernel/cpt/cpt_dump.c @@ -0,0 +1,1250 @@ +/* + * + * kernel/cpt/cpt_dump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_net.h" +#include "cpt_socket.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + + +static int vps_child_level(struct task_struct *root, struct task_struct *c) +{ + int level = 0; + int veid = VE_TASK_INFO(c)->owner_env->veid; + + while (VE_TASK_INFO(c)->owner_env->veid == veid) { + if (c->pid != c->tgid) + c = c->group_leader; + if (c == root) + return level; + + c = c->parent; + level++; + } + return -1; +} + +static inline int freezable(struct task_struct * p) +{ + if (p->exit_state) + return 0; + + switch (p->state) { + case EXIT_ZOMBIE: + case EXIT_DEAD: + case TASK_STOPPED: +#if TASK_TRACED != TASK_STOPPED + case TASK_TRACED: +#endif + return 0; + default: + return 1; + } +} + +static void wake_ve(cpt_context_t *ctx) +{ + struct task_struct *p, *g; + + do_each_thread_ve(g, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->flags & PF_FROZEN) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread_ve(g, p); +} + +/* + * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... + * + * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context + * of another process. Apparently, it is unacceptable on SMP. + * Let's take freeze_processes() in kernel/power/process.c as an example. 
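+ * There the freezer rewrites tsk->flags of every task from its own
+ * context, with no lock held, while the target task may be updating
+ * the same word on another cpu.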
+ * Unserialized modifications tsk->flags easily + * (believe or not, but it happens with probability of almost 100% :-)) + * creates the situation when setting PF_FREEZE in freeze_processes(), + * which quickly spins raising PF_FREEZE of all the processes, + * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks. + * + * So, to make things clean, we require that those flags may be modified + * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE + * is just a kind of signal. + * + * It is not enough, because we are still not allowed to change tsk->flags + * in context of another process, we can corrupt another flags, when the process + * running on another cpu modifies them. So, we use TIF_FREEZE in thread flags, + * which can be changed atomically. + * + * PF_FROZEN also changes in context of another process, but this happens + * only when the process is already in refrigerator() which does not modify + * tsk->flags. + */ + +static int check_process_external(struct task_struct *p) +{ + if (pid_alive(p)) { + if (p->pids[PIDTYPE_PID].pid->level == 0) + return PIDTYPE_PID; + if (p->pids[PIDTYPE_PGID].pid->level == 0) + return PIDTYPE_PGID; + if (p->pids[PIDTYPE_SID].pid->level == 0) + return PIDTYPE_SID; + } + + return PIDTYPE_MAX; +} + +enum +{ + OBSTACLE_NOGO = -1, + OBSTACLE_TIMEOUT = -2, + OBSTACLE_TRYAGAIN = -3, +}; + +#define SUSPEND_TIMEOUT (10UL*HZ) + +static int vps_stop_tasks(struct cpt_context *ctx) +{ + unsigned long start_time = jiffies; + unsigned long target, timeout; + struct task_struct *p, *g; + int todo; + int round = 0; + + do_gettimespec(&ctx->start_time); + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup; + + read_lock(&tasklist_lock); + + atomic_inc(&get_exec_env()->suspend); + timeout = HZ/5; + target = jiffies + timeout; + + for(;;) { + struct task_struct *root; + todo = 0; + + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + atomic_dec(&get_exec_env()->suspend); + return -ESRCH; + } + + do_each_thread_ve(g, p) { + if (vps_child_level(root, p) >= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_PGID: + eprintk_ctx("external process group %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_pgrp_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_SID: + eprintk_ctx("external process session %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_session_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + if (p->vfork_done) { + /* Task between vfork()...exec() + * cannot be frozen, because parent + * wait in uninterruptible state. + * So, we do nothing, waiting for + * exec(), unless: + */ + if (p->state == TASK_STOPPED || + p->state == TASK_TRACED) { + eprintk_ctx("task " CPT_FID " is stopped while vfork(). " + "Checkpointing is impossible.\n", + CPT_TID(p)); + todo = OBSTACLE_NOGO; + /* It is fatal, _user_ stopped + * vfork()ing task, so that we + * cannot suspend now. 
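+					 * The parent sleeps uninterruptibly
+					 * in vfork() until the child execs
+					 * or exits, and a child stopped by
+					 * the user will never get there on
+					 * its own.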
+ */ + } else { + todo = OBSTACLE_TRYAGAIN; + } + goto out; + } + if (p->signal->group_exit_task && + p->signal->notify_count) { + /* exec() waits for threads' death */ + wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p)); + todo = OBSTACLE_TRYAGAIN; + goto out; + } + if (p->state == TASK_TRACED +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) + && !p->stopped_state +#endif + ) { + int ptrace_id = p->pn_state; + /* Debugger waits for signal. */ + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + case PN_STOP_ENTRY: + case PN_STOP_FORK: + case PN_STOP_VFORK: + case PN_STOP_SIGNAL: + case PN_STOP_EXIT: + case PN_STOP_LEAVE: + break; + default: + eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id); + todo = OBSTACLE_NOGO; + goto out; + } + } +#ifdef CONFIG_UTRACE + if (check_utrace(p, root, ctx)) { + eprintk_ctx("task " CPT_FID " is utraced. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } +#endif + if (p->flags & PF_NOFREEZE) { + eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } + + if (!freezable(p)) + continue; + + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_FROZEN)) { + set_tsk_thread_flag(p, TIF_FREEZE); + signal_wake_up(p, 0); + } + spin_unlock_irq(&p->sighand->siglock); + + if (p->flags & PF_FROZEN) { + if (p->state != TASK_UNINTERRUPTIBLE) + printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p)); + continue; + } + + if (round == 10) + wprintk_ctx(CPT_FID " is running\n", CPT_TID(p)); + + todo++; + } else { + if (p != current) { + eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), task_pid_nr(p), p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + } + } while_each_thread_ve(g, p); + + if (todo > 0) { + /* No visible obstacles, but VE did not freeze + * for timeout. Interrupt suspend, if it is major + * timeout or signal; if it is minor timeout + * we will wake VE and restart suspend. + */ + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) + || signal_pending(current)) + todo = OBSTACLE_TIMEOUT; + else if (time_after(jiffies, target)) + todo = OBSTACLE_TRYAGAIN; + } + +out: + if (todo < 0) { + atomic_dec(&get_exec_env()->suspend); + + wake_ve(ctx); + +#if 0 + /* This is sign of failure of printk(), which is not + * ours. So, no prefixes. */ + printk(">\n"); +#endif + } + + read_unlock(&tasklist_lock); + + if (!todo) { + atomic_dec(&get_exec_env()->suspend); + return 0; + } + + switch (todo) { + case OBSTACLE_NOGO: + eprintk_ctx("suspend is impossible now.\n"); + return -EAGAIN; + + case OBSTACLE_TIMEOUT: + eprintk_ctx("interrupted or timed out.\n"); + return -EINTR; + + case OBSTACLE_TRYAGAIN: + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) || + signal_pending(current)) { + wprintk_ctx("suspend timed out\n"); + return -EAGAIN; + } + + wprintk_ctx("minor suspend timeout (%lu) expired, " + "trying again\n", timeout); + + /* Try again. VE is awake, give it some time to run. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + + /* After a short wait restart suspend + * with longer timeout */ + atomic_inc(&get_exec_env()->suspend); + timeout = min(timeout<<1, SUSPEND_TIMEOUT); + target = jiffies + timeout; + break; + + default: + if (round > 0) { + /* VE is partially frozen, give processes + * a chance to enter to refrigerator(). 
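+			 * A short interruptible sleep is enough
+			 * here: the outer loop re-scans the whole
+			 * thread list on the next round anyway.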
*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/20); + } else { + yield(); + } + } + + read_lock(&tasklist_lock); + round++; + } +} + +static int cpt_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int cpt_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + cpt_unlock_sockets(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->flags & PF_FROZEN) { + tsk->flags &= ~PF_FROZEN; + wake_up_process(tsk); + } else if (freezable(tsk)) { + eprintk_ctx("strange, %s not frozen\n", tsk->comm ); + } + spin_unlock_irq(&tsk->sighand->siglock); + put_task_struct(tsk); + } + + cpt_resume_network(ctx); + + cpt_unlock_ve(ctx); + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + return 0; +} + +int cpt_kill(struct cpt_context *ctx) +{ + int err = 0; + struct ve_struct *env; + cpt_object_t *obj; + struct task_struct *root_task = NULL; + long delay; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + /* from here cpt_kill succeeds */ + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + if (current->ve_task_info.owner_env == env) { + wprintk_ctx("attempt to kill ve from inside, escaping...\n"); + ve_move_task(current, get_ve0()); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + cpt_kill_sockets(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk->exit_state) { + put_task_struct(tsk); + continue; + } + + if (task_pid_vnr(tsk) == 1) { + root_task = tsk; + continue; + } + + tsk->robust_list = NULL; +#ifdef CONFIG_COMPAT + tsk->compat_robust_list = NULL; +#endif + tsk->clear_child_tid = NULL; + + if (tsk->ptrace) { + write_lock_irq(&tasklist_lock); + tsk->ptrace = 0; + if (!list_empty(&tsk->ptrace_entry)) { + list_del_init(&tsk->ptrace_entry); + /* + * This code used to be here: + * remove_parent(tsk); + * tsk->parent = tsk->parent; + * add_parent(tsk); + */ + } + write_unlock_irq(&tasklist_lock); + } + + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + put_task_struct(tsk); + } + + yield(); + + if (root_task != NULL) { + send_sig(SIGKILL, root_task, 1); + + spin_lock_irq(&root_task->sighand->siglock); + sigfillset(&root_task->blocked); + sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(root_task, TIF_SIGPENDING); + clear_tsk_thread_flag(root_task, TIF_FREEZE); + if (root_task->flags & PF_FROZEN) + root_task->flags &= ~PF_FROZEN; + spin_unlock_irq(&root_task->sighand->siglock); + + wake_up_process(root_task); + put_task_struct(root_task); + } + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + + delay = 1; + while 
(atomic_read(&env->counter) != 1) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + delay = (delay < HZ) ? (delay << 1) : HZ; + schedule_timeout(delay); + } + put_ve(env); + + return err; +} + +#ifdef CONFIG_BEANCOUNTERS +static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = &(t->task_bc); + cpt_add_ubc(tbc->exec_ub, ctx); + cpt_add_ubc(tbc->task_ub, ctx); + cpt_add_ubc(tbc->fork_sub, ctx); +} +#else +static void inline collect_task_ubc(struct task_struct *t, + struct cpt_context *ctx) +{ return; } +#endif + +static cpt_object_t * remember_task(struct task_struct * child, + cpt_object_t * head, cpt_context_t * ctx) +{ + cpt_object_t *cobj; + + if (freezable(child) && !(child->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); + put_task_struct(child); + return NULL; + } + + if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); + if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(child); + return NULL; + } + cobj->o_count = 1; + cpt_obj_setobj(cobj, child, ctx); + insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); + collect_task_ubc(child, ctx); + return cobj; +} + +static int vps_collect_tasks(struct cpt_context *ctx) +{ + int err = -ESRCH; + cpt_object_t *obj; + struct task_struct *root; + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (root) + get_task_struct(root); + read_unlock(&tasklist_lock); + + if (!root) { + err = -ESRCH; + eprintk_ctx("vps_collect_tasks: cannot find root\n"); + goto out; + } + + if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(root); + return -ENOMEM; + } + obj->o_count = 1; + cpt_obj_setobj(obj, root, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + collect_task_ubc(root, ctx); + + /* Collect process subtree recursively */ + for_each_object(obj, CPT_OBJ_TASK) { + cpt_object_t *head = obj; + struct task_struct *tsk = obj->o_obj; + struct task_struct *child; + + if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); + err = -EINVAL; + goto out; + } + + if (tsk->state == TASK_RUNNING) + printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk)); + + wait_task_inactive(tsk, 0); + + err = check_task_state(tsk, ctx); + if (err) + goto out; + + if (tsk->pid == tsk->tgid) { + child = tsk; + for (;;) { + read_lock(&tasklist_lock); + child = next_thread(child); + if (child != tsk) + get_task_struct(child); + read_unlock(&tasklist_lock); + + if (child == tsk) + break; + + if (child->parent != tsk->parent) { + put_task_struct(child); + eprintk_ctx("illegal thread structure, kernel bug\n"); + err = -EINVAL; + goto out; + } + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + } + } + + /* About locking. VE is frozen. But lists of children + * may change at least for init, when entered task reparents + * to init and when reparented task exits. If we take care + * of this case, we still can unlock while scanning + * tasklists. 
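+		 * The pattern below: take tasklist_lock, pick the next
+		 * child, pin it with get_task_struct(), drop the lock
+		 * while the object is allocated and recorded, then
+		 * re-take the lock and continue from the same list
+		 * position.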
+ */ + read_lock(&tasklist_lock); + list_for_each_entry(child, &tsk->children, sibling) { + if (child->parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + + list_for_each_entry(child, &tsk->ptraced, ptrace_entry) { + if (child->parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + read_unlock(&tasklist_lock); + } + + return 0; + +out: + while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) { + struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next; + cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); + struct task_struct *tsk; + + list_del(head); + tsk = obj->o_obj; + put_task_struct(tsk); + free_cpt_object(obj, ctx); + } + return err; +} + +static int cpt_collect(struct cpt_context *ctx) +{ + int err; + + if ((err = cpt_collect_mm(ctx)) != 0) + return err; + + if ((err = cpt_collect_sysv(ctx)) != 0) + return err; + + if ((err = cpt_collect_files(ctx)) != 0) + return err; + + if ((err = cpt_collect_fs(ctx)) != 0) + return err; + + if ((err = cpt_collect_namespace(ctx)) != 0) + return err; + + if ((err = cpt_collect_signals(ctx)) != 0) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL) + return -ECHRNG; + + return 0; +} + +static int cpt_dump_veinfo(cpt_context_t *ctx) +{ + struct cpt_veinfo_image *i = cpt_get_buf(ctx); + struct ve_struct *ve; + struct timespec delta; + struct ipc_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_VEINFO); + cpt_open_object(NULL, ctx); + + memset(i, 0, sizeof(*i)); + + i->cpt_next = CPT_NULL; + i->cpt_object = CPT_OBJ_VEINFO; + i->cpt_hdrlen = sizeof(*i); + i->cpt_content = CPT_CONTENT_VOID; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + if (ns->shm_ctlall > 0xFFFFFFFFU) + i->shm_ctl_all = 0xFFFFFFFFU; + if (ns->shm_ctlmax > 0xFFFFFFFFU) + i->shm_ctl_max = 0xFFFFFFFFU; + i->shm_ctl_mni = ns->shm_ctlmni; + + i->msg_ctl_max = ns->msg_ctlmax; + i->msg_ctl_mni = ns->msg_ctlmni; + i->msg_ctl_mnb = ns->msg_ctlmnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + i->sem_ctl_arr[0] = ns->sem_ctls[0]; + i->sem_ctl_arr[1] = ns->sem_ctls[1]; + i->sem_ctl_arr[2] = ns->sem_ctls[2]; + i->sem_ctl_arr[3] = ns->sem_ctls[3]; + + do_posix_clock_monotonic_gettime(&delta); + _set_normalized_timespec(&delta, + delta.tv_sec - ve->start_timespec.tv_sec, + delta.tv_nsec - ve->start_timespec.tv_nsec); + i->start_timespec_delta = cpt_timespec_export(&delta); + i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; + + i->last_pid = ve->ve_ns->pid_ns->last_pid; + + ctx->write(i, sizeof(*i), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_dump_utsname(cpt_context_t *ctx) +{ + int len; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_UTSNAME); + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + cpt_open_object(NULL, ctx); + len = strlen(ns->name.nodename); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = 
CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.nodename, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_open_object(NULL, ctx); + len = strlen(ns->name.domainname); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.domainname, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_close_section(ctx); + return 0; +} + +#ifndef CONFIG_IA64 +static int cpt_dump_vsyscall(cpt_context_t *ctx) +{ + struct cpt_page_block *pgb = cpt_get_buf(ctx); + + cpt_open_section(ctx, CPT_SECT_VSYSCALL); + cpt_open_object(NULL, ctx); + + pgb->cpt_next = CPT_NULL; + pgb->cpt_object = CPT_OBJ_VSYSCALL; + pgb->cpt_hdrlen = sizeof(*pgb); + pgb->cpt_content = CPT_CONTENT_DATA; + pgb->cpt_start = cpt_ptr_export(vsyscall_addr); + pgb->cpt_end = pgb->cpt_start + PAGE_SIZE; + + ctx->write(pgb, sizeof(*pgb), ctx); + cpt_release_buf(ctx); + + ctx->write(vsyscall_addr, PAGE_SIZE, ctx); + + cpt_close_object(ctx); + cpt_close_section(ctx); + return 0; +} +#endif + +int cpt_dump(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err, err2 = 0; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + down_read(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + if (!env->is_locked) + goto out_noenv; + err = -EINVAL; + if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) { + printk(KERN_WARNING "CT: checkpointing not supported yet" + " for hidden pid namespaces.\n"); + goto out_noenv; + } + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 2: real checkpointing */ + err = cpt_open_dumpfile(ctx); + if (err) + goto out; + + cpt_major_hdr_out(ctx); + + if (!err) + err = cpt_dump_veinfo(ctx); + if (!err) + err = cpt_dump_ubc(ctx); + if (!err) + err = cpt_dump_files(ctx); + if (!err) + err = cpt_dump_files_struct(ctx); + if (!err) + err = cpt_dump_fs_struct(ctx); + /* netdevices should be dumped after dumping open files + as we need to restore netdevice binding to /dev/net/tun file */ + if (!err) + err = cpt_dump_ifinfo(ctx); + if (!err) + err = cpt_dump_namespace(ctx); + if (!err) + err = cpt_dump_sighand(ctx); + if (!err) + err = cpt_dump_vm(ctx); + if (!err) + err = cpt_dump_sysvsem(ctx); + if (!err) + err = cpt_dump_sysvmsg(ctx); + if (!err) + err = cpt_dump_tasks(ctx); + if (!err) + err = cpt_dump_orphaned_sockets(ctx); +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + if (!err) + err = cpt_dump_ip_conntrack(ctx); +#endif + if (!err) { + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL) + err = -ECHRNG; + } + if (!err) + err = cpt_dump_utsname(ctx); + +#ifndef CONFIG_IA64 + if (!err) + err = cpt_dump_vsyscall(ctx); +#endif + + if (!err) + err = cpt_dump_tail(ctx); + + err2 = cpt_close_dumpfile(ctx); + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); +out_noenv: + up_read(&env->op_sem); + put_ve(env); + return err ? 
: err2; +} + +int cpt_vps_suspend(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err = 0; + + ctx->kernel_config_flags = test_kernel_config(); + cpt_object_init(ctx); + + if (!ctx->ve_id) { + env = get_exec_env(); + if (env == get_ve0()) + return -EINVAL; + wprintk("undefined ve_id\n"); + ctx->ve_id = env->veid; + get_ve(env); + } else { + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + } + +#ifdef CONFIG_VE_IPTABLES + ctx->iptables_mask = env->_iptables_modules; +#endif + ctx->features = env->features; + + down_write(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + + err = -EBUSY; + if (env->is_locked) + goto out_noenv; + env->is_locked = 1; + downgrade_write(&env->op_sem); + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 0: find and stop all the tasks */ + if ((err = vps_stop_tasks(ctx)) != 0) + goto out; + + if ((err = cpt_suspend_network(ctx)) != 0) + goto out_wake; + + /* At the moment all the state is frozen. We do not need to lock + * the state, which can be changed only if the tasks are running. + */ + + /* Phase 1: collect task tree */ + if ((err = vps_collect_tasks(ctx)) != 0) + goto out_wake; + + /* Phase 1': collect all the resources */ + if ((err = cpt_collect(ctx)) != 0) + goto out; + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); + up_read(&env->op_sem); + put_ve(env); + return err; + +out_noenv: + up_write(&env->op_sem); + put_ve(env); + return err; + +out_wake: + read_lock(&tasklist_lock); + wake_ve(ctx); + read_unlock(&tasklist_lock); + goto out; +} + +static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + if (dev != net->loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + && dev->open != tun_net_open +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + *caps |= (1<flags & _TIF_IA32)) + *caps |= flags & ((1<mm && p->mm->context.vdso) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + *caps |= flags & (1<mm && p->mm->context.vdso) + *caps |= flags & (1<= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. 
vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<nsproxy) { + ns = p->nsproxy->mnt_ns; + if (ns) + get_mnt_ns(ns); + } + task_unlock(p); + if (ns) { + if (ns != current->nsproxy->mnt_ns) { + eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, virt_pid(p), p->comm); + *caps |= (1<list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + struct path p; + + p.dentry = mnt->mnt_root; + p.mnt = mnt; + path = __d_path(&p, &env->root_path, + path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name); + *caps |= (1<ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (env == NULL) + return -ESRCH; + + *caps = flags & (1<nsproxy; + current->nsproxy = env->ve_ns; + + check_unsupported_netdevices(ctx, caps); + + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + err = -ESRCH; + goto out; + } + get_task_struct(root); + for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) + check_one_process(ctx, caps, flags, env, root, p); + read_unlock(&tasklist_lock); + + task_lock(root); + n = NULL; + if (root->nsproxy) { + n = root->nsproxy->mnt_ns; + if (n) + get_mnt_ns(n); + } + task_unlock(root); + if (n) { + char *path_buf; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) { + put_mnt_ns(n); + err = -ENOMEM; + goto out_root; + } + + check_unsupported_mounts(ctx, caps, env, n, path_buf); + + free_page((unsigned long) path_buf); + put_mnt_ns(n); + } + + err = 0; + +out_root: + put_task_struct(root); +out: + current->nsproxy = old_ns; + set_exec_env(old_env); + put_ve(env); + + return err; +} diff --git a/kernel/cpt/cpt_dump.h b/kernel/cpt/cpt_dump.h new file mode 100644 index 0000000..71f6d94 --- /dev/null +++ b/kernel/cpt/cpt_dump.h @@ -0,0 +1,16 @@ +int cpt_dump(struct cpt_context *cpt); +int rst_undump(struct cpt_context *cpt); +int cpt_suspend(struct cpt_context *cpt); +int cpt_resume(struct cpt_context *cpt); +int cpt_kill(struct cpt_context *cpt); +int rst_clean(struct cpt_context *cpt); +int rst_resume(struct cpt_context *cpt); +int rst_kill(struct cpt_context *cpt); + +int cpt_freeze_one(pid_t pid, int freeze); +int cpt_vps_suspend(struct cpt_context *ctx); +int vps_rst_undump(struct cpt_context *ctx); + +int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); + +int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx); diff --git a/kernel/cpt/cpt_epoll.c b/kernel/cpt/cpt_epoll.c new file mode 100644 index 0000000..81d2b98 --- /dev/null +++ b/kernel/cpt/cpt_epoll.c @@ -0,0 +1,113 @@ +/* + * + * kernel/cpt/cpt_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err = 0; + struct file *file = obj->o_obj; + struct eventpoll *ep; + struct rb_node *rbp; + struct cpt_epoll_image ei; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + /* eventpoll.c does not protect open /proc/N/fd, silly. + * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_EPOLL; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_ARRAY; + ei.cpt_file = obj->o_pos; + + ctx->write(&ei, sizeof(ei), ctx); + + mutex_lock(&epmutex); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + loff_t saved_obj; + cpt_object_t *tobj; + struct cpt_epoll_file_image efi; + struct epitem *epi; + epi = rb_entry(rbp, struct epitem, rbn); + tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); + if (tobj == NULL) { + eprintk_ctx("epoll device refers to an external file\n"); + err = -EBUSY; + break; + } + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + efi.cpt_next = CPT_NULL; + efi.cpt_object = CPT_OBJ_EPOLL_FILE; + efi.cpt_hdrlen = sizeof(efi); + efi.cpt_content = CPT_CONTENT_VOID; + efi.cpt_file = tobj->o_pos; + efi.cpt_fd = epi->ffd.fd; + efi.cpt_events = epi->event.events; + efi.cpt_data = epi->event.data; + efi.cpt_revents = 0; + efi.cpt_ready = 0; + if (!list_empty(&epi->rdllink)) + efi.cpt_ready = 1; + + ctx->write(&efi, sizeof(efi), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&epmutex); + + cpt_close_object(ctx); + + return err; +} + diff --git a/kernel/cpt/cpt_exports.c b/kernel/cpt/cpt_exports.c new file mode 100644 index 0000000..f492331 --- /dev/null +++ b/kernel/cpt/cpt_exports.c @@ -0,0 +1,13 @@ +#include +#include + +#include "cpt_obj.h" + +EXPORT_SYMBOL(alloc_cpt_object); +EXPORT_SYMBOL(intern_cpt_object); +EXPORT_SYMBOL(insert_cpt_object); +EXPORT_SYMBOL(__cpt_object_add); +EXPORT_SYMBOL(cpt_object_add); +EXPORT_SYMBOL(cpt_object_get); +EXPORT_SYMBOL(lookup_cpt_object); +EXPORT_SYMBOL(lookup_cpt_obj_bypos); diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c new file mode 100644 index 0000000..376222e --- /dev/null +++ b/kernel/cpt/cpt_files.c @@ -0,0 +1,1626 @@ +/* + * + * kernel/cpt/cpt_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) +{ + char *path; + struct path p; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) + return; + + p.dentry = d; + p.mnt = mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) + eprintk("<%s>", path); + free_page(pg); +} + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx) +{ + if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { + struct nameidata nd; + if (path_lookup(path, 0, &nd)) { + eprintk_ctx("d_path cannot be looked up %s\n", path); + return -EINVAL; + } + if (nd.path.dentry != d || nd.path.mnt != mnt) { + eprintk_ctx("d_path is invisible %s\n", path); + path_put(&nd.path); + return -EINVAL; + } + path_put(&nd.path); + } + return 0; +} + +static int +cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx) +{ + int result = 0; + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) + char *path; + unsigned long pg; + struct dentry * renamed_dentry; + struct path p; + + if (de->d_sb->s_magic != FSMAGIC_VEFS) + return 0; + if (de->d_inode->i_nlink != 0 || + atomic_read(&de->d_inode->i_writecount) > 0) + return 0; + + renamed_dentry = vefs_replaced_dentry(de); + if (renamed_dentry == NULL) + return 0; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return 0; + + p.dentry = de; + p.mnt = mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + if (!IS_ERR(path)) { + int len; + struct nameidata nd; + + len = pg + PAGE_SIZE - 1 - (unsigned long)path; + if (len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + + if (path_lookup(path, 0, &nd) == 0) { + if (mnt == nd.path.mnt && + vefs_is_renamed_dentry(nd.path.dentry, renamed_dentry)) + result = 1; + path_put(&nd.path); + } + } + free_page(pg); +#endif + return result; +} + +static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, + int replaced, cpt_context_t *ctx) +{ + int len; + char *path; + struct path p; + char *pg = cpt_get_buf(ctx); + loff_t saved; + + p.dentry = d; + p.mnt = mnt; + path = d_path(&p, pg, PAGE_SIZE); + len = PTR_ERR(path); + + if (IS_ERR(path)) { + struct cpt_object_hdr o; + char tmp[1]; + + /* VZ changes d_path() to return EINVAL, when path + * is not supposed to be visible inside VE. + * This changes behaviour of d_path() comparing + * to mainstream kernel, f.e. d_path() fails + * on any kind of shared memory. Maybe, there are + * another cases, but I am aware only about this one. + * So, we just ignore error on shmem mounts and proceed. + * Otherwise, checkpointing is prohibited because + * of reference to an invisible file. 
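+		 * In the ignored case an empty name (a single NUL byte)
+		 * is written as the CPT_OBJ_NAME payload below.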
+ */ + if (len != -EINVAL || + mnt != get_exec_env()->shmem_mnt) + eprintk_ctx("d_path err=%d\n", len); + else + len = 0; + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + tmp[0] = 0; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(tmp, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + + __cpt_release_buf(ctx); + return len; + } else { + struct cpt_object_hdr o; + + len = pg + PAGE_SIZE - 1 - path; + if (replaced && + len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + path[len] = 0; + + if (cpt_verify_overmount(path, d, mnt, ctx)) { + __cpt_release_buf(ctx); + return -EINVAL; + } + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + ctx->write(&o, sizeof(o), ctx); + ctx->write(path, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + __cpt_release_buf(ctx); + } + return 0; +} + +int cpt_dump_string(const char *s, struct cpt_context *ctx) +{ + int len; + struct cpt_object_hdr o; + + cpt_open_object(NULL, ctx); + len = strlen(s); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(s, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +static int +cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) +{ + return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx); +} + +int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err; + struct cpt_inode_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_INODE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { + cpt_release_buf(ctx); + return err; + } + + v->cpt_dev = d->d_inode->i_sb->s_dev; + v->cpt_ino = d->d_inode->i_ino; + v->cpt_mode = sbuf.mode; + v->cpt_nlink = sbuf.nlink; + v->cpt_uid = sbuf.uid; + v->cpt_gid = sbuf.gid; + v->cpt_rdev = d->d_inode->i_rdev; + v->cpt_size = sbuf.size; + v->cpt_atime = cpt_timespec_export(&sbuf.atime); + v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); + v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); + v->cpt_blksize = sbuf.blksize; + v->cpt_blocks = sbuf.blocks; + v->cpt_sb = d->d_inode->i_sb->s_magic; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + +int cpt_collect_files(cpt_context_t * ctx) +{ + int err; + cpt_object_t *obj; + int index = 0; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) + return -ENOMEM; + } + + /* Collect files from fd sets */ + for_each_object(obj, CPT_OBJ_FILES) { + int fd; + struct files_struct *f = obj->o_obj; + + cpt_obj_setindex(obj, index++, ctx); + + if (obj->o_count != atomic_read(&f->count)) { + eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); + return -EBUSY; + } + + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file && 
cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) + return -ENOMEM; + } + } + + /* Collect files queued by AF_UNIX sockets. */ + if ((err = cpt_collect_passedfds(ctx)) < 0) + return err; + + /* OK. At this point we should count all the references. */ + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct file *parent; + cpt_object_t *ino_obj; + + if (obj->o_count != atomic_long_read(&file->f_count)) { + eprintk_ctx("file struct is referenced outside %d %ld\n", obj->o_count, atomic_long_read(&file->f_count)); + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + return -EBUSY; + } + + switch (file->f_dentry->d_inode->i_sb->s_magic) { + case FSMAGIC_FUTEX: + case FSMAGIC_MQUEUE: + case FSMAGIC_BDEV: +#ifndef CONFIG_INOTIFY_USER + case FSMAGIC_INOTIFY: +#endif + eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); + return -EBUSY; + } + + /* Collect inode. It is necessary mostly to resolve deleted + * hard links. */ + ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (ino_obj == NULL) + return -ENOMEM; + + parent = ino_obj->o_parent; + if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) + ino_obj->o_parent = file; + + if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { + int maj = imajor(file->f_dentry->d_inode); + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + err = cpt_collect_tty(file, ctx); + if (err) + return err; + } + } + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + err = cpt_collect_socket(file, ctx); + if (err) + return err; + } + } + + err = cpt_index_sockets(ctx); + + return err; +} + +/* /dev/ptmx is special, all the files share one inode, but real tty backend + * is attached via file->private_data. 
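+ * So the inode alone does not identify the tty; such files are
+ * marked CPT_DENTRY_CLONING and the tty object referenced by
+ * file->private_data is recorded separately in dump_one_file().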
+ */ + +static inline int is_cloning_inode(struct inode *ino) +{ + return S_ISCHR(ino->i_mode) && + ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); +} + +static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) +{ + pid_t pid; + struct cpt_flock_image *v = cpt_get_buf(ctx); + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_FLOCK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_owner = owner; + + pid = fl->fl_pid; + if (pid) { + pid = pid_to_vpid(fl->fl_pid); + if (pid == -1) { + if (!(fl->fl_flags&FL_FLOCK)) { + eprintk_ctx("posix lock from another container?\n"); + cpt_release_buf(ctx); + return -EBUSY; + } + pid = 0; + } + } + + v->cpt_pid = pid; + v->cpt_start = fl->fl_start; + v->cpt_end = fl->fl_end; + v->cpt_flags = fl->fl_flags; + v->cpt_type = fl->fl_type; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int cpt_dump_flock(struct file *file, struct cpt_context *ctx) +{ + int err = 0; + struct file_lock *fl; + + lock_kernel(); + for (fl = file->f_dentry->d_inode->i_flock; + fl; fl = fl->fl_next) { + if (file != fl->fl_file) + continue; + if (fl->fl_flags & FL_LEASE) { + eprintk_ctx("lease lock is not supported\n"); + err = -EINVAL; + break; + } + if (fl->fl_flags & FL_POSIX) { + cpt_object_t *obj; + obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); + if (obj) { + dump_one_flock(fl, obj->o_index, ctx); + continue; + } else { + eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); + err = -EINVAL; + } + } + if (fl->fl_flags & FL_FLOCK) { + dump_one_flock(fl, -1, ctx); + continue; + } + } + unlock_kernel(); + return err; +} + +static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) +{ + int err = 0; + cpt_object_t *iobj; + struct cpt_file_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + int replaced = 0; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_flags = file->f_flags; + v->cpt_mode = file->f_mode; + v->cpt_pos = file->f_pos; + v->cpt_uid = file->f_uid; + v->cpt_gid = file->f_gid; + + vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); + + v->cpt_i_mode = sbuf.mode; + v->cpt_lflags = 0; + if (IS_ROOT(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_ROOT; + else if (d_unhashed(file->f_dentry)) { + if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { + v->cpt_lflags |= CPT_DENTRY_REPLACED; + replaced = 1; + } else { + v->cpt_lflags |= CPT_DENTRY_DELETED; + } + } + if (is_cloning_inode(file->f_dentry->d_inode)) + v->cpt_lflags |= CPT_DENTRY_CLONING; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) + v->cpt_lflags |= CPT_DENTRY_PROC; + v->cpt_inode = CPT_NULL; + if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { + iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (iobj) + v->cpt_inode = iobj->o_pos; + } + v->cpt_priv = CPT_NULL; + v->cpt_fown_fd = -1; + if (S_ISCHR(v->cpt_i_mode)) { + iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); + if (iobj) { + v->cpt_priv = iobj->o_pos; + if (file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_tty_fasync(file, ctx); + } +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + if (file->f_op && file->f_op->open == tun_chr_open) + v->cpt_lflags |= CPT_DENTRY_TUNTAP; +#endif + } + if (S_ISSOCK(v->cpt_i_mode)) { + if (obj->o_index < 0) { + eprintk_ctx("BUG: no socket index\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_priv = obj->o_index; + if 
(file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_socket_fasync(file, ctx); + } + if (file->f_op == &eventpoll_fops) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_EPOLL; + } + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_INOTIFY; + } + + v->cpt_fown_pid = (file->f_owner.pid == NULL ? + CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid)); + v->cpt_fown_uid = file->f_owner.uid; + v->cpt_fown_euid = file->f_owner.euid; + v->cpt_fown_signo = file->f_owner.signum; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (!S_ISSOCK(v->cpt_i_mode)) { + err = cpt_dump_filename(file, replaced, ctx); + if (err) + return err; + if ((file->f_mode & FMODE_WRITE) && + file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS) + vefs_track_notify(file->f_dentry, 1); + } + + if (file->f_dentry->d_inode->i_flock) + err = cpt_dump_flock(file, ctx); + + cpt_close_object(ctx); + + return err; +} + +/* About this weird function... Crappy code dealing with SYSV shared memory + * defines TMPFS inode and file with f_op doing only mmap. So... + * Maybe, this is wrong and leaks something. It is clear access to + * SYSV shmem via mmap is quite unusual and impossible from user space. + */ +static int dump_content_shm(struct file *file, struct cpt_context *ctx) +{ + struct cpt_obj_bits *v; + loff_t saved_pos; + unsigned long addr; + + addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size, + PROT_READ, MAP_SHARED, 0); + if (IS_ERR((void*)addr)) + return PTR_ERR((void*)addr); + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v = cpt_get_buf(ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = file->f_dentry->d_inode->i_size; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx); + ctx->align(ctx); + do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size); + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + return 0; +} + +static int data_is_zero(char *addr, int len) +{ + int i; + unsigned long zerolong = 0; + + for (i=0; if_op == NULL) + return -EINVAL; + + do_read = file->f_op->read; + if (file->f_op == &shm_file_operations) { + struct shm_file_data *sfd = file->private_data; + + cpt_dump_content_sysvshm(sfd->file, ctx); + + return 0; + } + if (file->f_op == &shmem_file_operations) { + do_read = file->f_dentry->d_inode->i_fop->read; + cpt_dump_content_sysvshm(file, ctx); + if (!do_read) { + wprintk_ctx("TMPFS is not configured?\n"); + return dump_content_shm(file, ctx); + } + } + + if (!(file->f_mode & FMODE_READ) || + (file->f_flags & O_DIRECT)) { + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_RDONLY); + if (IS_ERR(file)) { + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + } else { + atomic_long_inc(&file->f_count); + } + + for (;;) { + mm_segment_t oldfs; + int err; + + (void)cpt_get_buf(ctx); + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos); + set_fs(oldfs); + if (err < 0) { + eprintk_ctx("dump_content_regular: do_read: %d", err); + fput(file); + __cpt_release_buf(ctx); + return err; + } + if (err == 0) { + __cpt_release_buf(ctx); + break; + } + if (data_is_zero(ctx->tmpbuf, err)) { + if (obj_opened != CPT_NULL) { + 
ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + } else { + if (obj_opened == CPT_NULL) { + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + obj_opened = ctx->file->f_pos; + pgb.cpt_next = CPT_NULL; + pgb.cpt_object = CPT_OBJ_PAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_DATA; + pgb.cpt_start = pos - err; + pgb.cpt_end = pgb.cpt_start; + ctx->write(&pgb, sizeof(pgb), ctx); + } + ctx->write(ctx->tmpbuf, err, ctx); + pgb.cpt_end += err; + } + __cpt_release_buf(ctx); + } + + fput(file); + + if (obj_opened != CPT_NULL) { + ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + return 0; +} + + +static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + int maj; + + maj = imajor(ino); + if (maj == MEM_MAJOR) { + /* Well, OK. */ + return 0; + } + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + return cpt_dump_content_tty(file, ctx); + } +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + if (file->f_op && file->f_op->open == tun_chr_open) + return 0; +#endif + eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); + return -EINVAL; +} + +static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + + /* We are not going to transfer them. */ + eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); + return -EINVAL; +} + +static int dump_content_fifo(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + cpt_object_t *obj; + loff_t saved_pos; + int readers; + int writers; + int anon = 0; + + mutex_lock(&ino->i_mutex); + readers = ino->i_pipe->readers; + writers = ino->i_pipe->writers; + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file1 = obj->o_obj; + if (file1->f_dentry->d_inode == ino) { + if (file1->f_mode & FMODE_READ) + readers--; + if (file1->f_mode & FMODE_WRITE) + writers--; + } + } + mutex_unlock(&ino->i_mutex); + if (readers || writers) { + struct dentry *dr = file->f_dentry->d_sb->s_root; + if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) + anon = 1; + + if (anon) { + eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); + return -EBUSY; + } + /* If fifo has external readers/writers, we are in troubles. + * If the buffer is not empty, we must move its content. + * But if the fifo is owned by a service, we cannot do + * this. See? + * + * For now we assume, that if fifo is opened by another + * process, we do not own it and, hence, migrate without + * data. + */ + return 0; + } + + /* OK, we must save fifo state. No semaphores required. 
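+	 * All writers inside the VE are frozen, so the pipe buffers
+	 * cannot change while they are copied out below.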
*/ + + if (ino->i_pipe->nrbufs) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + struct pipe_inode_info *info; + int count, buf, nrbufs; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + if (!info->bufs[buf].ops->can_merge) { + mutex_unlock(&ino->i_mutex); + eprintk_ctx("unknown format of pipe buffer\n"); + return -EINVAL; + } + count += info->bufs[buf].len; + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + if (!count) { + mutex_unlock(&ino->i_mutex); + return 0; + } + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = count; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + struct pipe_buffer *b = info->bufs + buf; + /* need to ->pin first? */ + void * addr = b->ops->map(info, b, 0); + ctx->write(addr + b->offset, b->len, ctx); + b->ops->unmap(info, b, addr); + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + mutex_unlock(&ino->i_mutex); + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + return 0; +} + +static int dump_content_socket(struct file *file, struct cpt_context *ctx) +{ + return 0; +} + +struct cpt_dirent { + unsigned long ino; + char *name; + int namelen; + int found; +}; + +static int cpt_filldir(void * __buf, const char * name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct cpt_dirent * dirent = __buf; + + if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) { + memcpy(dirent->name, name, namelen); + dirent->name[namelen] = '\0'; + dirent->namelen = namelen; + dirent->found = 1; + return 1; + } + return 0; +} + +static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err = -EBUSY; + struct file *f = NULL; + struct cpt_dirent entry; + struct dentry *de, *found = NULL; + + dprintk_ctx("deleted reference to existing inode, try to find file\n"); + /* 1. Try to find not deleted dentry in ino->i_dentry list */ + spin_lock(&dcache_lock); + list_for_each_entry(de, &ino->i_dentry, d_alias) { + if (!IS_ROOT(de) && d_unhashed(de)) + continue; + found = de; + dget_locked(found); + break; + } + spin_unlock(&dcache_lock); + if (found) { + err = cpt_dump_dentry(found, mnt, 0, ctx); + dput(found); + if (!err) { + dprintk_ctx("dentry found in aliases\n"); + return 0; + } + } + + /* 2. Try to find file in current dir */ + de = dget_parent(d); + if (!de) + return -EINVAL; + + mntget(mnt); + f = dentry_open(de, mnt, O_RDONLY); + if (IS_ERR(f)) + return PTR_ERR(f); + + entry.ino = ino->i_ino; + entry.name = cpt_get_buf(ctx); + entry.found = 0; + err = vfs_readdir(f, cpt_filldir, &entry); + if (err || !entry.found) { + err = err ? 
err : -ENOENT; + goto err_readdir; + } + + found = lookup_one_len(entry.name, de, entry.namelen); + if (IS_ERR(found)) { + err = PTR_ERR(found); + goto err_readdir; + } + + err = -ENOENT; + if (found->d_inode != ino) + goto err_lookup; + + dprintk_ctx("dentry found in dir\n"); + __cpt_release_buf(ctx); + err = cpt_dump_dentry(found, mnt, 0, ctx); + +err_lookup: + dput(found); +err_readdir: + fput(f); + __cpt_release_buf(ctx); + return err; +} + +static int dump_one_inode(struct file *file, struct dentry *d, + struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct inode *ino = d->d_inode; + cpt_object_t *iobj; + int dump_it = 0; + + iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); + if (!iobj) + return -EINVAL; + + if (iobj->o_pos >= 0) + return 0; + + if ((!IS_ROOT(d) && d_unhashed(d)) && + !cpt_replaced(d, mnt, ctx)) + dump_it = 1; + if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { + if (file->f_op == &eventpoll_fops) + return 0; + dump_it = 1; + } + + if (!dump_it) + return 0; + + cpt_open_object(iobj, ctx); + cpt_dump_inode(d, mnt, ctx); + + if (!IS_ROOT(d) && d_unhashed(d)) { + struct file *parent; + parent = iobj->o_parent; + if (!parent || + (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { + /* Inode is not deleted, but it does not + * have references from inside checkpointed + * process group. */ + if (ino->i_nlink != 0) { + err = find_linked_dentry(d, mnt, ino, ctx); + if (err) { + eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); + return -EBUSY; + } + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } else { + /* Refer to _another_ file name. */ + err = cpt_dump_filename(parent, 0, ctx); + if (err) + return err; + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } + if (dump_it) { + if (S_ISREG(ino->i_mode)) { + if ((err = dump_content_regular(file, ctx)) != 0) { + eprintk_ctx("dump_content_regular "); + cpt_printk_dentry(d, mnt); + } + } else if (S_ISDIR(ino->i_mode)) { + /* We cannot do anything. The directory should be + * empty, so it is not a big deal. 
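+ * A directory can only be unlinked once it is already empty,
+ * so there are no entries left here that would need saving.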
+ */ + } else if (S_ISCHR(ino->i_mode)) { + err = dump_content_chrdev(file, ctx); + } else if (S_ISBLK(ino->i_mode)) { + err = dump_content_blkdev(file, ctx); + } else if (S_ISFIFO(ino->i_mode)) { + err = dump_content_fifo(file, ctx); + } else if (S_ISSOCK(ino->i_mode)) { + err = dump_content_socket(file, ctx); + } else { + eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic); + err = -EINVAL; + } + } + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_files(struct cpt_context *ctx) +{ + int epoll_nr, inotify_nr; + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TTY); + for_each_object(obj, CPT_OBJ_TTY) { + int err; + + if ((err = cpt_dump_tty(obj, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_INODE); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_inode(file, file->f_dentry, + file->f_vfsmnt, ctx)) != 0) + return err; + } + for_each_object(obj, CPT_OBJ_FS) { + struct fs_struct *fs = obj->o_obj; + int err; + + if (fs->root.dentry && + (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0) + return err; + if (fs->pwd.dentry && + (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + epoll_nr = 0; + inotify_nr = 0; + cpt_open_section(ctx, CPT_SECT_FILES); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_file(obj, file, ctx)) != 0) + return err; + if (file->f_op == &eventpoll_fops) + epoll_nr++; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) + inotify_nr++; + } + cpt_close_section(ctx); + + if (epoll_nr) { + cpt_open_section(ctx, CPT_SECT_EPOLL); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_op == &eventpoll_fops) { + int err; + if ((err = cpt_dump_epolldev(obj, ctx)) != 0) + return err; + } + } + cpt_close_section(ctx); + } + + if (inotify_nr) { + cpt_open_section(ctx, CPT_SECT_INOTIFY); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + int err = -EINVAL; +#ifdef CONFIG_INOTIFY_USER + if ((err = cpt_dump_inotify(obj, ctx)) != 0) +#endif + return err; + } + } + cpt_close_section(ctx); + } + + cpt_open_section(ctx, CPT_SECT_SOCKET); + for_each_object(obj, CPT_OBJ_SOCKET) { + int err; + + if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + return 0; +} + +static int dump_filedesc(int fd, struct file *file, + struct files_struct *f, struct cpt_context *ctx) +{ + struct cpt_fd_image *v = cpt_get_buf(ctx); + cpt_object_t *obj; + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILEDESC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_fd = fd; + obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); + if (!obj) BUG(); + v->cpt_file = obj->o_pos; + v->cpt_flags = 0; + if (FD_ISSET(fd, f->fdt->close_on_exec)) + v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + + return 0; +} + +static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct files_struct *f = obj->o_obj; + struct cpt_files_struct_image *v = cpt_get_buf(ctx); + int fd; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = 
CPT_OBJ_FILES; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = obj->o_index; + v->cpt_max_fds = f->fdt->max_fds; + v->cpt_next_fd = f->next_fd; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file) + dump_filedesc(fd, file, f, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_files_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); + + for_each_object(obj, CPT_OBJ_FILES) { + int err; + + if ((err = dump_one_file_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_fs(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->fs) { + if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->pwd.dentry && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->root.dentry && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL) + return -ENOMEM; + } + } + return 0; +} + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + struct file file; + + memset(&file, 0, sizeof(file)); + + file.f_dentry = d; + file.f_vfsmnt = mnt; + file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; + return dump_one_file(NULL, &file, ctx); +} + +static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct fs_struct *fs = obj->o_obj; + struct cpt_fs_struct_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + int err; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_umask = fs->umask; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx); + if (!err) + err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_fs_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FS); + + for_each_object(obj, CPT_OBJ_FS) { + int err; + + if ((err = dump_one_fs(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct mnt_namespace *n = obj->o_obj; + struct list_head *p; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct path pt; + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + pt.dentry = mnt->mnt_root; + pt.mnt = mnt; + path = d_path(&pt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); + err = -EINVAL; + break; + } + } + up_read(&namespace_sem); + + free_page((unsigned long) path_buf); + + return err; +} + +int cpt_collect_namespace(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->nsproxy && tsk->nsproxy->mnt_ns && + cpt_object_add(CPT_OBJ_NAMESPACE, + 
tsk->nsproxy->mnt_ns, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + if ((err = check_one_namespace(obj, ctx)) != 0) + return err; + } + + return 0; +} + +struct args_t +{ + int* pfd; + char* path; +}; + +static int dumptmpfs(void *arg) +{ + int i; + struct args_t *args = arg; + int *pfd = args->pfd; + int fd0, fd2; + char *path = args->path; + char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; + + i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump tmpfs\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + set_fs(KERNEL_DS); + fd0 = sc_open("/dev/null", O_RDONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); + if (fd0 < 0 || fd2 < 0) { + eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2); + module_put(THIS_MODULE); + return 255 << 8; + } + if (fd0 != 0) + sc_dup2(fd0, 0); + if (fd2 != 2) + sc_dup2(fd2, 2); + + for (i = 3; i < current->files->fdt->max_fds; i++) { + sc_close(i); + } + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) +{ + int err; + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + int n; + loff_t saved_obj; + struct args_t args; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + err = sc_pipe(pfd); + if (err < 0) + return err; + args.pfd = pfd; + args.path = path; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + err = pid = local_kernel_thread(dumptmpfs, (void*)&args, + SIGCHLD | CLONE_VFORK, 0); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, ctx); + } while (n > 0); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return n ? 
: err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +static int loopy_root(struct vfsmount *mnt) +{ + struct list_head *p; + + list_for_each(p, &mnt->mnt_ns->list) { + struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list); + if (m == mnt) + return 0; + if (m->mnt_sb == mnt->mnt_sb) + return 1; + } + /* Cannot happen */ + return 0; +} + +static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx) +{ + struct list_head *p; + int err = -EINVAL; + + /* One special case: mount --bind /a /a */ + if (mnt->mnt_root == mnt->mnt_mountpoint) + return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx); + + list_for_each_prev(p, &mnt->mnt_list) { + struct vfsmount * m; + + if (p == &mnt->mnt_ns->list) + break; + + m = list_entry(p, struct vfsmount, mnt_list); + + if (m->mnt_sb != mnt->mnt_sb) + continue; + + err = cpt_dump_dentry(mnt->mnt_root, m, 0, ctx); + if (err == 0) + break; + } + return err; +} + +static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_vfsmount_image v; + loff_t saved_obj; + char *path_buf, *path; + struct path p; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + p.dentry = mnt->mnt_root; + p.mnt = mnt; + path = d_path(&p, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + free_page((unsigned long) path_buf); + return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_VFSMOUNT; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_mntflags = mnt->mnt_flags; + if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) { + v.cpt_mntflags |= CPT_MNT_EXT; + } else { + if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt)) + v.cpt_mntflags |= CPT_MNT_BIND; + } + v.cpt_flags = mnt->mnt_sb->s_flags; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_dump_string(mnt->mnt_devname ? 
: "none", ctx); + cpt_dump_string(path, ctx); + cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); + + if (v.cpt_mntflags & CPT_MNT_BIND) + err = cpt_dump_bind_mnt(mnt, ctx); + else if (!(v.cpt_mntflags & CPT_MNT_EXT) && + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { + mntget(mnt); + up_read(&namespace_sem); + err = cpt_dump_tmpfs(path, ctx); + down_read(&namespace_sem); + if (!err) { + if (list_empty(&mnt->mnt_list)) + err = -EBUSY; + } + mntput(mnt); + } + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS) + vefs_track_force_stop(mnt->mnt_sb); + + free_page((unsigned long) path_buf); + + return err; +} + +static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mnt_namespace *n = obj->o_obj; + struct cpt_object_hdr v; + struct list_head *p; + loff_t saved_obj; + int err = 0; + + cpt_open_object(obj, ctx); + + v.cpt_next = -1; + v.cpt_object = CPT_OBJ_NAMESPACE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); + if (err) + break; + } + up_read(&namespace_sem); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_namespace(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_NAMESPACE); + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + + if ((err = dump_one_namespace(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff --git a/kernel/cpt/cpt_files.h b/kernel/cpt/cpt_files.h new file mode 100644 index 0000000..7770ab2 --- /dev/null +++ b/kernel/cpt/cpt_files.h @@ -0,0 +1,71 @@ +int cpt_collect_files(cpt_context_t *); +int cpt_collect_fs(cpt_context_t *); +int cpt_collect_namespace(cpt_context_t *); +int cpt_collect_sysvsem_undo(cpt_context_t *); +int cpt_collect_tty(struct file *, cpt_context_t *); +int cpt_dump_files(struct cpt_context *ctx); +int cpt_dump_files_struct(struct cpt_context *ctx); +int cpt_dump_fs_struct(struct cpt_context *ctx); +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); +int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); +struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx); +struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx); +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); + +int rst_posix_locks(struct cpt_context *ctx); + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_restore_fs(struct cpt_context *ctx); + +int cpt_collect_sysv(cpt_context_t *); +int cpt_dump_sysvsem(struct cpt_context *ctx); +int cpt_dump_sysvmsg(struct cpt_context *ctx); +int rst_sysv_ipc(struct cpt_context *ctx); +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_dump_namespace(struct cpt_context *ctx); +int 
rst_root_namespace(struct cpt_context *ctx); + +int rst_stray_files(struct cpt_context *ctx); +int rst_tty_jobcontrol(struct cpt_context *ctx); + +void rst_flush_filejobs(struct cpt_context *); +int rst_do_filejobs(struct cpt_context *); + +extern struct file_operations eventpoll_fops; +int rst_eventpoll(struct cpt_context *); +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); +int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx); +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx); + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx); +int rst_inotify(cpt_context_t *ctx); +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); + + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx); + +#define check_one_vfsmount(mnt) \ + (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) diff --git a/kernel/cpt/cpt_fsmagic.h b/kernel/cpt/cpt_fsmagic.h new file mode 100644 index 0000000..142e539 --- /dev/null +++ b/kernel/cpt/cpt_fsmagic.h @@ -0,0 +1,16 @@ +/* Collected from kernel sources. */ + +#define FSMAGIC_TMPFS 0x01021994 +#define FSMAGIC_PIPEFS 0x50495045 +#define FSMAGIC_SOCKFS 0x534F434B +#define FSMAGIC_PFMFS 0xa0b4d889 +#define FSMAGIC_BDEV 0x62646576 +#define FSMAGIC_FUTEX 0x0BAD1DEA +#define FSMAGIC_INOTIFY 0x2BAD1DEA +#define FSMAGIC_MQUEUE 0x19800202 +#define FSMAGIC_PROC 0x9fa0 +#define FSMAGIC_DEVPTS 0x1CD1 +#define FSMAGIC_AUTOFS 0x0187 +#define FSMAGIC_EXT2 0xEF53 +#define FSMAGIC_REISER 0x52654973 +#define FSMAGIC_VEFS 0x565a4653 diff --git a/kernel/cpt/cpt_inotify.c b/kernel/cpt/cpt_inotify.c new file mode 100644 index 0000000..4d4637e --- /dev/null +++ b/kernel/cpt/cpt_inotify.c @@ -0,0 +1,144 @@ +/* + * + * kernel/cpt/cpt_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations inotify_fops; + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err = 0; + struct file *file = obj->o_obj; + struct inotify_device *dev; + struct inotify_watch *watch; + struct inotify_kernel_event *kev; + struct cpt_inotify_image ii; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + dev = file->private_data; + + /* inotify_user.c does not protect open /proc/N/fd, silly. 
+ * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(dev == NULL)) { + eprintk_ctx("bad inotify dev\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ii.cpt_next = CPT_NULL; + ii.cpt_object = CPT_OBJ_INOTIFY; + ii.cpt_hdrlen = sizeof(ii); + ii.cpt_content = CPT_CONTENT_ARRAY; + ii.cpt_file = obj->o_pos; + ii.cpt_user = dev->user->uid; + ii.cpt_max_events = dev->max_events; + ii.cpt_last_wd = dev->ih->last_wd; + + ctx->write(&ii, sizeof(ii), ctx); + + mutex_lock(&dev->ih->mutex); + list_for_each_entry(watch, &dev->ih->watches, h_list) { + loff_t saved_obj; + loff_t saved_obj2; + struct cpt_inotify_wd_image wi; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + wi.cpt_next = CPT_NULL; + wi.cpt_object = CPT_OBJ_INOTIFY_WATCH; + wi.cpt_hdrlen = sizeof(wi); + wi.cpt_content = CPT_CONTENT_ARRAY; + wi.cpt_wd = watch->wd; + wi.cpt_mask = watch->mask; + + ctx->write(&wi, sizeof(wi), ctx); + + cpt_push_object(&saved_obj2, ctx); + err = cpt_dump_dir(watch->path.dentry, watch->path.mnt, ctx); + cpt_pop_object(&saved_obj2, ctx); + if (err) + break; + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&dev->ih->mutex); + + if (err) + return err; + + mutex_lock(&dev->ev_mutex); + list_for_each_entry(kev, &dev->events, list) { + loff_t saved_obj; + struct cpt_inotify_ev_image ei; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_INOTIFY_EVENT; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_NAME; + ei.cpt_wd = kev->event.wd; + ei.cpt_mask = kev->event.mask; + ei.cpt_cookie = kev->event.cookie; + ei.cpt_namelen = kev->name ? strlen(kev->name) : 0; + + ctx->write(&ei, sizeof(ei), ctx); + + if (kev->name) { + ctx->write(kev->name, ei.cpt_namelen+1, ctx); + ctx->align(ctx); + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&dev->ev_mutex); + + cpt_close_object(ctx); + + return err; +} diff --git a/kernel/cpt/cpt_kernel.c b/kernel/cpt/cpt_kernel.c new file mode 100644 index 0000000..5eb7f1c --- /dev/null +++ b/kernel/cpt/cpt_kernel.c @@ -0,0 +1,177 @@ +/* + * + * kernel/cpt/cpt_kernel.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#define __KERNEL_SYSCALLS__ 1 + +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +int debug_level = 1; + +#ifdef CONFIG_X86_32 + +/* + * Create a kernel thread + */ +extern void kernel_thread_helper(void); +int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; + + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. 
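+ * The forged pt_regs start the child in kernel_thread_helper(), which
+ * calls fn(arg) with the values planted in bx and dx above, just as the
+ * stock kernel_thread() does; do_fork_pid() additionally lets the caller
+ * ask for a specific pid.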
*/ + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL, pid); +} +#endif + +#ifdef CONFIG_IA64 +pid_t +asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid) +{ + extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/); + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL, pid); +} +#endif + +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + pid_t ret; + + if (current->fs == NULL) { + /* do_fork_pid() hates processes without fs, oopses. */ + printk("CPT BUG: local_kernel_thread: current->fs==NULL\n"); + return -EINVAL; + } + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ret = asm_kernel_thread(fn, arg, flags, pid); + if (ret < 0) + module_put(THIS_MODULE); + return ret; +} + +#ifdef __i386__ +int __execve(const char *file, char **argv, char **envp) +{ + long res; + __asm__ volatile ("int $0x80" + : "=a" (res) + : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)), + "d" ((long)(envp)) : "memory"); + return (int)res; +} +#endif + +int sc_execve(char *cmd, char **argv, char **env) +{ + int ret; +#ifndef __i386__ + ret = kernel_execve(cmd, argv, env); +#else + ret = __execve(cmd, argv, env); +#endif + return ret; +} + +unsigned int test_cpu_caps(void) +{ + unsigned int flags = 0; + +#ifdef CONFIG_X86 + if (boot_cpu_has(X86_FEATURE_CMOV)) + flags |= 1 << CPT_CPU_X86_CMOV; + if (cpu_has_fxsr) + flags |= 1 << CPT_CPU_X86_FXSR; + if (cpu_has_xmm) + flags |= 1 << CPT_CPU_X86_SSE; +#ifndef CONFIG_X86_64 + if (cpu_has_xmm2) +#endif + flags |= 1 << CPT_CPU_X86_SSE2; + if (cpu_has_mmx) + flags |= 1 << CPT_CPU_X86_MMX; + if (boot_cpu_has(X86_FEATURE_3DNOW)) + flags |= 1 << CPT_CPU_X86_3DNOW; + if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) + flags |= 1 << CPT_CPU_X86_3DNOW2; + if (boot_cpu_has(X86_FEATURE_SYSCALL)) + flags |= 1 << CPT_CPU_X86_SYSCALL; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_SYSCALL) && + boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + flags |= 1 << CPT_CPU_X86_SYSCALL32; +#endif + if (boot_cpu_has(X86_FEATURE_SEP) +#ifdef CONFIG_X86_64 + && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL +#endif + ) + flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32)); +#ifdef CONFIG_X86_64 + flags |= 1 << CPT_CPU_X86_EMT64; +#endif +#endif +#ifdef CONFIG_IA64 + flags |= 1 << CPT_CPU_X86_IA64; + flags |= 1 << CPT_CPU_X86_FXSR; +#endif + return flags; +} + +unsigned int test_kernel_config(void) +{ + unsigned int flags = 0; +#ifdef CONFIG_X86 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + flags |= 1 << CPT_KERNEL_CONFIG_PAE; +#endif +#endif + return flags; +} diff --git a/kernel/cpt/cpt_kernel.h b/kernel/cpt/cpt_kernel.h new file mode 100644 index 0000000..9254778 --- /dev/null +++ b/kernel/cpt/cpt_kernel.h @@ -0,0 +1,99 @@ +/* 
Interface to kernel vars which we had to _add_. */ + +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) +#define TASK_TRACED TASK_STOPPED +#define unix_peer(sk) ((sk)->sk_pair) +#define page_mapcount(pg) ((pg)->mapcount) +#else +#define unix_peer(sk) (unix_sk(sk)->peer) +#endif + +#ifdef CONFIG_IA64 +#define cpu_has_fxsr 1 +#endif + +#define CPT_SIG_IGNORE_MASK (\ + (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \ + (1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1))) + +static inline void do_gettimespec(struct timespec *ts) +{ + struct timeval tv; + do_gettimeofday(&tv); + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = tv.tv_usec*1000; +} + +int local_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); +int asm_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) +void vefs_track_force_stop(struct super_block *super); + +void vefs_track_notify(struct dentry *vdentry, int track_cow); + +struct dentry * vefs_replaced_dentry(struct dentry *de); +int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde); +#else +static inline void vefs_track_force_stop(struct super_block *super) { }; + +static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { }; +#endif + +unsigned int test_cpu_caps(void); +unsigned int test_kernel_config(void); + +#define test_one_flag_old(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + wprintk("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } +#define test_one_flag(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + eprintk_ctx("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } + +static inline void +_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static inline struct timespec +_ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} diff --git a/kernel/cpt/cpt_mm.c b/kernel/cpt/cpt_mm.c new file mode 100644 index 0000000..4e98a8e --- /dev/null +++ b/kernel/cpt/cpt_mm.c @@ -0,0 +1,923 @@ +/* + * + * kernel/cpt/cpt_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif +#include "cpt_ubc.h" + +static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + if (!list_empty(&aio_ctx->run_list)) { + /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ + eprintk_ctx("run list is not empty, cannot suspend AIO\n"); + return -EBUSY; + } + + /* Wait for pending IOCBs. Linux AIO is mostly _fake_. 
+ * It is actually synchronous, except for direct IO and + * some funny raw USB things, which cannot happen inside VE. + * However, we do this for the future. + * + * Later note: in 2.6.16 we may allow O_DIRECT, so that + * it is not meaningless code. + */ + wait_for_all_aios(aio_ctx); + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("were not able to suspend AIO\n"); + return -EBUSY; + } + + return 0; +} + +static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file) { + if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL) + return -ENOMEM; + } + } + + if (mm->exe_file && + cpt_object_add(CPT_OBJ_FILE, mm->exe_file, ctx) == NULL) + return -ENOMEM; + +#ifdef CONFIG_BEANCOUNTERS + if (cpt_add_ubc(mm->mm_ub, ctx) == NULL) + return -ENOMEM; +#endif + + if (mm->ioctx_list) { + struct kioctx *aio_ctx; + int err; + + for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) + if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + return 0; +} + +int cpt_collect_mm(cpt_context_t * ctx) +{ + cpt_object_t *obj; + int err; + int index; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL) + return -ENOMEM; + } + + index = 1; + for_each_object(obj, CPT_OBJ_MM) { + struct mm_struct *mm = obj->o_obj; + if (obj->o_count != atomic_read(&mm->mm_users)) { + eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users)); + return -EAGAIN; + } + cpt_obj_setindex(obj, index++, ctx); + + if ((err = collect_one_mm(mm, ctx)) != 0) + return err; + } + + return 0; +} + +static int zcnt, scnt, scnt0, ucnt; + +/* Function where_is_anon_page() returns the address of an anonymous page in the mm + * of an already dumped process. This happens e.g. after fork(). We do not use + * this right now, just keep statistics; it is difficult to restore such state, + * but the most direct use is to save space in the dumped image.
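+ * (When the result is put to use, page_get_desc() marks such a page
+ * PD_CLONE and dump_one_vma() emits a CPT_OBJ_COPYPAGES block referring
+ * to the earlier mm's image position instead of writing the page
+ * contents a second time.)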
*/ + + +static inline unsigned long +vma_address0(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + address |= 1; + return address; +} + +static int really_this_one(struct vm_area_struct *vma, unsigned long address, + struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + int result; + + pgd = pgd_offset(mm, address); + if (unlikely(!pgd_present(*pgd))) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (unlikely(!pmd_present(*pmd))) + return 0; + + result = 0; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) + result = 1; + pte_unmap_unlock(pte, ptl); + return result; +} + +static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr, + struct page *page, cpt_context_t * ctx) +{ + loff_t mmptr = CPT_NULL; + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int idx = mmobj->o_index; + + if (!PageAnon(page)) + return CPT_NULL; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return CPT_NULL; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + unsigned long addr = vma_address0(page, vma); + cpt_object_t *obj; + + /* We do not try to support mremapped regions (addr != mapaddr), + * only mmaps directly inherited via fork(). + * With this limitation we may check self-consistency of + * vmas (vm_start, vm_pgoff, anon_vma) before + * doing __copy_page_range() in rst_mm. + */ + if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) { + obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx); + if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) { + if (really_this_one(vma, addr, page)) { + mmptr = obj->o_pos; + idx = obj->o_index; + } + } + } + } + page_unlock_anon_vma(anon_vma); + + return mmptr; +} + +struct page_area +{ + int type; + unsigned long start; + unsigned long end; + pgoff_t pgoff; + loff_t mm; + __u64 list[16]; +}; + +struct page_desc +{ + int type; + pgoff_t index; + loff_t mm; + int shared; +}; + +enum { + PD_ABSENT, + PD_COPY, + PD_ZERO, + PD_CLONE, + PD_FUNKEY, + PD_LAZY, + PD_ITER, + PD_ITERYOUNG, +}; + +/* 0: page can be obtained from backstore, or still not mapped anonymous page, + or something else, which does not requre copy. + 1: page requires copy + 2: page requres copy but its content is zero. Quite useless. + 3: wp page is shared after fork(). It is to be COWed when modified. + 4: page is something unsupported... We copy it right now. 
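+ In the code below these cases map onto PD_ABSENT, PD_COPY, PD_ZERO,
+ PD_CLONE and PD_FUNKEY; PD_LAZY marks pages deferred for lazy pagein,
+ and PD_ITER/PD_ITERYOUNG pages already sent by an earlier transfer
+ iteration (PG_checkpointed).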
+ */ + + + +static void page_get_desc(cpt_object_t *mmobj, + struct vm_area_struct *vma, unsigned long addr, + struct page_desc *pdesc, cpt_context_t * ctx) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *pg = NULL; + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; + + pdesc->index = linear_index; + pdesc->shared = 0; + pdesc->mm = CPT_NULL; + + if (vma->vm_flags & VM_IO) { + pdesc->type = PD_ABSENT; + return; + } + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out_absent; + pud = pud_offset(pgd, addr); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out_absent; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out_absent; +#ifdef CONFIG_X86 + if (pmd_huge(*pmd)) { + eprintk_ctx("page_huge\n"); + goto out_unsupported; + } +#endif +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +retry: +#endif + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = *ptep; + pte_unmap(ptep); + + if (pte_none(pte)) + goto out_absent_unlock; + + if (!pte_present(pte)) { + if (pte_file(pte)) { + pdesc->index = pte_to_pgoff(pte); + goto out_absent_unlock; + } + if (vma->vm_flags & VM_SHARED) { + /* It is impossible: shared mappings cannot be in swap */ + eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos); + goto out_unsupported_unlock; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + /* Otherwise it is in swap. */ + if (!ctx->lazy_vm) { + int err; + /* If lazy transfer is not enabled, + * raise it from swap now, so that we + * save at least when the page is shared. + */ + spin_unlock(ptl); + err = handle_mm_fault(mm, vma, addr, 0); + if (err == VM_FAULT_SIGBUS) + goto out_absent; + if (err == VM_FAULT_OOM) + goto out_absent; + err = 0; + goto retry; + } +#endif + pdesc->type = PD_LAZY; + goto out_unlock; + } + + if ((pg = vm_normal_page(vma, addr, pte)) == NULL) { + pdesc->type = PD_COPY; + goto out_unlock; + } + + get_page(pg); + spin_unlock(ptl); + + if (pg->mapping && !PageAnon(pg)) { + if (vma->vm_file == NULL) { + eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr); + goto out_unsupported; + } + if (vma->vm_file->f_mapping != pg->mapping) { + eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", + addr, vma->vm_file->f_mapping, pg->mapping, + mmobj->o_pos); + goto out_unsupported; + } + pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + /* Page is in backstore. For us it is like + * it is not present. + */ + goto out_absent; + } + + if (PageReserved(pg)) { + /* Special case: ZERO_PAGE is used, when an + * anonymous page is accessed but not written. 
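+ * No data needs to be stored for it: the page is counted in zcnt and
+ * treated as absent, so the restored task simply faults a zero page
+ * back in on first access.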
*/ + if (pg == ZERO_PAGE(addr)) { + if (pte_write(pte)) { + eprintk_ctx("not funny already, writable ZERO_PAGE\n"); + goto out_unsupported; + } + zcnt++; + goto out_absent; + } + eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index, + addr, mmobj->o_pos); + goto out_unsupported; + } + + if (pg == ZERO_PAGE(addr)) { + wprintk_ctx("that's how it works now\n"); + } + + if (!pg->mapping) { + eprintk_ctx("page without mapping at %08lx@%Ld\n", addr, + mmobj->o_pos); + goto out_unsupported; + } + + if (pg->mapping && page_mapcount(pg) > 1) { + pdesc->shared = 1; + pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx); + if (pdesc->mm != CPT_NULL) { + scnt0++; + pdesc->type = PD_CLONE; + goto out_put; + } else { + scnt++; + } + } +#ifdef CONFIG_VZ_CHECKPOINT_ITER + if (ctx->iter_done && + test_bit(PG_checkpointed, &pg->flags)) { + if (pte_write(pte)) { + wprintk_ctx("writable PG_checkpointed page\n"); + } + pdesc->index = page_to_pfn(pg); + pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER; + goto out_put; + } +#endif + pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY; + +out_put: + if (pg) + put_page(pg); + return; + +out_unlock: + spin_unlock(ptl); + goto out_put; + +out_absent_unlock: + spin_unlock(ptl); +out_absent: + pdesc->type = PD_ABSENT; + goto out_put; + +out_unsupported_unlock: + spin_unlock(ptl); +out_unsupported: + ucnt++; + pdesc->type = PD_FUNKEY; + goto out_put; +} + +/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages() + * does not really need this thing. It just stores some page fault stats there. + * + * BUG: some archs (e.g. sparc64, but not Intel*) require flushing cache pages + * before accessing the vma. + */ +void dump_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct cpt_context *ctx) +{ +#define MAX_PAGE_BATCH 16 + struct page *pg[MAX_PAGE_BATCH]; + int npages = (end - start)/PAGE_SIZE; + int count = 0; + + while (count < npages) { + int copy = npages - count; + int n; + + if (copy > MAX_PAGE_BATCH) + copy = MAX_PAGE_BATCH; + n = get_user_pages(current, vma->vm_mm, start, copy, + 0, 1, pg, NULL); + if (n == copy) { + int i; + for (i=0; i<copy; i++) { + char *maddr = kmap(pg[i]); + ctx->write(maddr, PAGE_SIZE, ctx); + kunmap(pg[i]); + } + } else { + eprintk_ctx("get_user_pages fault"); + for ( ; n > 0; n--) + page_cache_release(pg[n-1]); + return; + } + start += n*PAGE_SIZE; + count += n; + for ( ; n > 0; n--) + page_cache_release(pg[n-1]); + } + return; +} + +int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb, + int copy, + struct cpt_context *ctx) +{ + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES; + pgb->cpt_hdrlen = sizeof(*pgb); + pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ?
CPT_CONTENT_DATA : CPT_CONTENT_VOID; + + ctx->write(pgb, sizeof(*pgb), ctx); + if (copy == PD_COPY || copy == PD_LAZY) + dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_remappage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_REMAPPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_copypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_COPYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_source = pa->mm; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_lazypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_LAZYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, + (pa->end-pa->start)/PAGE_SIZE, ctx); +#endif + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_iterpage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = pa->type == PD_ITER ? 
CPT_OBJ_ITERPAGES : + CPT_OBJ_ITERYOUNGPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + ctx->write(&pgb, sizeof(pgb), ctx); + + ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx); + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + + +static int can_expand(struct page_area *pa, struct page_desc *pd) +{ + if (pa->start == pa->end) + return 1; + if (pa->type != pd->type) + return 0; + if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) { + if (pa->end - pa->start >= PAGE_SIZE*16) + return 0; + pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index; + } + if (pa->type == PD_ABSENT) + return pd->index == pa->pgoff + 1; + if (pa->type == PD_CLONE) + return pd->mm == pa->mm; + return 1; +} + +static int dump_one_vma(cpt_object_t *mmobj, + struct vm_area_struct *vma, struct cpt_context *ctx) +{ + struct cpt_vma_image *v = cpt_get_buf(ctx); + unsigned long addr; + loff_t saved_object; + struct cpt_page_block pgb; + struct page_area pa; + int cloned_pages = 0; + + cpt_push_object(&saved_object, ctx); + + v->cpt_object = CPT_OBJ_VMA; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start = vma->vm_start; + v->cpt_end = vma->vm_end; + v->cpt_flags = vma->vm_flags; + if (vma->vm_flags&VM_HUGETLB) { + eprintk_ctx("huge TLB VMAs are still not supported\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgprot = vma->vm_page_prot.pgprot; + v->cpt_pgoff = vma->vm_pgoff; + v->cpt_file = CPT_NULL; +#ifndef CONFIG_IA64 + if ((void *)vma->vm_start == vma->vm_mm->context.vdso && + vma->vm_ops == &special_mapping_vmops) + v->cpt_type = CPT_VMA_VDSO; + else +#endif + v->cpt_type = CPT_VMA_TYPE_0; + v->cpt_anonvma = 0; + + /* We have to remember what VMAs are bound to one anon_vma. + * So, we store an identifier of group of VMAs. It is handy + * to use absolute address of anon_vma as this identifier. 
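+ * The address is used purely as an opaque cookie: VMAs that recorded
+ * the same value can be re-attached to a single anon_vma when the
+ * image is restored.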
*/ + v->cpt_anonvmaid = (unsigned long)vma->anon_vma; + + if (vma->vm_file) { + struct file *filp; + cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); + if (obj == NULL) BUG(); + filp = obj->o_obj; + if (filp->f_op == &shm_file_operations) { + struct shm_file_data *sfd = filp->private_data; + + v->cpt_type = CPT_VMA_TYPE_SHM; + obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx); + } + v->cpt_file = obj->o_pos; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + if (v->cpt_type == CPT_VMA_VDSO) + goto out; + + pa.type = PD_ABSENT; + pa.pgoff = vma->vm_pgoff; + pa.mm = CPT_NULL; + pa.start = vma->vm_start; + pa.end = vma->vm_start; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page_desc pd; + + page_get_desc(mmobj, vma, addr, &pd, ctx); + cloned_pages += pd.shared; + + if (pd.type == PD_FUNKEY) { + eprintk_ctx("dump_one_vma: funkey page\n"); + return -EINVAL; + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (pd.type == PD_LAZY && + (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) + pd.type = PD_COPY; +#else + if (pd.type == PD_LAZY) + pd.type = PD_COPY; +#endif + + if (!can_expand(&pa, &pd)) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + pa.start = addr; + } + pa.type = pd.type; + pa.end = addr + PAGE_SIZE; + pa.pgoff = pd.index; + if (addr == pa.start) + pa.list[0] = pd.index; + pa.mm = pd.mm; + } + + if (pa.end > pa.start) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + } + + if (cloned_pages) { + __u32 anonvma = 1; + loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); + ctx->pwrite(&anonvma, 4, ctx, anonpos); + } + +out: + cpt_close_object(ctx); + + cpt_pop_object(&saved_object, ctx); + + return 0; +} + +static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + loff_t saved_object; + struct cpt_aio_ctx_image aimg; + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("AIO is active after suspend\n"); + return -EBUSY; + } + + cpt_push_object(&saved_object, ctx); + + aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); + aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; + aimg.cpt_hdrlen = sizeof(aimg); + aimg.cpt_content = CPT_CONTENT_ARRAY; + + aimg.cpt_max_reqs = aio_ctx->max_reqs; + aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; + aimg.cpt_nr = aio_ctx->ring_info.nr; + aimg.cpt_tail = aio_ctx->ring_info.tail; + aimg.cpt_mmap_base = 
aio_ctx->ring_info.mmap_base; + + ctx->write(&aimg, sizeof(aimg), ctx); + + cpt_pop_object(&saved_object, ctx); + return 0; +} + +static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mm_struct *mm = obj->o_obj; + struct vm_area_struct *vma; + struct cpt_mm_image *v = cpt_get_buf(ctx); + + cpt_open_object(obj, ctx); + + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_MM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start_code = mm->start_code; + v->cpt_end_code = mm->end_code; + v->cpt_start_data = mm->start_data; + v->cpt_end_data = mm->end_data; + v->cpt_start_brk = mm->start_brk; + v->cpt_brk = mm->brk; + v->cpt_start_stack = mm->start_stack; + v->cpt_start_arg = mm->arg_start; + v->cpt_end_arg = mm->arg_end; + v->cpt_start_env = mm->env_start; + v->cpt_end_env = mm->env_end; + v->cpt_def_flags = mm->def_flags; +#ifdef CONFIG_BEANCOUNTERS + v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); +#endif + /* FIXME when coredump mask exceeds 8 bits */ + WARN_ON(mm->flags >> 8); + v->cpt_dumpable = mm->flags; + v->cpt_vps_dumpable = mm->vps_dumpable; + v->cpt_used_hugetlb = 0; /* not used */ +#ifndef CONFIG_IA64 + v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso; +#endif + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + +#ifdef CONFIG_X86 + if (mm->context.size) { + loff_t saved_object; + struct cpt_obj_bits b; + int size; + + dprintk_ctx("nontrivial LDT\n"); + + cpt_push_object(&saved_object, ctx); + + cpt_open_object(NULL, ctx); + b.cpt_next = CPT_NULL; + b.cpt_object = CPT_OBJ_BITS; + b.cpt_hdrlen = sizeof(b); + b.cpt_content = CPT_CONTENT_MM_CONTEXT; + b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; + + ctx->write(&b, sizeof(b), ctx); + + size = mm->context.size*LDT_ENTRY_SIZE; + +#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) + ctx->write(mm->context.ldt, size, ctx); +#else + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + ctx->write(kaddr, bytes, ctx); + kunmap(mm->context.ldt_pages[nr]); + } +#endif + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + } +#endif + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int err; + + if ((err = dump_one_vma(obj, vma, ctx)) != 0) + return err; + } + + if (mm->ioctx_list) { + struct kioctx *aio_ctx; + int err; + + for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) + if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_vm(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + scnt = scnt0 = zcnt = 0; + + cpt_open_section(ctx, CPT_SECT_MM); + + for_each_object(obj, CPT_OBJ_MM) { + int err; + + if ((err = dump_one_mm(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + + if (scnt) + dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); + if (scnt0) + dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); + if (zcnt) + dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); + return 0; +} diff --git a/kernel/cpt/cpt_mm.h b/kernel/cpt/cpt_mm.h new file mode 100644 index 0000000..dc2c483 --- /dev/null +++ b/kernel/cpt/cpt_mm.h @@ -0,0 +1,35 @@ +int cpt_collect_mm(cpt_context_t *); + +int cpt_dump_vm(struct cpt_context *ctx); + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image 
*ti, struct cpt_context *ctx); +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_mm_prepare(unsigned long veid); + +int cpt_free_pgin_dir(struct cpt_context *); +int cpt_start_pagein(struct cpt_context *); +int rst_setup_pagein(struct cpt_context *); +int rst_complete_pagein(struct cpt_context *, int); +int rst_pageind(struct cpt_context *); +int cpt_iteration(cpt_context_t *ctx); +int rst_iteration(cpt_context_t *ctx); +void rst_drop_iter_dir(cpt_context_t *ctx); +int rst_iter(struct vm_area_struct *vma, u64 pfn, + unsigned long addr, cpt_context_t * ctx); + +int rst_swapoff(struct cpt_context *); + +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address); +#endif + +#ifdef CONFIG_X86 +extern struct page *vdso32_pages[1]; +#define vsyscall_addr page_address(vdso32_pages[0]) +#endif + +extern struct vm_operations_struct special_mapping_vmops; diff --git a/kernel/cpt/cpt_net.c b/kernel/cpt/cpt_net.c new file mode 100644 index 0000000..eafbc8b --- /dev/null +++ b/kernel/cpt/cpt_net.c @@ -0,0 +1,614 @@ +/* + * + * kernel/cpt/cpt_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +static void cpt_dump_veth(struct net_device *dev, struct cpt_context * ctx) +{ +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + struct cpt_veth_image v; + struct veth_struct *veth; + + if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open)) + return; + + veth = veth_from_netdev(dev); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_VETH; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_allow_mac_change = veth->allow_mac_change; + + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); +#endif + return; +} + +static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx) +{ + struct cpt_netstats_image *n; + struct net_device_stats *stats; + + if (!dev->get_stats) + return; + + n = cpt_get_buf(ctx); + stats = dev->get_stats(dev); + cpt_open_object(NULL, ctx); + + n->cpt_next = CPT_NULL; + n->cpt_object = CPT_OBJ_NET_STATS; + n->cpt_hdrlen = sizeof(*n); + n->cpt_content = CPT_CONTENT_VOID; + + n->cpt_rx_packets = stats->rx_packets; + n->cpt_tx_packets = stats->tx_packets; + n->cpt_rx_bytes = stats->rx_bytes; + n->cpt_tx_bytes = stats->tx_bytes; + n->cpt_rx_errors = stats->rx_errors; + n->cpt_tx_errors = stats->tx_errors; + n->cpt_rx_dropped = stats->rx_dropped; + n->cpt_tx_dropped = stats->tx_dropped; + n->cpt_multicast = stats->multicast; + n->cpt_collisions = stats->collisions; + n->cpt_rx_length_errors = stats->rx_length_errors; + n->cpt_rx_over_errors = stats->rx_over_errors; + n->cpt_rx_crc_errors = stats->rx_crc_errors; + n->cpt_rx_frame_errors = stats->rx_frame_errors; + n->cpt_rx_fifo_errors = stats->rx_fifo_errors; + n->cpt_rx_missed_errors = stats->rx_missed_errors; + n->cpt_tx_aborted_errors = stats->tx_aborted_errors; + n->cpt_tx_carrier_errors = stats->tx_carrier_errors; + n->cpt_tx_fifo_errors = stats->tx_fifo_errors; + 
n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors; + n->cpt_tx_window_errors = stats->tx_window_errors; + n->cpt_rx_compressed = stats->rx_compressed; + n->cpt_tx_compressed = stats->tx_compressed; + + ctx->write(n, sizeof(*n), ctx); + cpt_close_object(ctx); + cpt_release_buf(ctx); + return; +} + +static void cpt_dump_tuntap(struct net_device *dev, struct cpt_context * ctx) +{ +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + struct cpt_tuntap_image v; + struct tun_struct *tun; + cpt_object_t *obj; + + if (dev->open != tun_net_open) + return; + + tun = netdev_priv(dev); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_TUNTAP; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_owner = tun->owner; + v.cpt_flags = tun->flags; + v.cpt_attached = tun->attached; + + if (tun->bind_file) { + obj = lookup_cpt_object(CPT_OBJ_FILE, tun->bind_file, ctx); + BUG_ON(!obj); + v.cpt_bindfile = obj->o_pos; + } + + BUG_ON(tun->txflt.count != 0); /* FIXME (f271b2cc) */ + + v.cpt_if_flags = 0; + memset(v.cpt_dev_addr, 0, sizeof(v.cpt_dev_addr)); + memset(v.cpt_chr_filter, 0, sizeof(v.cpt_chr_filter)); + memset(v.cpt_net_filter, 0, sizeof(v.cpt_net_filter)); + + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); +#endif + return; +} + +int cpt_dump_link(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_DEVICE); + for_each_netdev(net, dev) { + struct cpt_netdev_image v; + struct cpt_hwaddr_image hw; + loff_t saved_obj; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_DEVICE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_index = dev->ifindex; + v.cpt_flags = dev->flags; + memcpy(v.cpt_name, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + + cpt_dump_tuntap(dev, ctx); + + cpt_dump_veth(dev, ctx); + + /* Dump hardware address */ + cpt_open_object(NULL, ctx); + hw.cpt_next = CPT_NULL; + hw.cpt_object = CPT_OBJ_NET_HWADDR; + hw.cpt_hdrlen = sizeof(hw); + hw.cpt_content = CPT_CONTENT_VOID; + BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr)); + memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr)); + ctx->write(&hw, sizeof(hw), ctx); + cpt_close_object(ctx); + + cpt_dump_netstats(dev, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + if (dev != net->loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + && dev->open != tun_net_open +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + cpt_close_section(ctx); + return -EBUSY; + } + } + cpt_close_section(ctx); + return 0; +} + +int cpt_suspend_network(struct cpt_context *ctx) +{ + get_exec_env()->disable_net = 1; + synchronize_net(); + return 0; +} + +int cpt_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +int cpt_dump_ifaddr(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_IFADDR); + for_each_netdev(net, dev) { + struct in_device *idev = 
in_dev_get(dev); + struct in_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { + struct cpt_ifaddr_image v; + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET; + v.cpt_masklen = ifa->ifa_prefixlen; + v.cpt_flags = ifa->ifa_flags; + v.cpt_scope = ifa->ifa_scope; + memset(&v.cpt_address, 0, sizeof(v.cpt_address)); + memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + v.cpt_address[0] = ifa->ifa_local; + v.cpt_peer[0] = ifa->ifa_address; + v.cpt_broadcast[0] = ifa->ifa_broadcast; + memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in_dev_put(idev); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + for_each_netdev(net, dev) { + struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct cpt_ifaddr_image v; + + if (dev == net->loopback_dev && + ifa->prefix_len == 128 && + ifa->addr.s6_addr32[0] == 0 && + ifa->addr.s6_addr32[1] == 0 && + ifa->addr.s6_addr32[2] == 0 && + ifa->addr.s6_addr32[3] == htonl(1)) + continue; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET6; + v.cpt_masklen = ifa->prefix_len; + v.cpt_flags = ifa->flags; + v.cpt_scope = ifa->scope; + v.cpt_valid_lft = ifa->valid_lft; + v.cpt_prefered_lft = ifa->prefered_lft; + memcpy(&v.cpt_address, &ifa->addr, 16); + memcpy(&v.cpt_peer, &ifa->addr, 16); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + memcpy(v.cpt_label, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in6_dev_put(idev); + } +#endif + cpt_close_section(ctx); + return 0; +} + +#ifdef CONFIG_IP_FIB_TRIE +#error "Trie fib rules are known not to be restored proprly yet" +#endif + +static int cpt_dump_route(struct cpt_context * ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + struct sockaddr_nl nladdr; + struct cpt_object_hdr v; + mm_segment_t oldfs; + char *pg; + + err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETROUTE; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.g.rtgen_family = AF_INET; + + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err < 0) + goto out_sock; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + cpt_open_section(ctx, CPT_SECT_NET_ROUTE); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_ROUTE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NLMARRAY; + + ctx->write(&v, sizeof(v), ctx); + +#if 
defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +restart: +#endif + for (;;) { + struct nlmsghdr *h; + + iov.iov_base = pg; + iov.iov_len = PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + + if (err < 0) + goto out_sock_pg; + if (msg.msg_flags & MSG_TRUNC) { + err = -ENOBUFS; + goto out_sock_pg; + } + + h = (struct nlmsghdr*)pg; + while (NLMSG_OK(h, err)) { + if (h->nlmsg_type == NLMSG_DONE) { + err = 0; + goto done; + } + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); + err = errm->error; + eprintk_ctx("NLMSG error: %d\n", errm->error); + goto done; + } + if (h->nlmsg_type != RTM_NEWROUTE) { + eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); + err = -EINVAL; + goto done; + } + ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); + h = NLMSG_NEXT(h, err); + } + if (err) { + eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type); + err = -EINVAL; + break; + } + } +done: +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (!err && req.g.rtgen_family == AF_INET) { + req.g.rtgen_family = AF_INET6; + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err > 0) + goto restart; + } +#endif + ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +static int dumpfn(void *arg) +{ + int i; + int *pfd = arg; + char *argv[] = { "iptables-save", "-c", NULL }; + + i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump iptables\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + + for (i=0; ifiles->fdt->max_fds; i++) { + if (i != 1) + sc_close(i); + } + + module_put(THIS_MODULE); + + set_fs(KERNEL_DS); + i = sc_execve("/sbin/iptables-save", argv, NULL); + if (i == -ENOENT) + i = sc_execve("/usr/sbin/iptables-save", argv, NULL); + eprintk("failed to exec iptables-save: %d\n", i); + return 255 << 8; +} + + +static int cpt_dump_iptables(struct cpt_context * ctx) +{ + int err = 0; +#ifdef CONFIG_VE_IPTABLES + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + loff_t pos; + int n; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD)) + return 0; + + err = sc_pipe(pfd); + if (err < 0) { + eprintk_ctx("sc_pipe: %d\n", err); + return err; + } + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("local_kernel_thread: %d\n", err); + goto out; + } + + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); + + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + pos = ctx->file->f_pos; + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, 
ctx); + } while (n > 0); + + if (n < 0) + eprintk_ctx("read: %d\n", n); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("iptables-save exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-save terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + if (ctx->file->f_pos != pos) { + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + } else { + pos = ctx->current_section; + cpt_close_object(ctx); + cpt_close_section(ctx); + ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; + ctx->file->f_pos = pos; + } + return n ? : err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); +#endif + return err; +} + +int cpt_dump_ifinfo(struct cpt_context * ctx) +{ + int err; + + rtnl_lock(); + err = cpt_dump_link(ctx); + if (!err) + err = cpt_dump_ifaddr(ctx); + rtnl_unlock(); + if (!err) + err = cpt_dump_route(ctx); + if (!err) + err = cpt_dump_iptables(ctx); + return err; +} diff --git a/kernel/cpt/cpt_net.h b/kernel/cpt/cpt_net.h new file mode 100644 index 0000000..5d33877 --- /dev/null +++ b/kernel/cpt/cpt_net.h @@ -0,0 +1,7 @@ +int cpt_dump_ifinfo(struct cpt_context *ctx); +int rst_restore_net(struct cpt_context *ctx); +int cpt_suspend_network(struct cpt_context *ctx); +int cpt_resume_network(struct cpt_context *ctx); +int rst_resume_network(struct cpt_context *ctx); +int cpt_dump_ip_conntrack(struct cpt_context *ctx); +int rst_restore_ip_conntrack(struct cpt_context * ctx); diff --git a/kernel/cpt/cpt_obj.c b/kernel/cpt/cpt_obj.c new file mode 100644 index 0000000..7ab23d7 --- /dev/null +++ b/kernel/cpt/cpt_obj.c @@ -0,0 +1,162 @@ +/* + * + * kernel/cpt/cpt_obj.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = kmalloc(sizeof(cpt_object_t), gfp);
+	if (obj) {
+		INIT_LIST_HEAD(&obj->o_list);
+		INIT_LIST_HEAD(&obj->o_hash);
+		INIT_LIST_HEAD(&obj->o_alist);
+		obj->o_count = 1;
+		obj->o_pos = CPT_NULL;
+		obj->o_lock = 0;
+		obj->o_parent = NULL;
+		obj->o_index = CPT_NOINDEX;
+		obj->o_obj = NULL;
+		obj->o_image = NULL;
+		ctx->objcount++;
+	}
+	return obj;
+}
+
+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_del(&obj->o_alist);
+	kfree(obj);
+	ctx->objcount--;
+}
+
+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_add_tail(&obj->o_list, &ctx->object_array[type]);
+}
+
+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
+		cpt_object_t *head, cpt_context_t *ctx)
+{
+	list_add(&obj->o_list, &head->o_list);
+}
+
+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
+		unsigned gfp_mask, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj) {
+		obj->o_count++;
+		return obj;
+	}
+
+	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
+		if (p)
+			cpt_obj_setobj(obj, p, ctx);
+		intern_cpt_object(type, obj, ctx);
+		return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
+}
+
+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj)
+		obj->o_count++;
+
+	return obj;
+}
+
+int cpt_object_init(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		INIT_LIST_HEAD(&ctx->object_array[i]);
+	}
+	return 0;
+}
+
+int cpt_object_destroy(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		while (!list_empty(&ctx->object_array[i])) {
+			struct list_head *head = ctx->object_array[i].next;
+			cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+			list_del(head);
+			if (obj->o_image)
+				kfree(obj->o_image);
+			free_cpt_object(obj, ctx);
+		}
+	}
+	if (ctx->objcount != 0)
+		eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
+	return 0;
+}
+
+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_obj == p)
+			return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_pos == pos)
+			return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_index == index)
+			return obj;
+	}
+	return NULL;
+}
diff --git a/kernel/cpt/cpt_obj.h b/kernel/cpt/cpt_obj.h
new file mode 100644
index 0000000..7762623
--- /dev/null
+++ b/kernel/cpt/cpt_obj.h
@@ -0,0 +1,62 @@
+#ifndef __CPT_OBJ_H_
+#define __CPT_OBJ_H_ 1
+
+#include
+#include
+
+typedef struct _cpt_object
+{
+	struct list_head o_list;
+	struct list_head o_hash;
+	int o_count;
+	int o_index;
+	int o_lock;
+	loff_t o_pos;
+	loff_t o_ppos;
+	void *o_obj;
+	void *o_image;
+	void *o_parent;
+	struct list_head o_alist;
+} cpt_object_t;
+
+struct cpt_context;
+
+#define for_each_object(obj, type) list_for_each_entry(obj,
&ctx->object_array[type], o_list) + + +extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); +extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); + +static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx) +{ + cpt->o_pos = pos; + /* Add to pos hash table */ +} + +static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) +{ + cpt->o_obj = ptr; + /* Add to hash table */ +} + +static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) +{ + cpt->o_index = index; + /* Add to index hash table */ +} + + +extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); +extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); + +extern int cpt_object_init(struct cpt_context *ctx); +extern int cpt_object_destroy(struct cpt_context *ctx); + +#endif /* __CPT_OBJ_H_ */ diff --git a/kernel/cpt/cpt_proc.c b/kernel/cpt/cpt_proc.c new file mode 100644 index 0000000..08d5fd4 --- /dev/null +++ b/kernel/cpt/cpt_proc.c @@ -0,0 +1,595 @@ +/* + * + * kernel/cpt/cpt_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void cpt_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + cpt_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); + if (ctx->pgin_dir) + cpt_free_pgin_dir(ctx); + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); +#endif + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + if (ctx->file) + fput(ctx->file); + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } + if (ctx->statusfile) + fput(ctx->statusfile); + if (ctx->lockfile) + fput(ctx->lockfile); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + cpt_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * cpt_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + cpt_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +static cpt_context_t * cpt_context_lookup(unsigned int contextid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == contextid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +int cpt_context_lookup_veid(unsigned int veid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->ve_id == veid && ctx->ctx_state > 0) { + spin_unlock(&cpt_context_lock); + return 1; + } + } + spin_unlock(&cpt_context_lock); + return 0; +} + +static int cpt_ioctl(struct inode * inode, struct file * file, 
unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + int try; + + unlock_kernel(); + + if (cmd == CPT_VMPREP) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = cpt_mm_prepare(arg); +#else + err = -EINVAL; +#endif + goto out_lock; + } + + if (cmd == CPT_TEST_CAPS) { + unsigned int src_flags, dst_flags = arg; + + err = 0; + src_flags = test_cpu_caps(); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = cpt_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + if (ctx->contextid && ctx->contextid != contextid) { + err = -EINVAL; + goto out_nosem; + } + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state == CPT_CTX_DUMPING) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->write == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case 
CPT_SET_PAGEINFDOUT: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_SET_LAZY: + ctx->lazy_vm = arg; + break; + case CPT_ITER: + err = cpt_iteration(ctx); + break; + case CPT_PAGEIND: + err = cpt_start_pagein(ctx); + break; +#endif + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_SET_CPU_FLAGS: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->dst_cpu_flags = arg; + ctx->src_cpu_flags = test_cpu_caps(); + break; + case CPT_SUSPEND: + if (cpt_context_lookup_veid(ctx->ve_id) || + ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ctx_state = CPT_CTX_SUSPENDING; + try = 0; + do { + err = cpt_vps_suspend(ctx); + if (err) + cpt_resume(ctx); + if (err == -EAGAIN) + msleep(1000); + try++; + } while (err == -EAGAIN && try < 3); + if (err) { + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_SUSPENDED; + } + break; + case CPT_DUMP: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + if (!ctx->file) { + err = -EBADF; + break; + } + err = cpt_dump(ctx); + break; + case CPT_RESUME: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_TEST_VECAPS: + { + __u32 dst_flags = arg; + __u32 src_flags; + + err = cpt_vps_caps(ctx, &src_flags); + if (err) + break; + + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err); + if (src_flags & CPT_UNSUPPORTED_MASK) + err = 2; + break; + } + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int cpt_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int cpt_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + module_put(THIS_MODULE); + return 0; +} + + +static struct file_operations cpt_fops = { + .owner = THIS_MODULE, + .open = cpt_open, + .release = cpt_release, + .ioctl = cpt_ioctl, +}; + +static struct 
proc_dir_entry *proc_ent; + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .procname = "cpt", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_cpt(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = proc_create("cpt", 0600, NULL, NULL); + if (!proc_ent) + goto err_out; + + cpt_fops.read = proc_ent->proc_fops->read; + cpt_fops.write = proc_ent->proc_fops->write; + cpt_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &cpt_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_cpt); + +static void __exit exit_cpt(void) +{ + remove_proc_entry("cpt", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_cpt); diff --git a/kernel/cpt/cpt_process.c b/kernel/cpt/cpt_process.c new file mode 100644 index 0000000..8b6d4bf --- /dev/null +++ b/kernel/cpt/cpt_process.c @@ -0,0 +1,1369 @@ +/* + * + * kernel/cpt/cpt_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + +#ifdef CONFIG_X86_32 +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1) +#endif + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx) +{ +#ifdef CONFIG_X86_64 + if (!(task_thread_info(tsk)->flags&_TIF_IA32)) { + if (task_pt_regs(tsk)->ip >= VSYSCALL_START && + task_pt_regs(tsk)->ip < VSYSCALL_END) { + eprintk_ctx(CPT_FID "cannot be checkpointied while vsyscall, try later\n", CPT_TID(tsk)); + return -EAGAIN; + } + } +#endif + return 0; +} + +#ifdef CONFIG_X86 + +static u32 encode_segment(u32 segreg) +{ + segreg &= 0xFFFF; + + if (segreg == 0) + return CPT_SEG_ZERO; + if ((segreg & 3) != 3) { + wprintk("Invalid RPL of a segment reg %x\n", segreg); + return CPT_SEG_ZERO; + } + + /* LDT descriptor, it is just an index to LDT array */ + if (segreg & 4) + return CPT_SEG_LDT + (segreg >> 3); + + /* TLS descriptor. 
*/ + if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN && + (segreg >> 3) <= GDT_ENTRY_TLS_MAX) + return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN); + + /* One of standard desriptors */ +#ifdef CONFIG_X86_64 + if (segreg == __USER32_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER32_CS) + return CPT_SEG_USER32_CS; + if (segreg == __USER_DS) + return CPT_SEG_USER64_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER64_CS; +#else + if (segreg == __USER_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER32_CS; +#endif + wprintk("Invalid segment reg %x\n", segreg); + return CPT_SEG_ZERO; +} + +#ifdef CONFIG_X86_64 +static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, + struct task_struct *tsk) +{ + d->cpt_ebp = s->bp; + d->cpt_ebx = s->bx; + d->cpt_eax = s->ax; + d->cpt_ecx = s->cx; + d->cpt_edx = s->dx; + d->cpt_esi = s->si; + d->cpt_edi = s->di; + d->cpt_orig_eax = s->orig_ax; + d->cpt_eip = s->ip; + d->cpt_xcs = encode_segment(s->cs); + d->cpt_eflags = s->flags; + d->cpt_esp = s->sp; + d->cpt_xss = encode_segment(s->ss); + d->cpt_xds = encode_segment(tsk->thread.ds); + d->cpt_xes = encode_segment(tsk->thread.es); +} + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + cpt_open_object(NULL, ctx); + + if (task_thread_info(tsk)->flags & _TIF_IA32) { + struct cpt_x86_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + ri.cpt_fs = encode_segment(tsk->thread.fsindex); + ri.cpt_gs = encode_segment(tsk->thread.gsindex); + + xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk); + + ctx->write(&ri, sizeof(ri), ctx); + } else { + struct cpt_x86_64_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_64_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_fsbase = tsk->thread.fs; + ri.cpt_gsbase = tsk->thread.gs; + ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); + ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); + ri.cpt_ds = encode_segment(tsk->thread.ds); + ri.cpt_es = encode_segment(tsk->thread.es); + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + + memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); + + ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); + ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); + + ctx->write(&ri, sizeof(ri), ctx); + + } + cpt_close_object(ctx); + + return 0; +} + +#else + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_x86_regs ri; + struct pt_regs *pt_regs; + + cpt_open_object(NULL, ctx); + + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = 
tsk->thread.debugreg3; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + + pt_regs = task_pt_regs(tsk); + + ri.cpt_fs = encode_segment(pt_regs->fs); + ri.cpt_gs = encode_segment(tsk->thread.gs); + + ri.cpt_ebx = pt_regs->bx; + ri.cpt_ecx = pt_regs->cx; + ri.cpt_edx = pt_regs->dx; + ri.cpt_esi = pt_regs->si; + ri.cpt_edi = pt_regs->di; + ri.cpt_ebp = pt_regs->bp; + ri.cpt_eax = pt_regs->ax; + ri.cpt_xds = pt_regs->ds; + ri.cpt_xes = pt_regs->es; + ri.cpt_orig_eax = pt_regs->orig_ax; + ri.cpt_eip = pt_regs->ip; + ri.cpt_xcs = pt_regs->cs; + ri.cpt_eflags = pt_regs->flags; + ri.cpt_esp = pt_regs->sp; + ri.cpt_xss = pt_regs->ss; + + ri.cpt_xcs = encode_segment(pt_regs->cs); + ri.cpt_xss = encode_segment(pt_regs->ss); + ri.cpt_xds = encode_segment(pt_regs->ds); + ri.cpt_xes = encode_segment(pt_regs->es); + + ctx->write(&ri, sizeof(ri), ctx); + cpt_close_object(ctx); + + return 0; +} +#endif +#endif + +#ifdef CONFIG_IA64 + +/* + PMD? + */ + +#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \ + CPT_TID(tsk), err); return -EINVAL; } } while (0) + +static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk, + struct cpt_context *ctx) +{ + int err; + struct unw_frame_info info; + struct ia64_fpreg fpval; + int i; + + unw_init_from_blocked_task(&info, tsk); + _C(unw_unwind_to_user(&info)); + + /* NAT_BITS */ + do { + unsigned long scratch_unat; + + scratch_unat = info.sw->caller_unat; + if (info.pri_unat_loc) + scratch_unat = *info.pri_unat_loc; + + r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat); + /* Just to be on safe side. */ + r->nat[0] &= 0xFFFFFFFFUL; + } while (0); + + /* R4-R7 */ + for (i = 4; i <= 7; i++) { + char nat = 0; + _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0)); + r->nat[0] |= (nat != 0) << i; + } + + /* B1-B5 */ + for (i = 1; i <= 5; i++) { + _C(unw_access_br(&info, i, &r->br[i], 0)); + } + + /* AR_EC, AR_LC */ + _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0)); + _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0)); + + /* F2..F5, F16..F31 */ + for (i = 2; i <= 5; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + for (i = 16; i <= 31; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + return 0; +} + +#undef _C + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + int err; + unsigned long pg; + struct cpt_ia64_regs *r; + struct ia64_psr *psr; + struct switch_stack *sw; + struct pt_regs *pt; + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (tsk->exit_state) + return 0; + + pt = task_pt_regs(tsk); + + sw = (struct switch_stack *) (tsk->thread.ksp + 16); + + if ((pg = __get_free_page(GFP_KERNEL)) == 0) + return -ENOMEM; + + r = (void*)pg; + /* To catch if we forgot some register */ + memset(r, 0xA5, sizeof(*r)); + + r->gr[0] = 0; + r->fr[0] = r->fr[1] = 0; + r->fr[2] = 0x8000000000000000UL; + r->fr[3] = 0xffff; + + r->nat[0] = r->nat[1] = 0; + + err = ass_to_mouth(r, tsk, ctx); + if (err) { + printk("ass_to_mouth error %d\n", err); + goto out; + } + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&r->gr[1], &pt->r1, 8*(2-1)); + memcpy(&r->gr[2], &pt->r2, 8*(4-2)); + memcpy(&r->gr[8], &pt->r8, 8*(12-8)); + memcpy(&r->gr[12], &pt->r12, 8*(14-12)); + memcpy(&r->gr[14], &pt->r14, 8*(15-14)); + memcpy(&r->gr[15], &pt->r15, 8*(16-15)); + memcpy(&r->gr[16], &pt->r16, 8*(32-16)); + + r->br[0] = pt->b0; + r->br[6] = pt->b6; + r->br[7] = pt->b7; + + 
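+	/* Application (ar_*) and control (cr_*) registers below are taken directly from the task's pt_regs. */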
r->ar_bspstore = pt->ar_bspstore; + r->ar_unat = pt->ar_unat; + r->ar_pfs = pt->ar_pfs; + r->ar_ccv = pt->ar_ccv; + r->ar_fpsr = pt->ar_fpsr; + r->ar_csd = pt->ar_csd; + r->ar_ssd = pt->ar_ssd; + r->ar_rsc = pt->ar_rsc; + + r->cr_iip = pt->cr_iip; + r->cr_ipsr = pt->cr_ipsr; + + r->pr = pt->pr; + + r->cfm = pt->cr_ifs; + r->ar_rnat = pt->ar_rnat; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&r->fr[2*6], &pt->f6, 16*(10-6)); + memcpy(&r->fr[2*10], &pt->f10, 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&r->fr[2*12], &sw->f12, 16*(16-12)); + /* fpregs 32...127 */ + psr = ia64_psr(task_pt_regs(tsk)); + preempt_disable(); + if (ia64_is_local_fpu_owner(tsk) && psr->mfh) { + psr->mfh = 0; + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + ia64_save_fpu(&tsk->thread.fph[0]); + } + preempt_enable(); + memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32)); + + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) { + memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr)); + memcpy(r->dbr, tsk->thread.dbr, sizeof(r->ibr)); + } else { + memset(r->ibr, 0, sizeof(r->ibr)); + memset(r->dbr, 0, sizeof(r->dbr)); + } + + r->loadrs = pt->loadrs; + r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19)); + if ((long)pt->cr_ifs > 0) + r->num_regs += (pt->cr_ifs & 0x7f); + + if (r->num_regs > 96) { + eprintk_ctx(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp = ia64_rse_rnat_addr(ptr); + + r->gr[32+reg] = *ptr; + + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = &sw->ar_rnat; + if (*rnatp & (1UL<nat[0] |= (1UL<<(reg+32)); + else + r->nat[1] |= (1UL<<(reg-32)); + } + } + if (r->nat[0] | r->nat[1]) + wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk), + r->nat[1], r->nat[0]); + + cpt_open_object(NULL, ctx); + r->cpt_next = sizeof(*r); + r->cpt_object = CPT_OBJ_IA64_REGS; + r->cpt_hdrlen = sizeof(*r); + r->cpt_content = CPT_CONTENT_VOID; + ctx->write(r, sizeof(*r), ctx); + cpt_close_object(ctx); + err = 0; + +out: + free_page(pg); + return err; +} +#endif + +static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + void *start; + + cpt_open_object(NULL, ctx); + +#ifdef CONFIG_X86_64 + size = tsk->thread.sp0 - tsk->thread.sp; + start = (void*)tsk->thread.sp; +#elif defined(CONFIG_X86_32) + size = tsk->thread.sp0 - tsk->thread.sp; + start = (void*)tsk->thread.sp; +#elif defined(CONFIG_IA64) + size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp; + start = (void*)tsk->thread.ksp; +#else +#error Arch is not supported +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_STACK; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(start, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +#ifdef CONFIG_X86 +/* Formats of i387_fxsave_struct are the same for x86_64 + * and i386. Plain luck. 
*/ + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + int type; + + if (!tsk->thread.xstate) + return 0; + + cpt_open_object(NULL, ctx); + + type = CPT_CONTENT_X86_FPUSTATE; + size = sizeof(struct i387_fxsave_struct); +#ifndef CONFIG_X86_64 + if (!cpu_has_fxsr) { + size = sizeof(struct i387_fsave_struct); + type = CPT_CONTENT_X86_FPUSTATE_OLD; + } +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = type; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(tsk->thread.xstate, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} +#endif + +#ifdef CONFIG_IA64 + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + return 0; +} +#endif + +static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) +{ + si->cpt_signo = info->si_signo; + si->cpt_errno = info->si_errno; + si->cpt_code = info->si_code; + + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + si->cpt_pid = info->si_tid; + si->cpt_uid = info->si_overrun; + si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); + si->cpt_utime = info->si_sys_private; + break; + case __SI_POLL: + si->cpt_pid = info->si_band; + si->cpt_uid = info->si_fd; + break; + case __SI_FAULT: + si->cpt_sigval = cpt_ptr_export(info->si_addr); +#ifdef __ARCH_SI_TRAPNO + si->cpt_pid = info->si_trapno; +#endif + break; + case __SI_CHLD: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = info->si_status; + si->cpt_stime = info->si_stime; + si->cpt_utime = info->si_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = cpt_ptr_export(info->si_ptr); + break; + } + return 0; +} + +static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) +{ + struct sigqueue *q; + loff_t saved_obj; + + if (list_empty(&list->list)) + return 0; + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(q, &list->list, list) { + struct cpt_siginfo_image si; + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_qflags = q->flags; + si.cpt_user = q->user->uid; + + if (encode_siginfo(&si, &q->info)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + } + cpt_pop_object(&saved_obj, ctx); + return 0; +} + + + +static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct signal_struct *sig = obj->o_obj; + struct cpt_signal_image *v = cpt_get_buf(ctx); + struct task_struct *tsk; + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if (sig->__pgrp <= 0) { + eprintk_ctx("bad pgid\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ns(sig->__pgrp, &init_pid_ns); + if (tsk == NULL) + v->cpt_pgrp_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_pgrp = pid_to_vpid(sig->__pgrp); + + v->cpt_old_pgrp = 0; +/* if (!sig->tty_old_pgrp) { + eprintk_ctx("bad tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + }*/ + if (sig->tty_old_pgrp) { + v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = 
pid_task(sig->tty_old_pgrp, PIDTYPE_PID); + if (tsk == NULL) { + v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; + tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID); + } + read_unlock(&tasklist_lock); + if (tsk == NULL) { + eprintk_ctx("tty_old_pgrp does not exist anymore\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp); + if ((int)v->cpt_old_pgrp < 0) { + dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp)); + v->cpt_old_pgrp = -1; + v->cpt_old_pgrp_type = CPT_PGRP_STRAY; + } + } + + if (sig->__session <= 0) { + eprintk_ctx("bad session\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_session_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ns(sig->__session, &init_pid_ns); + if (tsk == NULL) + v->cpt_session_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_session = pid_to_vpid(sig->__session); + + v->cpt_leader = sig->leader; + v->cpt_ctty = CPT_NULL; + if (sig->tty) { + cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); + if (cobj) + v->cpt_ctty = cobj->o_pos; + else { + eprintk_ctx("controlling tty is not found\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); + + v->cpt_curr_target = 0; + if (sig->curr_target) + v->cpt_curr_target = task_pid_vnr(sig->curr_target); + v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); + v->cpt_group_exit_code = sig->group_exit_code; + v->cpt_group_exit_task = 0; + if (sig->group_exit_task) + v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task); + v->cpt_notify_count = sig->notify_count; + v->cpt_group_stop_count = sig->group_stop_count; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) + v->cpt_utime = sig->utime; + v->cpt_stime = sig->stime; + v->cpt_cutime = sig->cutime; + v->cpt_cstime = sig->cstime; + v->cpt_nvcsw = sig->nvcsw; + v->cpt_nivcsw = sig->nivcsw; + v->cpt_cnvcsw = sig->cnvcsw; + v->cpt_cnivcsw = sig->cnivcsw; + v->cpt_min_flt = sig->min_flt; + v->cpt_maj_flt = sig->maj_flt; + v->cpt_cmin_flt = sig->cmin_flt; + v->cpt_cmaj_flt = sig->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + __asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = sig->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = sig->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#endif + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + dump_sigqueue(&sig->shared_pending, ctx); + + cpt_close_object(ctx); + return 0; +} + +int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx) +{ + if (tsk->splice_pipe) { + eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#ifdef CONFIG_KEYS + if (tsk->request_key_auth || tsk->thread_keyring) { + eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif +#ifdef CONFIG_NUMA + if (tsk->mempolicy) { + eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif +#ifdef CONFIG_TUX + if (tsk->tux_info) { + eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif + return 0; +} + +static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + int last_thread; + struct cpt_task_image *v = cpt_get_buf(ctx); + cpt_object_t *tobj; + cpt_object_t *tg_obj; + loff_t saved_obj; + int i; + int err; + struct timespec delta; + struct mm_struct * tsk_mm; + struct files_struct * tsk_files; + 
struct fs_struct * tsk_fs; + struct mnt_namespace * tsk_ns; + + cpt_open_object(obj, ctx); + + v->cpt_signal = CPT_NULL; + tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx); + if (!tg_obj) BUG(); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_TASK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_state = tsk->state; + if (tsk->state == EXIT_ZOMBIE) { + eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } else if (tsk->state == EXIT_DEAD) { + if (tsk->exit_state != EXIT_DEAD && + tsk->exit_state != EXIT_ZOMBIE) { + eprintk_ctx("invalid exit_state %d on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } + } + if (tsk->exit_state) { + v->cpt_state = tsk->exit_state; + if (tsk->state != TASK_DEAD) { + eprintk_ctx("invalid tsk->state %ld/%d on" CPT_FID "\n", + tsk->state, tsk->exit_state, CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } + } + if (cpt_check_unsupported(tsk, ctx)) { + cpt_release_buf(ctx); + return -EBUSY; + } + + v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART); + v->cpt_ptrace = tsk->ptrace; + v->cpt_prio = tsk->prio; + v->cpt_exit_code = tsk->exit_code; + v->cpt_exit_signal = tsk->exit_signal; + v->cpt_pdeath_signal = tsk->pdeath_signal; + v->cpt_static_prio = tsk->static_prio; + v->cpt_rt_priority = tsk->rt_priority; + v->cpt_policy = tsk->policy; + if (v->cpt_policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + cpt_release_buf(ctx); + return -EINVAL; + } + + /* Unpleasant moment. When leader of thread group exits, + * it remains in zombie state until all the group exits. + * We save not-NULL pointers to process mm/files/fs, so + * that we can restore this thread group. + */ + tsk_mm = tsk->mm; + tsk_files = tsk->files; + tsk_fs = tsk->fs; + tsk_ns = tsk->nsproxy ? 
tsk->nsproxy->mnt_ns : NULL; + + if (tsk->exit_state && !thread_group_empty(tsk) && + thread_group_leader(tsk)) { + struct task_struct * p = tsk; + + read_lock(&tasklist_lock); + do { + if (p->mm) + tsk_mm = p->mm; + if (p->files) + tsk_files = p->files; + if (p->fs) + tsk_fs = p->fs; + if (p->nsproxy && p->nsproxy->mnt_ns) + tsk_ns = p->nsproxy->mnt_ns; + p = next_thread(p); + } while (p != tsk); + read_unlock(&tasklist_lock); + } + + v->cpt_mm = CPT_NULL; + if (tsk_mm) { + tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx); + if (!tobj) BUG(); + v->cpt_mm = tobj->o_pos; + } + v->cpt_files = CPT_NULL; + if (tsk_files) { + tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx); + if (!tobj) BUG(); + v->cpt_files = tobj->o_pos; + } + v->cpt_fs = CPT_NULL; + if (tsk_fs) { + tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx); + if (!tobj) BUG(); + v->cpt_fs = tobj->o_pos; + } + v->cpt_namespace = CPT_NULL; + if (tsk_ns) { + tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx); + if (!tobj) BUG(); + v->cpt_namespace = tobj->o_pos; + + if (tsk_ns != current->nsproxy->mnt_ns) + eprintk_ctx("namespaces are not supported:" + "process " CPT_FID "\n", CPT_TID(tsk)); + } + v->cpt_sysvsem_undo = CPT_NULL; + if (tsk->sysvsem.undo_list && !tsk->exit_state) { + tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); + if (!tobj) BUG(); + v->cpt_sysvsem_undo = tobj->o_pos; + } + v->cpt_sighand = CPT_NULL; + if (tsk->sighand) { + tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); + if (!tobj) BUG(); + v->cpt_sighand = tobj->o_pos; + } + v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); + v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); + v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); + + v->cpt_pid = task_pid_vnr(tsk); + v->cpt_tgid = task_tgid_vnr(tsk); + v->cpt_ppid = 0; + if (tsk->parent) { + if (tsk->parent != tsk->real_parent && + !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { + eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + cpt_release_buf(ctx); + return -EBUSY; + } + v->cpt_ppid = task_pid_vnr(tsk->parent); + } + v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0; + v->cpt_pgrp = task_pgrp_vnr(tsk); + v->cpt_session = task_session_vnr(tsk); + v->cpt_old_pgrp = 0; + if (tsk->signal->tty_old_pgrp) + v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp); + v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0; + v->cpt_set_tid = (unsigned long)tsk->set_child_tid; + v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; + memcpy(v->cpt_comm, tsk->comm, 16); + v->cpt_user = tsk->user->uid; + v->cpt_uid = tsk->uid; + v->cpt_euid = tsk->euid; + v->cpt_suid = tsk->suid; + v->cpt_fsuid = tsk->fsuid; + v->cpt_gid = tsk->gid; + v->cpt_egid = tsk->egid; + v->cpt_sgid = tsk->sgid; + v->cpt_fsgid = tsk->fsgid; + v->cpt_ngids = 0; + if (tsk->group_info && tsk->group_info->ngroups != 0) { + int i = tsk->group_info->ngroups; + if (i > 32) { + /* Shame... I did a simplified version and _forgot_ + * about this. Later, later. 
*/ + eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + v->cpt_ngids = i; + for (i--; i>=0; i--) + v->cpt_gids[i] = tsk->group_info->small_block[i]; + } + v->cpt_prctl_uac = 0; + v->cpt_prctl_fpemu = 0; + v->__cpt_pad1 = 0; +#ifdef CONFIG_IA64 + v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT; + v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT; +#endif + memcpy(&v->cpt_ecap, &tsk->cap_effective, 8); + memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8); + memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8); + v->cpt_keepcap = tsk->securebits; + + v->cpt_did_exec = tsk->did_exec; + v->cpt_exec_domain = -1; + v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<cpt_64bit = 0; +#ifdef CONFIG_X86_64 + /* Clear x86_64 specific flags */ + v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32); + if (!(task_thread_info(tsk)->flags & _TIF_IA32)) { + ctx->tasks64++; + v->cpt_64bit = 1; + } +#endif +#ifdef CONFIG_IA64 + /* Clear ia64 specific flags */ + //// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32); + if (!IS_IA32_PROCESS(task_pt_regs(tsk))) { + ctx->tasks64++; + v->cpt_64bit = 1; + } +#endif + v->cpt_thrstatus = task_thread_info(tsk)->status; + v->cpt_addr_limit = -1; + + v->cpt_personality = tsk->personality; + +#ifdef CONFIG_X86 + for (i=0; i=3) { + eprintk_ctx("too many tls descs\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a; + } +#endif + + v->cpt_restart.fn = CPT_RBL_0; + if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) { + struct restart_block *rb = &task_thread_info(tsk)->restart_block; + ktime_t e; + + if (rb->fn == hrtimer_nanosleep_restart) { + v->cpt_restart.fn = CPT_RBL_NANOSLEEP; + + e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (rb->fn == compat_nanosleep_restart) { + v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP; + + e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } +#endif + if (rb->fn == do_restart_poll) { + u64 timeout_jiffies; + + timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2; + e.tv64 = timeout_jiffies * TICK_NSEC; + + v->cpt_restart.fn = CPT_RBL_POLL; + v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } + if (rb->fn == futex_wait_restart) { + v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT; + + e.tv64 = rb->futex.time; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr; + v->cpt_restart.arg1 = rb->futex.val; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = rb->futex.flags; + 
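+		/* As in the nanosleep cases above, the absolute futex expiry is rebased against ctx->cpt_monotonic_time before being saved. */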
goto continue_dump; + } + eprintk_ctx("unknown restart block %p\n", rb->fn); + return -EINVAL; + } + +continue_dump: + v->cpt_it_real_incr = 0; + v->cpt_it_prof_incr = 0; + v->cpt_it_virt_incr = 0; + v->cpt_it_real_value = 0; + v->cpt_it_prof_value = 0; + v->cpt_it_virt_value = 0; + if (thread_group_leader(tsk) && tsk->exit_state == 0) { + ktime_t rem; + + v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr); + v->cpt_it_prof_incr = tsk->signal->it_prof_incr; + v->cpt_it_virt_incr = tsk->signal->it_virt_incr; + + rem = hrtimer_get_remaining(&tsk->signal->real_timer); + + if (hrtimer_active(&tsk->signal->real_timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + v->cpt_it_real_value = ktime_to_ns(rem); + dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value); + } + v->cpt_it_prof_value = tsk->signal->it_prof_expires; + v->cpt_it_virt_value = tsk->signal->it_virt_expires; + } + v->cpt_used_math = (tsk_used_math(tsk) != 0); + + if (tsk->notifier) { + eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + cpt_release_buf(ctx); + return -EINVAL; + } + + v->cpt_utime = tsk->utime; + v->cpt_stime = tsk->stime; + delta = tsk->start_time; + _set_normalized_timespec(&delta, + delta.tv_sec - get_exec_env()->start_timespec.tv_sec, + delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + v->cpt_starttime = cpt_timespec_export(&delta); + v->cpt_nvcsw = tsk->nvcsw; + v->cpt_nivcsw = tsk->nivcsw; + v->cpt_min_flt = tsk->min_flt; + v->cpt_maj_flt = tsk->maj_flt; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) + v->cpt_cutime = tsk->cutime; + v->cpt_cstime = tsk->cstime; + v->cpt_cnvcsw = tsk->cnvcsw; + v->cpt_cnivcsw = tsk->cnivcsw; + v->cpt_cmin_flt = tsk->cmin_flt; + v->cpt_cmaj_flt = tsk->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + __asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = tsk->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#else + v->cpt_cutime = tsk->signal->cutime; + v->cpt_cstime = tsk->signal->cstime; + v->cpt_cnvcsw = tsk->signal->cnvcsw; + v->cpt_cnivcsw = tsk->signal->cnivcsw; + v->cpt_cmin_flt = tsk->signal->cmin_flt; + v->cpt_cmaj_flt = tsk->signal->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + __asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#endif + +#ifdef CONFIG_BEANCOUNTERS + if (tsk->mm) + v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx); + else + v->cpt_mm_ub = CPT_NULL; + v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx); + v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx); + v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx); +#endif + + v->cpt_ptrace_message = tsk->ptrace_message; + v->cpt_pn_state = tsk->pn_state; + v->cpt_stopped_state = tsk->stopped_state; + v->cpt_sigsuspend_state = 0; + +#ifdef CONFIG_X86_32 + if (tsk->thread.vm86_info) { + eprintk_ctx("vm86 task is running\n"); + cpt_release_buf(ctx); + return -EBUSY; + } +#endif + + v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + dump_kstack(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + + cpt_push_object(&saved_obj, ctx); + err = dump_registers(tsk, ctx); + 
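+	/* dump_registers() emits a CPT_OBJ_X86_REGS / CPT_OBJ_X86_64_REGS (or CPT_OBJ_IA64_REGS) record nested inside the task image. */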
cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + if (tsk_used_math(tsk)) { + cpt_push_object(&saved_obj, ctx); + dump_fpustate(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->last_siginfo) { + struct cpt_siginfo_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_LASTSIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + if (encode_siginfo(&si, tsk->last_siginfo)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->sas_ss_size) { + struct cpt_sigaltstack_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGALTSTACK; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_stack = tsk->sas_ss_sp; + si.cpt_stacksize = tsk->sas_ss_size; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->robust_list +#ifdef CONFIG_COMPAT + || tsk->compat_robust_list +#endif + ) { + struct cpt_task_aux_image ai; + cpt_push_object(&saved_obj, ctx); + + ai.cpt_next = sizeof(ai); + ai.cpt_object = CPT_OBJ_TASK_AUX; + ai.cpt_hdrlen = sizeof(ai); + ai.cpt_content = CPT_CONTENT_VOID; + + ai.cpt_robust_list = (unsigned long)tsk->robust_list; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags & _TIF_IA32) + ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list; +#endif +#endif + ctx->write(&ai, sizeof(ai), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + dump_sigqueue(&tsk->pending, ctx); + + last_thread = 1; + read_lock(&tasklist_lock); + do { + struct task_struct * next = next_thread(tsk); + if (next != tsk && !thread_group_leader(next)) + last_thread = 0; + } while (0); + read_unlock(&tasklist_lock); + + if (last_thread) { + struct task_struct *prev_tsk; + int err; + loff_t pos = ctx->file->f_pos; + + cpt_push_object(&saved_obj, ctx); + err = dump_one_signal_struct(tg_obj, ctx); + cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + prev_tsk = tsk; + for (;;) { + if (prev_tsk->tgid == tsk->tgid) { + loff_t tg_pos; + + tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); + ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); + if (thread_group_leader(prev_tsk)) + break; + } + + if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) { + eprintk_ctx("bug: thread group leader is lost\n"); + return -EINVAL; + } + + obj = list_entry(obj->o_list.prev, cpt_object_t, o_list); + prev_tsk = obj->o_obj; + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_tasks(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TASKS); + + for_each_object(obj, CPT_OBJ_TASK) { + int err; + + if ((err = dump_one_process(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_signals(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { + eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + return -EBUSY; + } + if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) + return -ENOMEM; + if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) + return -ENOMEM; + } + return 0; +} + + +static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct 
sighand_struct *sig = obj->o_obj; + struct cpt_sighand_image *v = cpt_get_buf(ctx); + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + for (i=0; i< _NSIG; i++) { + if (sig->action[i].sa.sa_handler != SIG_DFL || + sig->action[i].sa.sa_flags) { + loff_t saved_obj; + struct cpt_sighandler_image *o = cpt_get_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + o->cpt_next = CPT_NULL; + o->cpt_object = CPT_OBJ_SIGHANDLER; + o->cpt_hdrlen = sizeof(*o); + o->cpt_content = CPT_CONTENT_VOID; + + o->cpt_signo = i; + o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; + o->cpt_restorer = 0; +#ifdef CONFIG_X86 + o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; +#endif + o->cpt_flags = sig->action[i].sa.sa_flags; + memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); + ctx->write(o, sizeof(*o), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_sighand(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); + + for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) { + int err; + + if ((err = dump_one_sighand_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff --git a/kernel/cpt/cpt_process.h b/kernel/cpt/cpt_process.h new file mode 100644 index 0000000..b9f28af --- /dev/null +++ b/kernel/cpt/cpt_process.h @@ -0,0 +1,13 @@ +int cpt_collect_signals(cpt_context_t *); +int cpt_dump_signal(struct cpt_context *); +int cpt_dump_sighand(struct cpt_context *); +int cpt_dump_tasks(struct cpt_context *); + +int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx); +__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int rst_restore_process(struct cpt_context *ctx); +int rst_process_linkage(struct cpt_context *ctx); + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx); +struct pid *alloc_vpid_safe(pid_t vnr); diff --git a/kernel/cpt/cpt_socket.c b/kernel/cpt/cpt_socket.c new file mode 100644 index 0000000..4878df1 --- /dev/null +++ b/kernel/cpt/cpt_socket.c @@ -0,0 +1,790 @@ +/* + * + * kernel/cpt/cpt_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx); + + +/* Sockets are quite different of another kinds of files. + * There is one simplification: only one struct file can refer to a socket, + * so we could store information about socket directly in section FILES as + * a description of a file and append f.e. array of not-yet-accepted + * connections of listening socket as array of auxiliary data. + * + * Complications are: + * 1. TCP sockets can be orphans. We have to relocate orphans as well, + * so we have to create special section for orphans. + * 2. 
AF_UNIX sockets are distinguished objects: the set of links between
+ *    AF_UNIX sockets is quite arbitrary.
+ *    A. Each socket can refer to many files due to FD passing.
+ *    B. Each socket, except for connected ones, can have skbs in its
+ *       queue that were sent by any other socket.
+ *
+ * 2A is relatively easy: after our tasks are frozen we make an additional
+ * recursive pass through the set of collected files and pick up the files
+ * referenced by passed FDs. After the recursion ends, all the files are
+ * treated in the same way and stored in section FILES.
+ *
+ * 2B. We have to resolve all those references at some point.
+ * This is where a pipe-like approach to the image fails.
+ *
+ * All this makes socket checkpointing quite cumbersome.
+ * Right now we collect all the sockets and assign a numeric index value
+ * to each of them. The socket section is separate and placed after section
+ * FILES, so section FILES refers to sockets by index, while section SOCKET
+ * refers to FILES as usual by position in the image. All the references
+ * inside the socket section are by index. When restoring we read the socket
+ * section and create objects to hold the index <-> pos mappings. On the
+ * second pass we open the sockets (simultaneously with their pairs) and
+ * create the FILE objects.
+ */
+
+
+/* ====== FD passing ====== */
+
+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
+ * have to implement this. The problem is that in the general case we
+ * receive skbs from an unknown context, so new files can arrive at the
+ * checkpointed set of processes even after they are stopped. We simply
+ * ignore unknown fds while doing the real checkpointing. This is fair
+ * because links outside the checkpointed set are going to fail anyway.
+ *
+ * ATTN: the procedure is recursive. We linearize the recursion by adding
+ * newly found files to the end of the file list, so they will be analyzed
+ * in the same loop.
+ */
+
+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return -ENOTSOCK;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	if (sock->ops->family != AF_UNIX)
+		return 0;
+
+	sk = sock->sk;
+
+	/* Subtle locking issue: skbs cannot be removed while we are
+	 * scanning, because all the processes are stopped. They can
+	 * still be added to the tail of the queue. Locking while we
+	 * dereference skb->next is enough to resolve this.
+	 * See above about the collision with skbs added after we
+	 * started checkpointing.
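 *
 * For reference, the in-flight descriptors collected by the loop below
 * are the ones a sender queued with an SCM_RIGHTS control message
 * (together with at least one byte of ordinary data), roughly:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type  = SCM_RIGHTS;
 *	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *	sendmsg(sock, &msg, 0);
 *
 * Until the receiver calls recvmsg(), the corresponding struct file
 * pointers live only in UNIXCB(skb).fp, which is what the loop walks.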
+ */ + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + if (UNIXCB(skb).fp && skb->sk && + (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + int i; + + for (i = fpl->count-1; i >= 0; i--) { + if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL) + return -ENOMEM; + } + } + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_collect_passedfds(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + int err; + + if ((err = collect_one_passedfd(file, ctx)) < 0) + return err; + } + } + + return 0; +} + +/* ====== End of FD passing ====== */ + +/* Must be called under bh_lock_sock() */ + +void clear_backlog(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + while (skb) { + struct sk_buff *next = skb->next; + + skb->next = NULL; + kfree_skb(skb); + skb = next; + } +} + +void release_sock_nobacklog(struct sock *sk) +{ + spin_lock_bh(&(sk->sk_lock.slock)); + clear_backlog(sk); + sk->sk_lock.owned = 0; + if (waitqueue_active(&(sk->sk_lock.wq))) + wake_up(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.slock)); +} + +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, + struct cpt_context *ctx) +{ + struct cpt_skb_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + struct timeval tmptv; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SKB; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_owner = owner; + v->cpt_queue = type; + skb_get_timestamp(skb, &tmptv); + v->cpt_stamp = cpt_timeval_export(&tmptv); + v->cpt_hspace = skb->data - skb->head; + v->cpt_tspace = skb->end - skb->tail; + v->cpt_h = skb_transport_header(skb) - skb->head; + v->cpt_nh = skb_network_header(skb) - skb->head; + v->cpt_mac = skb_mac_header(skb) - skb->head; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb)); + memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); + if (sizeof(skb->cb) > sizeof(v->cpt_cb)) { + int i; + for (i=sizeof(v->cpt_cb); icb); i++) { + if (skb->cb[i]) { + wprintk_ctx("dirty skb cb"); + break; + } + } + } + v->cpt_len = skb->len; + v->cpt_mac_len = skb->mac_len; + v->cpt_csum = skb->csum; + v->cpt_local_df = skb->local_df; + v->cpt_pkt_type = skb->pkt_type; + v->cpt_ip_summed = skb->ip_summed; + v->cpt_priority = skb->priority; + v->cpt_protocol = skb->protocol; + v->cpt_security = 0; + v->cpt_gso_segs = skb_shinfo(skb)->gso_segs; + v->cpt_gso_size = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type) { + eprintk_ctx("skb ufo is not supported\n"); + return -EINVAL; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (skb->len + (skb->data - skb->head) > 0) { + struct cpt_obj_bits ob; + loff_t saved_obj2; + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + ob.cpt_next = CPT_NULL; + ob.cpt_object = CPT_OBJ_BITS; + ob.cpt_hdrlen = sizeof(ob); + ob.cpt_content = CPT_CONTENT_DATA; + ob.cpt_size = skb->len + v->cpt_hspace; + + ctx->write(&ob, sizeof(ob), ctx); + + ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx); + if (skb->data_len) { + int offset = skb->len - skb->data_len; + while (offset < skb->len) { + int copy = skb->len - offset; + 
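			/* Nonlinear skbs keep part of their payload in page
			 * fragments, so it cannot be written straight from
			 * skb->head.  skb_copy_bits() linearises it into the
			 * per-context temporary buffer one page at a time:
			 *
			 *	skb_copy_bits(skb, offset, ctx->tmpbuf, copy);
			 *	ctx->write(ctx->tmpbuf, copy, ctx);
			 *
			 * which is exactly what the statements below do.
			 */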
if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy)) + BUG(); + ctx->write(ctx->tmpbuf, copy, ctx); + __cpt_release_buf(ctx); + offset += copy; + } + } + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + + if (skb->sk && skb->sk->sk_family == AF_UNIX) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + + if (fpl) { + int i; + + for (i = 0; i < fpl->count; i++) { + struct cpt_fd_image v; + cpt_object_t *obj; + loff_t saved_obj2; + + obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); + + if (!obj) { + eprintk_ctx("lost passed FD\n"); + return -EINVAL; + } + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_FILEDESC; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_fd = i; + v.cpt_file = obj->o_pos; + v.cpt_flags = 0; + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + } + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return 0; +} + +static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct sock *sk_cache = NULL; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + int err; + + if (sk->sk_family == AF_UNIX) { + cpt_object_t *obj; + if (skb->sk != sk_cache) { + idx = -1; + sk_cache = NULL; + obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); + if (obj) { + idx = obj->o_index; + sk_cache = skb->sk; + } else if (unix_peer(sk) != skb->sk) + goto next_skb; + } + } + + err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); + if (err) + return err; + +next_skb: + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + return 0; +} + +static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_write_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { + int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); + if (err) + return err; + + spin_lock_irq(&sk->sk_write_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_write_queue.lock); + } + return 0; +} + +void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx) +{ + loff_t saved_obj; + if (sk->sk_filter) { + struct cpt_obj_bits v; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SKFILTER; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_DATA; + v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter); + + ctx->write(&v, sizeof(v), ctx); + ctx->write(sk->sk_filter->insns, v.cpt_size, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + cpt_push_object(&saved_obj, ctx); + cpt_dump_mcfilter(sk, ctx); + cpt_pop_object(&saved_obj, ctx); + } +} + +/* Dump socket content */ + +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx) +{ + struct cpt_sock_image *v = cpt_get_buf(ctx); + struct socket *sock; + struct timeval tmptv; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SOCKET; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_file = CPT_NULL; + sock = sk->sk_socket; + if (sock && sock->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, 
ctx); + if (tobj) + v->cpt_file = tobj->o_pos; + } + v->cpt_index = index; + v->cpt_parent = parent; + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + if (sock && !obj->o_lock) { + lockdep_off(); + lock_sock(sk); + lockdep_on(); + obj->o_lock = 1; + } + } + + /* Some bits stored in inode */ + v->cpt_ssflags = sock ? sock->flags : 0; + v->cpt_sstate = sock ? sock->state : 0; + v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0; + + /* Common data */ + v->cpt_family = sk->sk_family; + v->cpt_type = sk->sk_type; + v->cpt_state = sk->sk_state; + v->cpt_reuse = sk->sk_reuse; + v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED); + v->cpt_shutdown = sk->sk_shutdown; + v->cpt_userlocks = sk->sk_userlocks; + v->cpt_no_check = sk->sk_no_check; + v->cpt_zapped = sock_flag(sk, SOCK_DBG); + v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP); + v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE); + v->cpt_protocol = sk->sk_protocol; + v->cpt_err = sk->sk_err; + v->cpt_err_soft = sk->sk_err_soft; + v->cpt_max_ack_backlog = sk->sk_max_ack_backlog; + v->cpt_priority = sk->sk_priority; + v->cpt_rcvlowat = sk->sk_rcvlowat; + v->cpt_rcvtimeo = CPT_NULL; + if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo; + v->cpt_sndtimeo = CPT_NULL; + if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; + v->cpt_rcvbuf = sk->sk_rcvbuf; + v->cpt_sndbuf = sk->sk_sndbuf; + v->cpt_bound_dev_if = sk->sk_bound_dev_if; + v->cpt_flags = sk->sk_flags; + v->cpt_lingertime = CPT_NULL; + if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) + v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime; + v->cpt_peer_pid = sk->sk_peercred.pid; + v->cpt_peer_uid = sk->sk_peercred.uid; + v->cpt_peer_gid = sk->sk_peercred.gid; + tmptv = ktime_to_timeval(sk->sk_stamp); + v->cpt_stamp = cpt_timeval_export(&tmptv); + + v->cpt_peer = -1; + v->cpt_socketpair = 0; + v->cpt_deleted = 0; + + v->cpt_laddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_laddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); + if (err) { + cpt_release_buf(ctx); + return err; + } + v->cpt_laddrlen = alen; + } + v->cpt_raddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_raddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); + if (!err) + v->cpt_raddrlen = alen; + } + + if (sk->sk_family == AF_UNIX) { + if (unix_sk(sk)->dentry) { + struct dentry *d = unix_sk(sk)->dentry; + v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); + if (!v->cpt_deleted) { + int err = 0; + char *path; + struct path p; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) { + cpt_release_buf(ctx); + return -ENOMEM; + } + + p.dentry = d; + p.mnt = unix_sk(sk)->mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) { + int len = strlen(path); + if (len < 126) { + strcpy(((char*)v->cpt_laddr)+2, path); + v->cpt_laddrlen = len + 2; + } else { + wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); + } + err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); + } else { + eprintk_ctx("cannot get path of an af_unix socket\n"); + err = PTR_ERR(path); + } + free_page(pg); + if (err) { + cpt_release_buf(ctx); + return err; + } + } + } + + /* If the socket is connected, find its peer. If peer is not + * in our table, the socket is connected to external process + * and we consider it disconnected. 
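 * A mutually connected pair, i.e. what socketpair(AF_UNIX, SOCK_STREAM,
 * 0, fds) produces in userspace, is detected below by checking that
 * unix_peer(unix_peer(sk)) == sk and is flagged via cpt_socketpair,
 * presumably so that restore can recreate both ends together rather
 * than replaying a connect().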
+ */ + if (unix_peer(sk)) { + cpt_object_t *pobj; + pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); + if (pobj) + v->cpt_peer = pobj->o_index; + else + v->cpt_shutdown = SHUTDOWN_MASK; + + if (unix_peer(unix_peer(sk)) == sk) + v->cpt_socketpair = 1; + } + + /* If the socket shares address with another socket it is + * child of some listening socket. Find and record it. */ + if (unix_sk(sk)->addr && + atomic_read(&unix_sk(sk)->addr->refcnt) > 1 && + sk->sk_state != TCP_LISTEN) { + cpt_object_t *pobj; + for_each_object(pobj, CPT_OBJ_SOCKET) { + struct sock *psk = pobj->o_obj; + if (psk->sk_family == AF_UNIX && + psk->sk_state == TCP_LISTEN && + unix_sk(psk)->addr == unix_sk(sk)->addr) { + v->cpt_parent = pobj->o_index; + break; + } + } + } + } + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + cpt_dump_socket_in(v, sk, ctx); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_dump_sock_attr(sk, ctx); + + dump_rqueue(index, sk, ctx); + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + dump_wqueue(index, sk, ctx); + cpt_dump_ofo_queue(index, sk, ctx); + } + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_synwait_queue(sk, index, ctx); + + cpt_close_object(ctx); + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_accept_queue(sk, index, ctx); + + return 0; +} + +int cpt_dump_orphaned_sockets(struct cpt_context *ctx) +{ + int i; + + cpt_open_section(ctx, CPT_SECT_ORPHANS); + + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_node *node; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +retry: + read_lock_bh(lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) { + + if (sk->owner_env != get_exec_env()) + continue; + if (sk->sk_socket) + continue; + if (!sock_flag(sk, SOCK_DEAD)) + continue; + if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx)) + continue; + sock_hold(sk); + read_unlock_bh(lock); + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("BUG: sk locked by whom?\n"); + sk->sk_lock.owned = 1; + bh_unlock_sock(sk); + local_bh_enable(); + + cpt_dump_socket(NULL, sk, -1, -1, ctx); + + local_bh_disable(); + bh_lock_sock(sk); + sk->sk_lock.owned = 0; + clear_backlog(sk); + tcp_done(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + + goto retry; + } + read_unlock_bh(lock); + } + cpt_close_section(ctx); + return 0; +} + +static int can_dump(struct sock *sk, cpt_context_t *ctx) +{ + switch (sk->sk_family) { + case AF_NETLINK: + if (((struct netlink_sock *)sk)->cb) { + eprintk_ctx("netlink socket has active callback\n"); + return 0; + } + break; + } + return 1; +} + +/* We are not going to block suspend when we have external AF_UNIX connections. + * But we cannot stop feed of new packets/connections to our environment + * from outside. Taking into account that it is intrincically unreliable, + * we collect some amount of data, but when checkpointing/restoring we + * are going to drop everything, which does not make sense: skbs sent + * by outside processes, connections from outside etc. etc. + */ + +/* The first pass. 
When we see socket referenced by a file, we just + * add it to socket table */ +int cpt_collect_socket(struct file *file, cpt_context_t * ctx) +{ + cpt_object_t *obj; + struct socket *sock; + struct sock *sk; + + if (!S_ISSOCK(file->f_dentry->d_inode->i_mode)) + return -ENOTSOCK; + sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket; + sk = sock->sk; + if (!can_dump(sk, ctx)) + return -EAGAIN; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL) + return -ENOMEM; + obj->o_parent = file; + + return 0; +} + +/* + * We should end with table containing: + * * all sockets opened by our processes in the table. + * * all the sockets queued in listening queues on _our_ listening sockets, + * which are connected to our opened sockets. + */ + +static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx) +{ + struct sock *sk = obj->o_obj; + cpt_object_t *cobj; + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + struct sock *lsk = skb->sk; + if (unix_peer(lsk) && + lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) { + if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL) + return -ENOMEM; + cobj->o_parent = obj->o_parent; + } + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_index_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + unsigned long index = 0; + + /* Collect not-yet-accepted children of listening sockets. */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + + if (sk->sk_state != TCP_LISTEN) + continue; + + if (sk->sk_family == AF_UNIX) + collect_one_unix_listening_sock(obj, ctx); + } + + /* Assign indices to all the sockets. 
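 * The index, not the file position, is what other sections use to name
 * a socket; conceptually the mapping built here is
 *
 *	socket object -> small integer index
 *	file object   -> index of the socket it wraps (if any)
 *
 * matching the scheme described at the top of this file: section FILES
 * refers to sockets by index, while the SOCKET section refers back to
 * FILES by position in the image.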
*/ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + cpt_obj_setindex(obj, index++, ctx); + + if (sk->sk_socket && sk->sk_socket->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); + if (tobj) + cpt_obj_setindex(tobj, obj->o_index, ctx); + } + } + + return 0; +} + +void cpt_unlock_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + lockdep_off(); + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + if (sk->sk_socket) + release_sock(sk); + } + } + lockdep_on(); +} + +void cpt_kill_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + struct ve_struct *old_env; + old_env = set_exec_env(sk->owner_env); + cpt_kill_socket(sk, ctx); + if (sk->sk_socket) + release_sock_nobacklog(sk); + set_exec_env(old_env); + } + } +} + +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) +{ + struct fasync_struct *fa; + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock; + + sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; + + for (fa = sock->fasync_list; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff --git a/kernel/cpt/cpt_socket.h b/kernel/cpt/cpt_socket.h new file mode 100644 index 0000000..6489184 --- /dev/null +++ b/kernel/cpt/cpt_socket.h @@ -0,0 +1,33 @@ +struct sock; + +int cpt_collect_passedfds(cpt_context_t *); +int cpt_index_sockets(cpt_context_t *); +int cpt_collect_socket(struct file *, cpt_context_t *); +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); +int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx); +int rst_sockets(struct cpt_context *ctx); +int rst_sockets_complete(struct cpt_context *ctx); +int cpt_dump_orphaned_sockets(struct cpt_context *ctx); + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); +struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); + +void cpt_unlock_sockets(cpt_context_t *); +void cpt_kill_sockets(cpt_context_t *); + + +int cpt_kill_socket(struct sock *, cpt_context_t *); +int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); +int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); diff --git a/kernel/cpt/cpt_socket_in.c b/kernel/cpt/cpt_socket_in.c new file mode 100644 index 0000000..c02d459 --- /dev/null +++ b/kernel/cpt/cpt_socket_in.c @@ -0,0 +1,450 @@ +/* + * + * kernel/cpt/cpt_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. 
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline __u32 jiffies_export(unsigned long tmo) +{ + __s32 delta = (long)(tmo - jiffies); + return delta; +} + +static inline __u32 tcp_jiffies_export(__u32 tmo) +{ + __s32 delta = tmo - tcp_time_stamp; + return delta; +} + +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct tcp_sock *tp; + + if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) + return 0; + + tp = tcp_sk(sk); + + skb = skb_peek(&tp->out_of_order_queue); + while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { + int err; + + err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); + if (err) + return err; + + spin_lock_irq(&tp->out_of_order_queue.lock); + skb = skb->next; + spin_unlock_irq(&tp->out_of_order_queue.lock); + } + return 0; +} + +static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + + si->cpt_pred_flags = tp->pred_flags; + si->cpt_rcv_nxt = tp->rcv_nxt; + si->cpt_snd_nxt = tp->snd_nxt; + si->cpt_snd_una = tp->snd_una; + si->cpt_snd_sml = tp->snd_sml; + si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); + si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); + si->cpt_tcp_header_len = tp->tcp_header_len; + si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; + si->cpt_quick = inet_csk(sk)->icsk_ack.quick; + si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; + si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; + si->cpt_ato = inet_csk(sk)->icsk_ack.ato; + si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); + si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); + si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; + si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; + si->cpt_snd_wl1 = tp->snd_wl1; + si->cpt_snd_wnd = tp->snd_wnd; + si->cpt_max_window = tp->max_window; + si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; + si->cpt_mss_cache = tp->mss_cache; + si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ + si->cpt_mss_clamp = tp->rx_opt.mss_clamp; + si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; + si->cpt_ext2_header_len = 0; + si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; + si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; + si->cpt_reordering = tp->reordering; + si->cpt_frto_counter = tp->frto_counter; + si->cpt_frto_highmark = tp->frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + // // si->cpt_adv_cong = tp->adv_cong; +#endif + si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; + si->cpt_backoff = inet_csk(sk)->icsk_backoff; + si->cpt_srtt = tp->srtt; + si->cpt_mdev = tp->mdev; + si->cpt_mdev_max = tp->mdev_max; + si->cpt_rttvar = tp->rttvar; + si->cpt_rtt_seq = tp->rtt_seq; + si->cpt_rto = inet_csk(sk)->icsk_rto; + si->cpt_packets_out = tp->packets_out; + si->cpt_left_out = tp->sacked_out + tp->lost_out; + si->cpt_retrans_out = tp->retrans_out; + si->cpt_lost_out = tp->lost_out; + si->cpt_sacked_out = tp->sacked_out; + si->cpt_fackets_out = tp->fackets_out; + si->cpt_snd_ssthresh = tp->snd_ssthresh; + si->cpt_snd_cwnd = tp->snd_cwnd; + 
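	/* The jiffies- and tcp_time_stamp-based fields above are passed
	 * through jiffies_export()/tcp_jiffies_export(), i.e. stored as a
	 * signed delta from "now":
	 *
	 *	delta = tmo - jiffies;	(negative if already expired)
	 *
	 * so that on restore they can be rebased onto the new kernel's
	 * jiffies value, which will differ from the one at dump time.
	 */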
si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; + si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; + si->cpt_snd_cwnd_used = tp->snd_cwnd_used; + si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp); + si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); + si->cpt_ka_timeout = 0; + si->cpt_rcv_wnd = tp->rcv_wnd; + si->cpt_rcv_wup = tp->rcv_wup; + si->cpt_write_seq = tp->write_seq; + si->cpt_pushed_seq = tp->pushed_seq; + si->cpt_copied_seq = tp->copied_seq; + si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok; + si->cpt_wscale_ok = tp->rx_opt.wscale_ok; + si->cpt_sack_ok = tp->rx_opt.sack_ok; + si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; + si->cpt_snd_wscale = tp->rx_opt.snd_wscale; + si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; + si->cpt_nonagle = tp->nonagle; + si->cpt_keepalive_probes = tp->keepalive_probes; + si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; + si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; + si->cpt_ts_recent = tp->rx_opt.ts_recent; + si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + si->cpt_user_mss = tp->rx_opt.user_mss; + si->cpt_dsack = tp->rx_opt.dsack; + si->cpt_eff_sacks = tp->rx_opt.eff_sacks; + si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; + si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; + si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; + si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; + si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; + si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; + si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; + si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; + si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; + si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; + si->cpt_window_clamp = tp->window_clamp; + si->cpt_rcv_ssthresh = tp->rcv_ssthresh; + si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; + si->cpt_num_sacks = tp->rx_opt.num_sacks; + si->cpt_advmss = tp->advmss; + si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; + si->cpt_ecn_flags = tp->ecn_flags; + si->cpt_prior_ssthresh = tp->prior_ssthresh; + si->cpt_high_seq = tp->high_seq; + si->cpt_retrans_stamp = tp->retrans_stamp; + si->cpt_undo_marker = tp->undo_marker; + si->cpt_undo_retrans = tp->undo_retrans; + si->cpt_urg_seq = tp->urg_seq; + si->cpt_urg_data = tp->urg_data; + si->cpt_pending = inet_csk(sk)->icsk_pending; + si->cpt_urg_mode = tp->urg_mode; + si->cpt_snd_up = tp->snd_up; + si->cpt_keepalive_time = tp->keepalive_time; + si->cpt_keepalive_intvl = tp->keepalive_intvl; + si->cpt_linger2 = tp->linger2; + + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE && + sock_flag(sk, SOCK_KEEPOPEN)) { + si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); + } + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_family == AF_INET6 && + inet_csk(sk)->icsk_af_ops == &ipv6_mapped) + si->cpt_mapped = 1; + } +#endif + + return 0; +} + + +int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + if (sk->sk_family == AF_INET) { + struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); + sin->sin_family = AF_INET; + sin->sin_port = inet->sport; + sin->sin_addr.s_addr = inet->rcv_saddr; + si->cpt_laddrlen = sizeof(*sin); + } else if (sk->sk_family == AF_INET6) { + struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 
inet->sport; + memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); + si->cpt_laddrlen = sizeof(*sin6); + } + if (!inet->num) + si->cpt_laddrlen = 0; + + si->cpt_daddr = inet->daddr; + si->cpt_dport = inet->dport; + si->cpt_saddr = inet->saddr; + si->cpt_rcv_saddr = inet->rcv_saddr; + si->cpt_sport = inet->sport; + si->cpt_uc_ttl = inet->uc_ttl; + si->cpt_tos = inet->tos; + si->cpt_cmsg_flags = inet->cmsg_flags; + si->cpt_mc_index = inet->mc_index; + si->cpt_mc_addr = inet->mc_addr; + si->cpt_hdrincl = inet->hdrincl; + si->cpt_mc_ttl = inet->mc_ttl; + si->cpt_mc_loop = inet->mc_loop; + si->cpt_pmtudisc = inet->pmtudisc; + si->cpt_recverr = inet->recverr; + si->cpt_freebind = inet->freebind; + si->cpt_idcounter = inet->id; + + si->cpt_cork_flags = inet->cork.flags; + si->cpt_cork_fragsize = 0; + si->cpt_cork_length = inet->cork.length; + si->cpt_cork_addr = inet->cork.addr; + si->cpt_cork_saddr = inet->cork.fl.fl4_src; + si->cpt_cork_daddr = inet->cork.fl.fl4_dst; + si->cpt_cork_oif = inet->cork.fl.oif; + if (inet->cork.dst) { + struct rtable *rt = (struct rtable *)inet->cork.dst; + si->cpt_cork_fragsize = inet->cork.fragsize; + si->cpt_cork_saddr = rt->fl.fl4_src; + si->cpt_cork_daddr = rt->fl.fl4_dst; + si->cpt_cork_oif = rt->fl.oif; + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + si->cpt_udp_pending = up->pending; + si->cpt_udp_corkflag = up->corkflag; + si->cpt_udp_encap = up->encap_type; + si->cpt_udp_len = up->len; + } + + if (sk->sk_family == AF_INET6) { + memcpy(si->cpt_saddr6, &np->saddr, 16); + memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); + memcpy(si->cpt_daddr6, &np->daddr, 16); + si->cpt_flow_label6 = np->flow_label; + si->cpt_frag_size6 = np->frag_size; + si->cpt_hop_limit6 = np->hop_limit; + si->cpt_mcast_hops6 = np->mcast_hops; + si->cpt_mcast_oif6 = np->mcast_oif; + si->cpt_rxopt6 = np->rxopt.all; + si->cpt_mc_loop6 = np->mc_loop; + si->cpt_recverr6 = np->recverr; + si->cpt_sndflow6 = np->sndflow; + si->cpt_pmtudisc6 = np->pmtudisc; + si->cpt_ipv6only6 = np->ipv6only; + si->cpt_mapped = 0; + } + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + cpt_dump_socket_tcp(si, sk, ctx); + + return 0; +} + +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct request_sock *req; + + for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) + cpt_dump_socket(NULL, req->sk, -1, index, ctx); + return 0; +} + + +static int dump_openreq(struct request_sock *req, struct sock *sk, int index, + struct cpt_context *ctx) +{ + struct cpt_openreq_image *v = cpt_get_buf(ctx); + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_OPENREQ; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; + v->cpt_snt_isn = tcp_rsk(req)->snt_isn; + v->cpt_rmt_port = inet_rsk(req)->rmt_port; + v->cpt_mss = req->mss; + // // v->cpt_family = (req->class == &or_ipv4 ? 
AF_INET : AF_INET6); + v->cpt_retrans = req->retrans; + v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; + v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; + v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; + v->cpt_sack_ok = inet_rsk(req)->sack_ok; + v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; + v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; + v->cpt_acked = inet_rsk(req)->acked; + v->cpt_window_clamp = req->window_clamp; + v->cpt_rcv_wnd = req->rcv_wnd; + v->cpt_ts_recent = req->ts_recent; + v->cpt_expires = jiffies_export(req->expires); + + if (v->cpt_family == AF_INET) { + memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); + memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); + memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); + v->cpt_iif = inet6_rsk(req)->iif; +#endif + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct inet_connection_sock *icsk; + struct listen_sock *lopt; + struct request_sock *req; + int nr_entries; + int i; + + icsk = inet_csk(sk); + lopt = icsk->icsk_accept_queue.listen_opt; + nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries; + + for (i=0; i < nr_entries; i++) { + for (req=lopt->syn_table[i]; req; req=req->dl_next) { + loff_t saved_obj; + cpt_push_object(&saved_obj, ctx); + dump_openreq(req, sk, index, ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + return 0; +} + + +int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) +{ + if (sk->sk_state != TCP_CLOSE && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + tcp_set_state(sk, TCP_CLOSE); + else + sk->sk_prot->disconnect(sk, 0); + } + return 0; +} + +int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + + for (iml = inet->mc_list; iml; iml = iml->next) { + struct cpt_sockmc_image smi; + int scnt = 0; + int i; + + if (iml->sflist) + scnt = iml->sflist->sl_count*16; + + smi.cpt_next = sizeof(smi) + scnt; + smi.cpt_object = CPT_OBJ_SOCK_MCADDR; + smi.cpt_hdrlen = sizeof(smi); + smi.cpt_content = CPT_CONTENT_DATA; + + smi.cpt_family = AF_INET; + smi.cpt_mode = iml->sfmode; + smi.cpt_ifindex = iml->multi.imr_ifindex; + memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr)); + smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr; + + ctx->write(&smi, sizeof(smi), ctx); + + for (i = 0; i < scnt; i++) { + u32 addr[4]; + memset(&addr, 0, sizeof(addr)); + addr[0] = iml->sflist->sl_addr[i]; + ctx->write(&addr, sizeof(addr), ctx); + } + } + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->sk_family == AF_INET6) { + struct ipv6_mc_socklist *mcl; + struct ipv6_pinfo *np = inet6_sk(sk); + + for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) { + struct cpt_sockmc_image smi; + int scnt = 0; + int i; + + if (mcl->sflist) + scnt = mcl->sflist->sl_count*16; + + smi.cpt_next = sizeof(smi) + scnt; + smi.cpt_object = CPT_OBJ_SOCK_MCADDR; + smi.cpt_hdrlen = sizeof(smi); + smi.cpt_content = CPT_CONTENT_DATA; + + smi.cpt_family = AF_INET6; + smi.cpt_mode = mcl->sfmode; + smi.cpt_ifindex = mcl->ifindex; + memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr)); + + ctx->write(&smi, sizeof(smi), ctx); + for (i = 0; i < scnt; i++) + ctx->write(&mcl->sflist->sl_addr[i], 16, 
ctx); + } + } +#endif + return 0; +} diff --git a/kernel/cpt/cpt_syscalls.h b/kernel/cpt/cpt_syscalls.h new file mode 100644 index 0000000..ba69cb5 --- /dev/null +++ b/kernel/cpt/cpt_syscalls.h @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#define WRAP(c, args) return sys_##c args +#define WRAP2(c, args) int err; mm_segment_t oldfs; \ + oldfs = get_fs(); set_fs(KERNEL_DS); \ + err = sys_##c args ;\ + set_fs(oldfs); \ + return err + +static inline int sc_close(int fd) +{ + WRAP(close, (fd)); +} + +static inline int sc_dup2(int fd1, int fd2) +{ + WRAP(dup2, (fd1, fd2)); +} + +static inline int sc_unlink(char *name) +{ + WRAP2(unlink, (name)); +} + +static inline int sc_pipe(int *pfd) +{ + return do_pipe(pfd); +} + +static inline int sc_mknod(char *name, int mode, int dev) +{ + WRAP2(mknod, (name, mode, dev)); +} + +static inline int sc_chmod(char *name, int mode) +{ + WRAP2(mkdir, (name, mode)); +} + +static inline int sc_chown(char *name, int uid, int gid) +{ + WRAP2(chown, (name, uid, gid)); +} + +static inline int sc_mkdir(char *name, int mode) +{ + WRAP2(mkdir, (name, mode)); +} + +static inline int sc_rmdir(char *name) +{ + WRAP2(rmdir, (name)); +} + +static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags) +{ + WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL)); +} + +static inline int sc_mprotect(unsigned long start, size_t len, + unsigned long prot) +{ + WRAP(mprotect, (start, len, prot)); +} + +static inline int sc_mlock(unsigned long start, size_t len) +{ + WRAP(mlock, (start, len)); +} + +static inline int sc_munlock(unsigned long start, size_t len) +{ + WRAP(munlock, (start, len)); +} + +static inline int sc_remap_file_pages(unsigned long start, size_t len, + unsigned long prot, unsigned long pgoff, + unsigned long flags) +{ + WRAP(remap_file_pages, (start, len, prot, pgoff, flags)); +} + +static inline int sc_waitx(int pid, int opt, int *stat_addr) +{ + WRAP(wait4, (pid, stat_addr, opt, NULL)); +} + +static inline int sc_flock(int fd, int flags) +{ + WRAP(flock, (fd, flags)); +} + +static inline int sc_open(char* path, int flags, int mode) +{ + WRAP(open, (path, flags, mode)); +} + +extern int sc_execve(char *cms, char **argv, char **env); diff --git a/kernel/cpt/cpt_sysvipc.c b/kernel/cpt/cpt_sysvipc.c new file mode 100644 index 0000000..820f1ac --- /dev/null +++ b/kernel/cpt/cpt_sysvipc.c @@ -0,0 +1,403 @@ +/* + * + * kernel/cpt/cpt_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int dump_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v; + + if (shp->shm_file != warg->file) + return 0; + + v->cpt_key = shp->shm_perm.key; + v->cpt_uid = shp->shm_perm.uid; + v->cpt_gid = shp->shm_perm.gid; + v->cpt_cuid = shp->shm_perm.cuid; + v->cpt_cgid = shp->shm_perm.cgid; + v->cpt_mode = shp->shm_perm.mode; + v->cpt_seq = shp->shm_perm.seq; + + v->cpt_id = shp->shm_perm.id; + v->cpt_segsz = shp->shm_segsz; + v->cpt_atime = shp->shm_atim; + v->cpt_ctime = shp->shm_ctim; + v->cpt_dtime = shp->shm_dtim; + v->cpt_creator = shp->shm_cprid; + v->cpt_last = shp->shm_lprid; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1; +#else + v->cpt_mlockuser = -1; +#endif + return 1; +} + +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx) +{ + struct cpt_sysvshm_image *v = cpt_get_buf(ctx); + struct _warg warg; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSV_SHM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + warg.file = file; + warg.v = v; + if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) { + cpt_release_buf(ctx); + return -ESRCH; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int match_sem(int id, struct sem_array *sema, void *arg) +{ + if (id != (unsigned long)arg) + return 0; + return sema->sem_nsems + 1; +} + +static int get_sem_nsem(int id, cpt_context_t *ctx) +{ + int res; + res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id); + if (res > 0) + return res - 1; + eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id); + return -ESRCH; +} + +static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx) +{ + struct cpt_sysvsem_undo_image v; + loff_t saved_obj; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_SEMUNDO; + v.cpt_id = su->semid; + v.cpt_nsem = get_sem_nsem(su->semid, ctx); + if ((int)v.cpt_nsem < 0) + return -ESRCH; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx); + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + return 0; +} + +struct sem_warg { + int last_id; + struct cpt_sysvsem_image *v; +}; + +static int dump_one_sem(int id, struct sem_array *sma, void *arg) +{ + struct sem_warg * warg = (struct sem_warg *)arg; + struct cpt_sysvsem_image *v = warg->v; + int i; + + if (warg->last_id != -1) { + if ((id % IPCMNI) <= warg->last_id) + return 0; + } + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSV_SEM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_SEMARRAY; + + v->cpt_key = sma->sem_perm.key; + v->cpt_uid = sma->sem_perm.uid; + v->cpt_gid = sma->sem_perm.gid; + v->cpt_cuid = sma->sem_perm.cuid; + v->cpt_cgid = sma->sem_perm.cgid; + v->cpt_mode = sma->sem_perm.mode; + v->cpt_seq = sma->sem_perm.seq; + + v->cpt_id = id; + v->cpt_ctime = sma->sem_ctime; + v->cpt_otime = sma->sem_otime; + + for (i=0; isem_nsems; i++) { + struct { + __u32 
semval; + __u32 sempid; + } *s = (void*)v + v->cpt_next; + if (v->cpt_next >= PAGE_SIZE - sizeof(*s)) + return -EINVAL; + s->semval = sma->sem_base[i].semval; + s->sempid = sma->sem_base[i].sempid; + v->cpt_next += sizeof(*s); + } + + warg->last_id = id % IPCMNI; + return 1; +} + + +int cpt_dump_sysvsem(struct cpt_context *ctx) +{ + cpt_object_t *obj; + struct sem_warg warg; + + /* Dumping semaphores is quite tricky because we cannot + * write to dump file under lock inside sysvipc_walk_sem(). + */ + cpt_open_section(ctx, CPT_SECT_SYSV_SEM); + warg.last_id = -1; + warg.v = cpt_get_buf(ctx); + for (;;) { + if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0) + break; + ctx->write(warg.v, warg.v->cpt_next, ctx); + } + cpt_release_buf(ctx); + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO); + for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { + struct sem_undo_list *semu = obj->o_obj; + struct sem_undo *su; + struct cpt_object_hdr v; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SYSVSEM_UNDO; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(su, &semu->list_proc, list_proc) { + if (su->semid != -1) { + int err; + err = dump_one_semundo(su, ctx); + if (err < 0) + return err; + } + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_close_section(ctx); + return 0; +} + +struct msg_warg { + int last_id; + struct msg_queue *msq; + struct cpt_sysvmsg_image *v; +}; + +static int dump_one_msg(int id, struct msg_queue *msq, void *arg) +{ + struct msg_warg * warg = (struct msg_warg *)arg; + struct cpt_sysvmsg_image *v = warg->v; + + if (warg->last_id != -1) { + if ((id % IPCMNI) <= warg->last_id) + return 0; + } + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSVMSG; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_key = msq->q_perm.key; + v->cpt_uid = msq->q_perm.uid; + v->cpt_gid = msq->q_perm.gid; + v->cpt_cuid = msq->q_perm.cuid; + v->cpt_cgid = msq->q_perm.cgid; + v->cpt_mode = msq->q_perm.mode; + v->cpt_seq = msq->q_perm.seq; + + v->cpt_id = id; + v->cpt_stime = msq->q_stime; + v->cpt_rtime = msq->q_rtime; + v->cpt_ctime = msq->q_ctime; + v->cpt_last_sender = msq->q_lspid; + v->cpt_last_receiver = msq->q_lrpid; + v->cpt_qbytes = msq->q_qbytes; + + warg->msq = msq; + warg->last_id = id % IPCMNI; + return 1; +} + +static int do_store(void * src, int len, int offset, void * data) +{ + cpt_context_t * ctx = data; + ctx->write(src, len, ctx); + return 0; +} + +static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx) +{ + loff_t saved_obj; + struct cpt_sysvmsg_msg_image mv; + + cpt_open_object(NULL, ctx); + mv.cpt_next = CPT_NULL; + mv.cpt_object = CPT_OBJ_SYSVMSG_MSG; + mv.cpt_hdrlen = sizeof(mv); + mv.cpt_content = CPT_CONTENT_DATA; + + mv.cpt_type = m->m_type; + mv.cpt_size = m->m_ts; + + ctx->write(&mv, sizeof(mv), ctx); + + cpt_push_object(&saved_obj, ctx); + sysv_msg_store(m, do_store, m->m_ts, ctx); + cpt_pop_object(&saved_obj, ctx); + cpt_close_object(ctx); +} + +int cpt_dump_sysvmsg(struct cpt_context *ctx) +{ + struct msg_warg warg; + + /* Dumping msg queues is tricky because we cannot + * write to dump file under lock inside sysvipc_walk_msg(). + * + * And even worse, we have to access msg list in an unserialized + * context. It is fragile. But VE is still frozen, remember? 
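 * The same trick as for semaphores above is used here: each call to
 * sysvipc_walk_msg() copies exactly one queue with an id greater than
 * warg.last_id into the preallocated buffer and stops, the buffer is
 * written out with no locks held, and the walk is restarted from
 * last_id until no queue is left.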
+ */ + cpt_open_section(ctx, CPT_SECT_SYSV_MSG); + warg.last_id = -1; + warg.v = cpt_get_buf(ctx); + for (;;) { + loff_t saved_obj; + struct msg_msg * m; + + if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0) + break; + + cpt_open_object(NULL, ctx); + + ctx->write(warg.v, warg.v->cpt_next, ctx); + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(m, &warg.msq->q_messages, m_list) { + cpt_dump_one_sysvmsg(m, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_release_buf(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->exit_state) { + /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list + * on exit. Grrr... */ + continue; + } + if (tsk->sysvsem.undo_list && + cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { + struct sem_undo_list *semu = obj->o_obj; + + if (atomic_read(&semu->refcnt) != obj->o_count) { + eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); + return -EBUSY; + } + } + return 0; +} + +static int collect_one_shm(struct shmid_kernel *shp, void *arg) +{ + cpt_context_t *ctx = arg; + + if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) + return -ENOMEM; + return 0; +} + +int cpt_collect_sysvshm(cpt_context_t * ctx) +{ + int err; + + err = sysvipc_walk_shm(collect_one_shm, ctx); + + return err < 0 ? err : 0; +} + +int cpt_collect_sysv(cpt_context_t * ctx) +{ + int err; + + err = cpt_collect_sysvsem_undo(ctx); + if (err) + return err; + err = cpt_collect_sysvshm(ctx); + if (err) + return err; + + return 0; +} diff --git a/kernel/cpt/cpt_tty.c b/kernel/cpt/cpt_tty.c new file mode 100644 index 0000000..8ac9417 --- /dev/null +++ b/kernel/cpt/cpt_tty.c @@ -0,0 +1,215 @@ +/* + * + * kernel/cpt/cpt_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +/* We must support at least N_TTY. 
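 * What is saved below is essentially the generic N_TTY state: termios,
 * window size, controlling pgrp/session, assorted flags and the
 * not-yet-read input buffer; roughly the state a userspace program
 * sees through
 *
 *	tcgetattr(fd, &tio);
 *	ioctl(fd, TIOCGWINSZ, &ws);
 *	tcgetpgrp(fd);
 *
 * State private to other line disciplines does not appear to be saved.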
*/ + +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct *tty = file->private_data; + cpt_object_t *obj; + struct cpt_obj_ref o; + loff_t saved_pos; + + obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); + if (!obj) + return -EINVAL; + + cpt_push_object(&saved_pos, ctx); + + o.cpt_next = sizeof(o); + o.cpt_object = CPT_OBJ_REF; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_VOID; + o.cpt_pos = obj->o_pos; + ctx->write(&o, sizeof(o), ctx); + + cpt_pop_object(&saved_pos, ctx); + + return 0; +} + +int cpt_collect_tty(struct file *file, cpt_context_t * ctx) +{ + struct tty_struct *tty = file->private_data; + + if (tty) { + if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) + return -ENOMEM; + if (tty->link) { + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); + if (obj == NULL) + return -ENOMEM; + /* Undo o_count, tty->link is not a reference */ + obj->o_count--; + } + } + return 0; +} + +int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct tty_struct *tty = obj->o_obj; + struct cpt_tty_image *v; + + if (tty->link) { + if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { + eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); + return -EINVAL; + } + if (tty->link->link != tty) { + eprintk_ctx("bad pty pair\n"); + return -EINVAL; + } + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_SLAVE && + tty->link->count) + obj->o_count++; + } + if (obj->o_count != tty->count) { + eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); + return -EBUSY; + } + + cpt_open_object(obj, ctx); + + v = cpt_get_buf(ctx); + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_TTY; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = tty->index; + v->cpt_link = -1; + if (tty->link) + v->cpt_link = tty->link->index; + v->cpt_drv_type = tty->driver->type; + v->cpt_drv_subtype = tty->driver->subtype; + v->cpt_drv_flags = tty->driver->flags; + v->cpt_packet = tty->packet; + v->cpt_stopped = tty->stopped; + v->cpt_hw_stopped = tty->hw_stopped; + v->cpt_flow_stopped = tty->flow_stopped; + v->cpt_flags = tty->flags; + v->cpt_ctrl_status = tty->ctrl_status; + v->cpt_canon_data = tty->canon_data; + v->cpt_canon_head = tty->canon_head - tty->read_tail; + v->cpt_canon_column = tty->canon_column; + v->cpt_column = tty->column; + v->cpt_erasing = tty->erasing; + v->cpt_lnext = tty->lnext; + v->cpt_icanon = tty->icanon; + v->cpt_raw = tty->raw; + v->cpt_real_raw = tty->real_raw; + v->cpt_closing = tty->closing; + v->cpt_minimum_to_wake = tty->minimum_to_wake; + v->cpt_pgrp = 0; + if (tty->pgrp) { + v->cpt_pgrp = pid_vnr(tty->pgrp); + if ((int)v->cpt_pgrp < 0) { + dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp); + v->cpt_pgrp = -1; + } + } + v->cpt_session = 0; + if (tty->session) { + v->cpt_session = pid_vnr(tty->session); + if ((int)v->cpt_session < 0) { + eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(v->cpt_name, tty->name, 64); + v->cpt_ws_row = tty->winsize.ws_row; + v->cpt_ws_col = tty->winsize.ws_col; + v->cpt_ws_prow = tty->winsize.ws_ypixel; + v->cpt_ws_pcol = tty->winsize.ws_xpixel; + if (tty->termios == NULL) { + eprintk_ctx("NULL termios"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_c_line = tty->termios->c_line; + v->cpt_c_iflag = 
tty->termios->c_iflag; + v->cpt_c_oflag = tty->termios->c_oflag; + v->cpt_c_cflag = tty->termios->c_cflag; + v->cpt_c_lflag = tty->termios->c_lflag; + memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); + if (NCCS < 32) + memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); + memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_buf && tty->read_cnt) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + loff_t saved_pos; + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = tty->read_cnt; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_cnt) { + int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); + ctx->write(tty->read_buf + tty->read_tail, n, ctx); + if (tty->read_cnt > n) + ctx->write(tty->read_buf, tty->read_cnt-n, ctx); + ctx->align(ctx); + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + cpt_close_object(ctx); + + return 0; +} + +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct * tty; + struct fasync_struct *fa; + + tty = (struct tty_struct *)file->private_data; + + for (fa = tty->fasync; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff --git a/kernel/cpt/cpt_ubc.c b/kernel/cpt/cpt_ubc.c new file mode 100644 index 0000000..fc27e74 --- /dev/null +++ b/kernel/cpt/cpt_ubc.c @@ -0,0 +1,132 @@ +/* + * + * kernel/cpt/cpt_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); + if (obj != NULL) { + if (obj->o_count == 1) + get_beancounter(bc); + if (bc->parent != NULL && obj->o_parent == NULL) + obj->o_parent = cpt_add_ubc(bc->parent, ctx); + } + return obj; +} + +__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); + if (obj == NULL) { + char buf[48]; + print_ub_uid(bc, buf, sizeof(buf)); + eprintk("CPT: unknown ub %s (%p)\n", buf, bc); + dump_stack(); + return CPT_NULL; + } + return obj->o_pos; +} + +static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); + dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); + dmp->held = (held ? prm->held : CPT_NULL); + dmp->maxheld = prm->maxheld; + dmp->minheld = prm->minheld; + dmp->failcnt = prm->failcnt; +} + +static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + struct cpt_beancounter_image *v; + int i; + + bc = obj->o_obj; + v = cpt_get_buf(ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_UBC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + if (obj->o_parent != NULL) + v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; + else + v->cpt_parent = CPT_NULL; + v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0; + for (i = 0; i < UB_RESOURCES; i++) { + dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1); + } + memset(v->cpt_parms + UB_RESOURCES * 2, 0, + sizeof(v->cpt_parms) + - UB_RESOURCES * 2 * sizeof(v->cpt_parms[0])); + + cpt_open_object(obj, ctx); + ctx->write(v, sizeof(*v), ctx); + cpt_close_object(ctx); + + cpt_release_buf(ctx); + return 0; +} + +int cpt_dump_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int skipped; + int top; + + cpt_open_section(ctx, CPT_SECT_UBC); + + do { + skipped = 0; + top = 0; + for_each_object(obj, CPT_OBJ_UBC) { + if (obj->o_parent == NULL) + top++; + if (obj->o_pos != CPT_NULL) + continue; + if (obj->o_parent != NULL && + ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL) + skipped++; + else + dump_one_bc(obj, ctx); + } + } while (skipped && (top < 2)); + + cpt_close_section(ctx); + if (top > 1) { + eprintk_ctx("More than one top level ub exist"); + return -EINVAL; + } + + return 0; +} + +void cpt_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_UBC) + put_beancounter(obj->o_obj); +} diff --git a/kernel/cpt/cpt_ubc.h b/kernel/cpt/cpt_ubc.h new file mode 100644 index 0000000..645ba79 --- /dev/null +++ b/kernel/cpt/cpt_ubc.h @@ -0,0 +1,23 @@ +#ifdef CONFIG_BEANCOUNTERS +cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx); +__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx); +int cpt_dump_ubc(struct cpt_context *ctx); + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx); +int rst_undump_ubc(struct cpt_context *ctx); + +void cpt_finish_ubc(struct cpt_context *ctx); +void rst_finish_ubc(struct cpt_context *ctx); +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id); +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id); +#else +static int inline cpt_dump_ubc(struct cpt_context *ctx) +{ return 0; } +static int inline rst_undump_ubc(struct cpt_context *ctx) +{ return 0; } +static void inline cpt_finish_ubc(struct cpt_context *ctx) +{ return; } +static void inline rst_finish_ubc(struct cpt_context *ctx) +{ return; } +#endif + diff --git a/kernel/cpt/cpt_x8664.S b/kernel/cpt/cpt_x8664.S new file mode 100644 index 0000000..0d5e361 --- /dev/null +++ b/kernel/cpt/cpt_x8664.S @@ -0,0 +1,67 @@ +#define ASSEMBLY 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorq %rax, %rax + pushq %rax /* ss */ + pushq %rax /* rsp */ + pushq $(1<<9) /* eflags - interrupts on */ + pushq $__KERNEL_CS /* cs */ + pushq \child_rip /* rip */ + pushq %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + .endm + +ENTRY(asm_kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq $0x00800000,%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + pushq %rcx + call do_fork_pid + addq $8, %rsp + /* call do_fork */ + movq %rax,RAX(%rsp) + xorl %edi,%edi + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(asm_kernel_thread) + +child_rip: + pushq $0 # fake return address + CFI_STARTPROC + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + movq %rax, %rdi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + diff --git a/kernel/cpt/rst_conntrack.c 
b/kernel/cpt/rst_conntrack.c new file mode 100644 index 0000000..4c31f32 --- /dev/null +++ b/kernel/cpt/rst_conntrack.c @@ -0,0 +1,283 @@ +/* + * + * kernel/cpt/rst_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) do { } while (0) +#define ASSERT_WRITE_LOCK(x) do { } while (0) + + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack *ct; + int index; +}; + +static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) +{ + tuple->dst.ip = v->cpt_dst; + tuple->dst.u.all = v->cpt_dstport; + tuple->dst.protonum = v->cpt_protonum; + tuple->dst.dir = v->cpt_dir; + if (dir != tuple->dst.dir) + wprintk("dir != tuple->dst.dir\n"); + + tuple->src.ip = v->cpt_src; + tuple->src.u.all = v->cpt_srcport; +} + + +static int undump_expect_list(struct ip_conntrack *ct, + struct cpt_ip_conntrack_image *ci, + loff_t pos, struct ct_holder *ct_list, + cpt_context_t *ctx) +{ + loff_t end; + int err; + + end = pos + ci->cpt_next; + pos += ci->cpt_hdrlen; + while (pos < end) { + struct cpt_ip_connexpect_image v; + struct ip_conntrack_expect *exp; + struct ip_conntrack *sibling; + + err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); + if (err) + return err; + + sibling = NULL; + if (v.cpt_sibling_conntrack) { + struct ct_holder *c; + + for (c = ct_list; c; c = c->next) { + if (c->index == v.cpt_sibling_conntrack) { + sibling = c->ct; + break; + } + } + if (!sibling) { + eprintk_ctx("lost sibling of expectation\n"); + return -EINVAL; + } + } + + write_lock_bh(&ip_conntrack_lock); + + /* It is possible. Helper module could be just unregistered, + * if expectation were on the list, it would be destroyed. */ + if (ct->helper == NULL) { + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); + continue; + } + + exp = ip_conntrack_expect_alloc(NULL); + if (exp == NULL) { + write_unlock_bh(&ip_conntrack_lock); + return -ENOMEM; + } + + if (ct->helper->timeout && !del_timer(&exp->timeout)) { + /* Dying already. We can do nothing. 
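+ * The timer could not be stopped, i.e. the expectation is
+ * already being torn down, so drop this record and continue
+ * with the next expectation in the image.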
*/ + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack expectation is dying\n"); + continue; + } + + decode_tuple(&v.cpt_tuple, &exp->tuple, 0); + decode_tuple(&v.cpt_mask, &exp->mask, 0); + + exp->master = ct; + nf_conntrack_get(&ct->ct_general); + ip_conntrack_expect_insert(exp); +#if 0 + if (sibling) { + exp->sibling = sibling; + sibling->master = exp; + LIST_DELETE(&ve_ip_conntrack_expect_list, exp); + ct->expecting--; + nf_conntrack_get(&master_ct(sibling)->infos[0]); + } else +#endif + if (ct->helper->timeout) { + exp->timeout.expires = jiffies + v.cpt_timeout; + add_timer(&exp->timeout); + } + write_unlock_bh(&ip_conntrack_lock); + + pos += v.cpt_next; + } + return 0; +} + +static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, + struct ct_holder **ct_list, cpt_context_t *ctx) +{ + int err = 0; + struct ip_conntrack *conntrack; + struct ct_holder *c; + struct ip_conntrack_tuple orig, repl; + + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) + return -ENOMEM; + + decode_tuple(&ci->cpt_tuple[0], &orig, 0); + decode_tuple(&ci->cpt_tuple[1], &repl, 1); + + conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); + if (!conntrack || IS_ERR(conntrack)) { + kfree(c); + return -ENOMEM; + } + + c->ct = conntrack; + c->next = *ct_list; + *ct_list = c; + c->index = ci->cpt_index; + + decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); + decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); + + conntrack->status = ci->cpt_status; + + memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); + memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); + +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + conntrack->nat.masq_index = ci->cpt_masq_index; +#endif + if (ci->cpt_initialized) { + conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; + conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; + conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; + conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; + conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; + conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; + } + if (conntrack->status & IPS_NAT_DONE_MASK) + ip_nat_hash_conntrack(conntrack); +#endif + + if (ci->cpt_ct_helper) { + conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); + if (conntrack->helper == NULL) { + eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); + err = -EINVAL; + } + } + + ip_conntrack_hash_insert(conntrack); + conntrack->timeout.expires = jiffies + ci->cpt_timeout; + + if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) + err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); + + return err; +} + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ip_conntrack_image ci; + struct ct_holder *c; + struct ct_holder *ct_list = NULL; + + if (sec == CPT_NULL) + return 0; + + if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != 
CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); + if (err) + break; + err = undump_one_ct(&ci, sec, &ct_list, ctx); + if (err) + break; + sec += ci.cpt_next; + } + + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->ct) + add_timer(&c->ct->timeout); + kfree(c); + } + + return err; +} + +#else + +#include "cpt_obj.h" +#include "cpt_context.h" + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) + return -EINVAL; + return 0; +} + +#endif diff --git a/kernel/cpt/rst_context.c b/kernel/cpt/rst_context.c new file mode 100644 index 0000000..f74b81c --- /dev/null +++ b/kernel/cpt/rst_context.c @@ -0,0 +1,323 @@ +/* + * + * kernel/cpt/rst_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? -EIO : err; + return 0; +} + +static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? 
-EIO : err; + return 0; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) +{ + struct cpt_section_hdr hdr; + int err; + loff_t pos; + + pos = ctx->sections[type]; + *start = *end = pos; + + if (pos != CPT_NULL) { + if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) + return err; + if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) + return -EINVAL; + *start = pos + hdr.cpt_hdrlen; + *end = pos + hdr.cpt_next; + } + return 0; +} +EXPORT_SYMBOL(rst_get_section); + +void rst_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->read = file_read; + ctx->pread = file_pread; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) +{ + struct cpt_section_hdr h; + + while (start < end) { + int err; + + err = ctx->pread(&h, sizeof(h), ctx, start); + if (err) + return err; + if (h.cpt_hdrlen < sizeof(h) || + h.cpt_next < h.cpt_hdrlen || + start + h.cpt_next > end) + return -EINVAL; + if (h.cpt_section >= CPT_SECT_MAX) + return -EINVAL; + ctx->sections[h.cpt_section] = start; + start += h.cpt_next; + } + return 0; +} + +int rst_open_dumpfile(struct cpt_context *ctx) +{ + int err; + struct cpt_major_tail *v; + struct cpt_major_hdr h; + unsigned long size; + + err = -EBADF; + if (!ctx->file) + goto err_out; + + err = -ENOMEM; + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + goto err_out; + __cpt_release_buf(ctx); + + size = ctx->file->f_dentry->d_inode->i_size; + + if (size & 7) { + err = -EINVAL; + goto err_out; + } + if (size < sizeof(struct cpt_major_hdr) + + sizeof(struct cpt_major_tail)) { + err = -EINVAL; + goto err_out; + } + err = ctx->pread(&h, sizeof(h), ctx, 0); + if (err) { + eprintk_ctx("too short image 1 %d\n", err); + goto err_out; + } + if (h.cpt_signature[0] != CPT_SIGNATURE0 || + h.cpt_signature[1] != CPT_SIGNATURE1 || + h.cpt_signature[2] != CPT_SIGNATURE2 || + h.cpt_signature[3] != CPT_SIGNATURE3) { + err = -EINVAL; + goto err_out; + } + if (h.cpt_hz != HZ) { + err = -EINVAL; + eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); + goto err_out; + } + ctx->virt_jiffies64 = h.cpt_start_jiffies64; + ctx->start_time.tv_sec = h.cpt_start_sec; + ctx->start_time.tv_nsec = h.cpt_start_nsec; + ctx->kernel_config_flags = h.cpt_kernel_config[0]; + ctx->iptables_mask = h.cpt_iptables_mask; + if (h.cpt_image_version > CPT_VERSION_27 || + CPT_VERSION_MINOR(h.cpt_image_version) > 1) { + eprintk_ctx("Unknown image version: %x. 
Can't restore.\n", + h.cpt_image_version); + err = -EINVAL; + goto err_out; + } + ctx->image_version = h.cpt_image_version; + ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features); + ctx->image_arch = h.cpt_os_arch; + + v = cpt_get_buf(ctx); + err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); + if (err) { + eprintk_ctx("too short image 2 %d\n", err); + cpt_release_buf(ctx); + goto err_out; + } + if (v->cpt_signature[0] != CPT_SIGNATURE0 || + v->cpt_signature[1] != CPT_SIGNATURE1 || + v->cpt_signature[2] != CPT_SIGNATURE2 || + v->cpt_signature[3] != CPT_SIGNATURE3 || + v->cpt_nsect != CPT_SECT_MAX_INDEX) { + err = -EINVAL; + cpt_release_buf(ctx); + goto err_out; + } + if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) { + cpt_release_buf(ctx); + goto err_out; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + ctx->lazypages = v->cpt_lazypages; +#endif + ctx->tasks64 = v->cpt_64bit; + cpt_release_buf(ctx); + return 0; + +err_out: + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + return err; +} + +void rst_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } +} + +int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr *hdr = tmp; + err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); + if (err) + return err; + if (type > 0 && type != hdr->cpt_object) + return -EINVAL; + if (hdr->cpt_hdrlen > hdr->cpt_next) + return -EINVAL; + if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return -EINVAL; + if (size < sizeof(*hdr)) + return -EINVAL; + if (size > hdr->cpt_hdrlen) + size = hdr->cpt_hdrlen; + if (size > sizeof(*hdr)) + err = ctx->pread(hdr+1, size - sizeof(*hdr), + ctx, pos + sizeof(*hdr)); + return err; +} +EXPORT_SYMBOL(_rst_get_object); + +void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) +{ + int err; + void *tmp; + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); + if (err) + return NULL; + if (type > 0 && type != hdr.cpt_object) + return NULL; + if (hdr.cpt_hdrlen > hdr.cpt_next) + return NULL; + if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return NULL; + tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); + if (!tmp) + return NULL; + err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); + if (!err) + return tmp; + kfree(tmp); + return NULL; +} +EXPORT_SYMBOL(__rst_get_object); + +__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr hdr; + __u8 *name; + + err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); + if (err) + return NULL; + if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) + return NULL; + name = (void*)__get_free_page(GFP_KERNEL); + if (!name) + return NULL; + err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, + ctx, *pos_p + hdr.cpt_hdrlen); + if (err) { + free_page((unsigned long)name); + return NULL; + } + *pos_p += hdr.cpt_next; + return name; +} + +__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) +{ + return __rst_get_name(&pos, ctx); +} + +void rst_put_name(__u8 *name, struct cpt_context *ctx) +{ + unsigned long addr = (unsigned long)name; + + if (addr) + free_page(addr&~(PAGE_SIZE-1)); +} diff --git a/kernel/cpt/rst_epoll.c b/kernel/cpt/rst_epoll.c new file mode 100644 index 0000000..0ac4cae --- /dev/null +++ b/kernel/cpt/rst_epoll.c @@ -0,0 
+1,169 @@ +/* + * + * kernel/cpt/rst_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +/* Those funcations are static in fs/eventpoll.c */ +extern int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); +extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +extern void ep_release_epitem(struct epitem *epi); + + +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int efd; + + /* Argument "size" is ignored, use just 1 */ + efd = sys_epoll_create(1); + if (efd < 0) + return ERR_PTR(efd); + + file = fget(efd); + sys_close(efd); + return file; +} + +static int restore_one_epoll(cpt_object_t *obj, + loff_t pos, + struct cpt_epoll_image *ebuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct eventpoll *ep; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + endpos = pos + ebuf->cpt_next; + pos += ebuf->cpt_hdrlen; + while (pos < endpos) { + struct cpt_epoll_file_image efi; + struct epoll_event epds; + + cpt_object_t *tobj; + + err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx); + if (err) + return err; + tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx); + if (!tobj) { + eprintk_ctx("epoll file not found\n"); + return -EINVAL; + } + epds.events = efi.cpt_events; + epds.data = efi.cpt_data; + mutex_lock(&ep->mtx); + err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd); + if (!err) { + struct epitem *epi; + epi = ep_find(ep, tobj->o_obj, efi.cpt_fd); + if (epi) { + if (efi.cpt_ready) { + unsigned long flags; + spin_lock_irqsave(&ep->lock, flags); + if (list_empty(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + spin_unlock_irqrestore(&ep->lock, flags); + } + } + } + mutex_unlock(&ep->mtx); + if (err) + break; + pos += efi.cpt_next; + } + return err; +} + +int rst_eventpoll(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_EPOLL]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_epoll_image *ebuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx); + if (obj == NULL) { + eprintk_ctx("cannot find epoll file object\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + err = restore_one_epoll(obj, sec, ebuf, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += ebuf->cpt_next; + } + + return 0; + +} diff --git a/kernel/cpt/rst_files.c b/kernel/cpt/rst_files.c new file mode 100644 index 
0000000..8b4c688 --- /dev/null +++ b/kernel/cpt/rst_files.c @@ -0,0 +1,1648 @@ +/* + * + * kernel/cpt/rst_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" + +#include "cpt_syscalls.h" + + +struct filejob { + struct filejob *next; + int pid; + loff_t fdi; +}; + +static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) +{ + struct filejob *j; + + j = kmalloc(sizeof(*j), GFP_KERNEL); + if (j == NULL) + return -ENOMEM; + j->pid = current->pid; + j->fdi = pos; + j->next = ctx->filejob_queue; + ctx->filejob_queue = j; + return 0; +} + +static void _anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. (Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); + + module_put(THIS_MODULE); +} + +static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} + +static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} + +static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + +static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + +static struct pipe_buf_operations _anon_pipe_buf_ops = { + .can_merge = 1, + .map = _anon_pipe_buf_map, + .unmap = _anon_pipe_buf_unmap, + .release = _anon_pipe_buf_release, + .confirm = _anon_pipe_buf_confirm, + .get = _anon_pipe_buf_get, + .steal = _anon_pipe_buf_steal, +}; + +/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer + * many times. We need to mark it in CPT_OBJ_INODE table in some way. 
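+ *
+ * Until then fixup_pipe_data() refuses to run twice on one inode:
+ * it returns -EINVAL if ino->i_pipe already holds buffers.  The
+ * buffered bytes are laid out in the image as
+ *   CPT_OBJ_INODE header             at fi->cpt_inode
+ *   CPT_OBJ_BITS header (cpt_size)   at fi->cpt_inode + ii.cpt_hdrlen
+ *   cpt_size raw bytes               right after the bits header
+ * and are read back page by page into freshly allocated pipe buffers.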
+ */ +static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, + struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + struct cpt_inode_image ii; + struct cpt_obj_bits b; + struct pipe_inode_info *info; + int err; + int count; + + if (!S_ISFIFO(ino->i_mode)) { + eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode); + return -EINVAL; + } + if (fi->cpt_inode == CPT_NULL) + return 0; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return err; + + if (ii.cpt_next <= ii.cpt_hdrlen) + return 0; + + err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); + if (err) + return err; + + if (b.cpt_size == 0) + return 0; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + if (info->nrbufs) { + mutex_unlock(&ino->i_mutex); + eprintk("pipe buffer is restored already\n"); + return -EINVAL; + } + info->curbuf = 0; + count = 0; + while (count < b.cpt_size) { + struct pipe_buffer *buf = info->bufs + info->nrbufs; + void * addr; + int chars; + + chars = b.cpt_size - count; + if (chars > PAGE_SIZE) + chars = PAGE_SIZE; + if (!try_module_get(THIS_MODULE)) { + err = -EBUSY; + break; + } + + buf->page = alloc_page(GFP_HIGHUSER); + if (buf->page == NULL) { + err = -ENOMEM; + break; + } + buf->ops = &_anon_pipe_buf_ops; + buf->offset = 0; + buf->len = chars; + info->nrbufs++; + addr = kmap(buf->page); + err = ctx->pread(addr, chars, ctx, + fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); + if (err) + break; + count += chars; + } + mutex_unlock(&ino->i_mutex); + + return err; +} + +static int make_flags(struct cpt_file_image *fi) +{ + int flags = O_NOFOLLOW; + switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + flags |= O_RDWR; break; + case FMODE_WRITE: + flags |= O_WRONLY; break; + case FMODE_READ: + flags |= O_RDONLY; break; + default: break; + } + flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); + flags |= O_NONBLOCK|O_NOCTTY; + return flags; +} + +static struct file *open_pipe(char *name, + struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct cpt_inode_image ii; + struct file *rf, *wf; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return ERR_PTR(err); + + if (ii.cpt_sb == FSMAGIC_PIPEFS) { + int pfd[2]; + + if ((err = sc_pipe(pfd)) < 0) + return ERR_PTR(err); + + rf = fcheck(pfd[0]); + wf = fcheck(pfd[1]); + get_file(rf); + get_file(wf); + sc_close(pfd[0]); + sc_close(pfd[1]); + + if (fi->cpt_mode&FMODE_READ) { + struct file *tf; + tf = wf; wf = rf; rf = tf; + } + } else { + if (fi->cpt_mode&FMODE_READ) { + rf = filp_open(name, flags, 0); + if (IS_ERR(rf)) { + dprintk_ctx("filp_open\n"); + return rf; + } + dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), + (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); + return rf; + } + + dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode); + + rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); + if (IS_ERR(rf)) + return rf; + wf = dentry_open(dget(rf->f_dentry), + mntget(rf->f_vfsmnt), flags); + } + + /* Add pipe inode to obj table. 
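+ * The inode entry keeps the read side in o_parent, so later
+ * rst_file() calls for other descriptors of this pipe take the
+ * "easy way" below and dentry_open() the same inode instead of
+ * doing a path lookup.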
*/ + obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx); + if (obj == NULL) { + fput(rf); fput(wf); + return ERR_PTR(-ENOMEM); + } + cpt_obj_setpos(obj, fi->cpt_inode, ctx); + obj->o_parent = rf; + + /* Add another side of pipe to obj table, it will not be used + * (o_pos = PT_NULL), another processes opeining pipe will find + * inode and open it with dentry_open(). */ + obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx); + if (obj == NULL) { + fput(wf); + return ERR_PTR(-ENOMEM); + } + return wf; +} + +static struct file *open_special(struct cpt_file_image *fi, + unsigned flags, + int deleted, + struct cpt_context *ctx) +{ + struct cpt_inode_image *ii; + struct file *file; + + /* Directories and named pipes are not special actually */ + if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode)) + return NULL; + + /* No support for block devices at the moment. */ + if (S_ISBLK(fi->cpt_i_mode)) + return ERR_PTR(-EINVAL); + + if (S_ISSOCK(fi->cpt_i_mode)) { + eprintk_ctx("bug: socket is not open\n"); + return ERR_PTR(-EINVAL); + } + + /* Support only (some) character devices at the moment. */ + if (!S_ISCHR(fi->cpt_i_mode)) + return ERR_PTR(-EINVAL); + + ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); + if (ii == NULL) + return ERR_PTR(-ENOMEM); + + /* Do not worry about this right now. /dev/null,zero,*random are here. + * To prohibit at least /dev/mem? + */ + if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) { + kfree(ii); + return NULL; + } + + /* /dev/net/tun will be opened by caller */ + if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) { + kfree(ii); + return NULL; + } + + file = rst_open_tty(fi, ii, flags, ctx); + kfree(ii); + return file; +} + +static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx) +{ + struct file_lock lock; + cpt_object_t *obj; + + memset(&lock, 0, sizeof(lock)); + lock.fl_type = fli->cpt_type; + lock.fl_flags = fli->cpt_flags & ~FL_SLEEP; + lock.fl_start = fli->cpt_start; + lock.fl_end = fli->cpt_end; + obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx); + if (!obj) { + eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner); + return -EINVAL; + } + lock.fl_owner = obj->o_obj; + lock.fl_pid = vpid_to_pid(fli->cpt_pid); + if (lock.fl_pid < 0) { + eprintk_ctx("unknown lock pid %d\n", lock.fl_pid); + return -EINVAL; + } + lock.fl_file = file; + + if (lock.fl_owner == NULL) + eprintk_ctx("no lock owner\n"); + return posix_lock_file(file, &lock, NULL); +} + +static int restore_flock(struct file *file, struct cpt_flock_image *fli, + cpt_context_t *ctx) +{ + int cmd, err, fd; + fd = get_unused_fd(); + if (fd < 0) { + eprintk_ctx("BSD flock cannot be restored\n"); + return fd; + } + get_file(file); + fd_install(fd, file); + if (fli->cpt_type == F_RDLCK) { + cmd = LOCK_SH; + } else if (fli->cpt_type == F_WRLCK) { + cmd = LOCK_EX; + } else { + eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type); + sc_close(fd); + return -EINVAL; + } + + err = sc_flock(fd, LOCK_NB | cmd); + sc_close(fd); + return err; +} + + +static int fixup_posix_locks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_POSIX)) { + err = restore_posix_lock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("posix lock restored\n"); + } + pos += 
fli.cpt_next; + } + return 0; +} + +int rst_posix_locks(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct cpt_file_image fi; + + if (obj->o_pos == CPT_NULL) + continue; + + err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); + if (err < 0) + return err; + if (fi.cpt_next > fi.cpt_hdrlen) + fixup_posix_locks(file, &fi, obj->o_pos, ctx); + } + return 0; +} + +static int fixup_flocks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_FLOCK)) { + err = restore_flock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("bsd lock restored\n"); + } + pos += fli.cpt_next; + } + return 0; +} + + +static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + int err; + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_op->write; + if (do_write == NULL) { + eprintk_ctx("no write method. Cannot restore contents of the file.\n"); + return -EINVAL; + } + + atomic_long_inc(&file->f_count); + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + goto out; + dprintk_ctx("restoring file data block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + goto out; + } + if (!(file->f_mode & FMODE_WRITE) || + (file->f_flags&O_DIRECT)) { + fput(file); + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_WRONLY); + if (IS_ERR(file)) { + __cpt_release_buf(ctx); + return PTR_ERR(file); + } + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + if (err >= 0) + err = -EIO; + goto out; + } + count -= copy; + } + pos += pgb.cpt_next; + } + err = 0; + +out: + fput(file); + return err; +} + + +static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, + struct cpt_inode_image *ii, + struct cpt_context *ctx) +{ + int err; + struct file *file = *file_p; + struct iattr newattrs; + + if (!S_ISREG(fi->cpt_i_mode)) + return 0; + + if (file == NULL) { + file = shmem_file_setup("dev/zero", ii->cpt_size, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + *file_p = file; + } + + if (ii->cpt_next > ii->cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen); + if (err) + return err; + if (hdr.cpt_object == CPT_OBJ_PAGES) { + err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen, + fi->cpt_inode+ii->cpt_next, ctx); + if (err) + return err; + } + } + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + /* stage 1 - update size like do_truncate does */ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_size = ii->cpt_size; + 
cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime); + err = notify_change(file->f_dentry, &newattrs); + if (err) + goto out; + + /* stage 2 - update times, owner and mode */ + newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME_SET | + ATTR_MODE | ATTR_UID | ATTR_GID; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT; + newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT); + cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime); + cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime); + err = notify_change(file->f_dentry, &newattrs); + +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return err; +} + +static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, + int was_dentry_open, loff_t pos, + cpt_context_t *ctx) +{ + if (fi->cpt_pos != file->f_pos) { + int err = -ESPIPE; + if (file->f_op->llseek) + err = file->f_op->llseek(file, fi->cpt_pos, 0); + if (err < 0) { + dprintk_ctx("file %Ld lseek %Ld - %Ld\n", + (long long)pos, + (long long)file->f_pos, + (long long)fi->cpt_pos); + file->f_pos = fi->cpt_pos; + } + } + file->f_uid = fi->cpt_uid; + file->f_gid = fi->cpt_gid; + file->f_owner.pid = 0; + if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) { + file->f_owner.pid = find_get_pid(fi->cpt_fown_pid); + if (file->f_owner.pid == NULL) { + wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", + fi->cpt_fown_pid); + return -EINVAL; + } + } + file->f_owner.uid = fi->cpt_fown_uid; + file->f_owner.euid = fi->cpt_fown_euid; + file->f_owner.signum = fi->cpt_fown_signo; + + if (file->f_mode != fi->cpt_mode) { + if (was_dentry_open && + ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { + file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); + file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); + } + if (file->f_mode != fi->cpt_mode) + wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); + } + if (file->f_flags != fi->cpt_flags) { + if (!(fi->cpt_flags&O_NOFOLLOW)) + file->f_flags &= ~O_NOFOLLOW; + if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { + file->f_flags &= ~O_NONBLOCK; + file->f_flags |= fi->cpt_flags&O_NONBLOCK; + } + if (fi->cpt_flags&FASYNC) { + if (fi->cpt_fown_fd == -1) { + wprintk_ctx("No fd for FASYNC\n"); + return -EINVAL; + } else if (file->f_op && file->f_op->fasync) { + if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { + wprintk_ctx("FASYNC problem\n"); + return -EINVAL; + } else { + file->f_flags |= FASYNC; + } + } + } + if (file->f_flags != fi->cpt_flags) { + eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); + return -EINVAL; + } + } + return 0; +} + +static struct file * +open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, + struct cpt_inode_image *ii, cpt_context_t *ctx) +{ + struct file * file; + char *suffix = NULL; + int attempt = 0; + int tmp_pass = 0; + mode_t mode = fi->cpt_i_mode; + + /* Strip (deleted) part... 
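+ * Two layouts are handled, e.g.
+ *   "/tmp/foo (deleted)"  ->  "/tmp/foo"
+ *   "(deleted) /tmp/foo"  ->  "/tmp/foo"
+ * ("/tmp/foo" is just an illustration).  "suffix" is left pointing
+ * at the end of the stripped name so the retry loop below can
+ * append a unique ".%08x" tag when the plain name already exists.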
*/ + if (strlen(name) > strlen(" (deleted)")) { + if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { + suffix = &name[strlen(name) - strlen(" (deleted)")]; + *suffix = 0; + } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { + memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); + suffix = name + strlen(name); + } + } + +try_again: + for (;;) { + if (attempt) { + if (attempt > 1000) { + eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); + return ERR_PTR(-EEXIST); + } + if (suffix == NULL) { + eprintk_ctx("open_deleted: no suffix\n"); + return ERR_PTR(-EEXIST); + } + sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); + } + attempt++; + + if (S_ISFIFO(mode)) { + int err; + err = sc_mknod(name, S_IFIFO|(mode&017777), 0); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = open_pipe(name, fi, flags, ctx); + sc_unlink(name); + } else if (S_ISCHR(mode)) { + int err; + err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_unlink(name); + } else if (S_ISDIR(mode)) { + int err; + err = sc_mkdir(name, mode&017777); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_rmdir(name); + } else { + file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); + if (IS_ERR(file)) { + if (PTR_ERR(file) == -EEXIST) + continue; + if (!tmp_pass) + goto change_dir; + } else { + sc_unlink(name); + } + } + break; + } + + if (IS_ERR(file)) { + eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); + return file; + } else { + dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); + } + return file; + +change_dir: + sprintf(name, "/tmp/rst%u", current->pid); + suffix = name + strlen(name); + attempt = 1; + tmp_pass = 1; + goto try_again; +} + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) +{ + int err; + int was_dentry_open = 0; + cpt_object_t *obj; + cpt_object_t *iobj; + struct cpt_file_image fi; + __u8 *name = NULL; + struct file *file; + int flags; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); + if (obj) { + file = obj->o_obj; + if (obj->o_index >= 0) { + dprintk_ctx("file is attached to a socket\n"); + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + fixup_file_flags(file, &fi, 0, pos, ctx); + } + get_file(file); + return file; + } + + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + + flags = make_flags(&fi); + + /* Easy way, inode has been already open. 
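+ * This covers e.g. the second end of a pipe or another descriptor
+ * of a file restored earlier: the CPT_OBJ_INODE entry carries a
+ * struct file in o_parent, and a new file with the wanted flags is
+ * made from its dentry/vfsmount via dentry_open(), no path lookup.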
*/ + if (fi.cpt_inode != CPT_NULL && + !(fi.cpt_lflags & CPT_DENTRY_CLONING) && + (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && + iobj->o_parent) { + struct file *filp = iobj->o_parent; + file = dentry_open(dget(filp->f_dentry), + mntget(filp->f_vfsmnt), flags); + dprintk_ctx("rst_file: file obtained by dentry_open\n"); + was_dentry_open = 1; + goto map_file; + } + + if (fi.cpt_next > fi.cpt_hdrlen) + name = rst_get_name(pos + sizeof(fi), ctx); + + if (!name) { + eprintk_ctx("no name for file?\n"); + err = -EINVAL; + goto err_out; + } + + if (fi.cpt_lflags & CPT_DENTRY_DELETED) { + struct cpt_inode_image ii; + if (fi.cpt_inode == CPT_NULL) { + eprintk_ctx("deleted file and no inode.\n"); + err = -EINVAL; + goto err_out; + } + + err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx); + if (err) + goto err_out; + + if (ii.cpt_next > ii.cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, + fi.cpt_inode + ii.cpt_hdrlen); + if (err) + goto err_out; + if (hdr.cpt_object == CPT_OBJ_NAME) { + rst_put_name(name, ctx); + name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen, + ctx); + if (!name) { + eprintk_ctx("no name for link?\n"); + err = -EINVAL; + goto err_out; + } + goto open_file; + } + } + + /* One very special case... */ + if (S_ISREG(fi.cpt_i_mode) && + (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { + /* MAP_ANON|MAP_SHARED mapping. + * kernel makes this damn ugly way, when file which + * is passed to mmap by user does not match + * file finally attached to VMA. Ok, rst_mm + * has to take care of this. Otherwise, it will fail. + */ + file = NULL; + } else if (S_ISREG(fi.cpt_i_mode) || + S_ISCHR(fi.cpt_i_mode) || + S_ISFIFO(fi.cpt_i_mode) || + S_ISDIR(fi.cpt_i_mode)) { + if (S_ISCHR(fi.cpt_i_mode)) { + file = open_special(&fi, flags, 1, ctx); + if (file != NULL) + goto map_file; + } + file = open_deleted(name, flags, &fi, &ii, ctx); + if (IS_ERR(file)) + goto out; + } else { + eprintk_ctx("not a regular deleted file.\n"); + err = -EINVAL; + goto err_out; + } + + err = fixup_file_content(&file, &fi, &ii, ctx); + if (err) + goto err_put; + goto map_file; + } else { +open_file: + if (!name[0]) { + eprintk_ctx("empty name for file?\n"); + err = -EINVAL; + goto err_out; + } + if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && + (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) + goto map_file; +#ifdef CONFIG_INOTIFY_USER + if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) && + (file = rst_open_inotify(&fi, flags, ctx)) != NULL) + goto map_file; +#else + if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) { + err = -EINVAL; + goto err_out; + } +#endif + if (S_ISFIFO(fi.cpt_i_mode) && + (file = open_pipe(name, &fi, flags, ctx)) != NULL) + goto map_file; + if (!S_ISREG(fi.cpt_i_mode) && + (file = open_special(&fi, flags, 0, ctx)) != NULL) + goto map_file; + } + + file = filp_open(name, flags, 0); + +map_file: + if (!IS_ERR(file)) { + fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); + + if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { + err = fixup_pipe_data(file, &fi, ctx); + if (err) + goto err_put; + } + + /* This is very special hack. Logically, cwd/root are + * nothing but open directories. Nevertheless, this causes + * failures of restores, when number of open files in VE + * is close to limit. So, if it is rst_file() of cwd/root + * (fd = -2) and the directory is not deleted, we skip + * adding files to object table. If the directory is + * not unlinked, this cannot cause any problems. 
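+ *
+ * cpt_get_dentry() passes fd == -2 for such cwd/root lookups: it
+ * only needs dentry/vfsmount references and fput()s the file right
+ * away, so keeping these directories out of the table avoids
+ * pinning extra open files.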
+ */ + if (fd != -2 || + !S_ISDIR(file->f_dentry->d_inode->i_mode) || + (fi.cpt_lflags & CPT_DENTRY_DELETED)) { + obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); + if (!obj) { + obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); + if (obj) + get_file(file); + } + if (obj) + cpt_obj_setpos(obj, pos, ctx); + + obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (obj) { + cpt_obj_setpos(obj, fi.cpt_inode, ctx); + if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED)) + obj->o_parent = file; + } + } + + if (fi.cpt_next > fi.cpt_hdrlen) { + err = fixup_flocks(file, &fi, pos, ctx); + if (err) + goto err_put; + } + } else { + if (fi.cpt_lflags & CPT_DENTRY_PROC) { + dprintk_ctx("rst_file /proc delayed\n"); + file = NULL; + } else if (name) + eprintk_ctx("can't open file %s\n", name); + } + +out: + if (name) + rst_put_name(name, ctx); + return file; + +err_put: + if (file) + fput(file); +err_out: + if (name) + rst_put_name(name, ctx); + return ERR_PTR(err); +} + + +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (ti->cpt_files == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) + flag |= CLONE_FILES; + if (ti->cpt_fs == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) + flag |= CLONE_FS; + return flag; +} + +static void local_close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->fdt->max_fds) + break; + set = files->fdt->open_fds->fds_bits[j]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fdt->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + files->fdt->open_fds->fds_bits[j] = 0; + files->fdt->close_on_exec->fds_bits[j] = 0; + j++; + } +} + +extern int expand_fdtable(struct files_struct *files, int nr); + + +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct cpt_files_struct_image fi; + struct files_struct *f = current->files; + cpt_object_t *obj; + loff_t pos, endpos; + int err; + + if (ti->cpt_files == CPT_NULL) { + current->files = NULL; + if (f) + put_files_struct(f); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); + if (obj) { + if (obj->o_obj != f) { + put_files_struct(f); + f = obj->o_obj; + atomic_inc(&f->count); + current->files = f; + } + return 0; + } + + err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); + if (err) + return err; + + local_close_files(f); + + if (fi.cpt_max_fds > f->fdt->max_fds) { + spin_lock(&f->file_lock); + err = expand_fdtable(f, fi.cpt_max_fds-1); + spin_unlock(&f->file_lock); + if (err < 0) + return err; + } + + pos = ti->cpt_files + fi.cpt_hdrlen; + endpos = ti->cpt_files + fi.cpt_next; + while (pos < endpos) { + struct cpt_fd_image fdi; + struct file *filp; + + err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); + if (err) + return err; + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), + (long long)fdi.cpt_file); + return PTR_ERR(filp); + } + if (filp == NULL) { + int err = rst_filejob_queue(pos, ctx); + if (err) + return err; + } else { + if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); + f->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, f->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); + } + pos += fdi.cpt_next; + } + f->next_fd = fi.cpt_next_fd; + + obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); + if 
(obj) { + cpt_obj_setpos(obj, ti->cpt_files, ctx); + cpt_obj_setindex(obj, fi.cpt_index, ctx); + } + return 0; +} + +int rst_do_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + int err; + struct task_struct *tsk; + struct cpt_fd_image fdi; + struct file *filp; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(j->pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (!tsk) + return -EINVAL; + + err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); + if (err) { + put_task_struct(tsk); + return err; + } + + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + if (tsk->files->fdt->fd[fdi.cpt_fd] || + FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { + eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi); + put_task_struct(tsk); + return -EBUSY; + } + + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file); + put_task_struct(tsk); + return PTR_ERR(filp); + } + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + tsk->files->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); + + dprintk_ctx("filejob %Ld done\n", j->fdi); + + put_task_struct(tsk); + ctx->filejob_queue = j->next; + kfree(j); + } + return 0; +} + +void rst_flush_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + ctx->filejob_queue = j->next; + kfree(j); + } +} + +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct fs_struct *f = current->fs; + cpt_object_t *obj; + + if (ti->cpt_fs == CPT_NULL) { + exit_fs(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_fs(current); + f = obj->o_obj; + atomic_inc(&f->count); + current->fs = f; + } + return 0; + } + + /* Do _not_ restore root. Image contains absolute pathnames. + * So, we fix it in context of rst process. 
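+ *
+ * The actual root/cwd are installed later by rst_restore_fs(),
+ * which reads the CPT_OBJ_FS image and applies __set_fs_root() /
+ * __set_fs_pwd() to each restored fs_struct.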
+ */ + + obj = cpt_object_add(CPT_OBJ_FS, f, ctx); + if (obj) + cpt_obj_setpos(obj, ti->cpt_fs, ctx); + + return 0; +} + +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx) +{ + struct cpt_file_image fi; + struct file * file; + int err; + + err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); + if (err) + return err; + + file = rst_file(*pos, -2, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + + *dp = dget(file->f_dentry); + *mp = mntget(file->f_vfsmnt); + *pos += fi.cpt_next; + fput(file); + return 0; +} + +static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_root; + struct vfsmount *old_rootmnt; + write_lock(&fs->lock); + old_root = fs->root.dentry; + old_rootmnt = fs->root.mnt; + fs->root.mnt = mnt; + fs->root.dentry = dentry; + write_unlock(&fs->lock); + if (old_root) { + dput(old_root); + mntput(old_rootmnt); + } +} + +static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_pwd; + struct vfsmount *old_pwdmnt; + + write_lock(&fs->lock); + old_pwd = fs->pwd.dentry; + old_pwdmnt = fs->pwd.mnt; + fs->pwd.mnt = mnt; + fs->pwd.dentry = dentry; + write_unlock(&fs->lock); + + if (old_pwd) { + dput(old_pwd); + mntput(old_pwdmnt); + } +} + + +int rst_restore_fs(struct cpt_context *ctx) +{ + loff_t pos; + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FS) { + struct cpt_fs_struct_image fi; + struct fs_struct *fs = obj->o_obj; + int i; + struct dentry *d[3]; + struct vfsmount *m[3]; + + err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); + if (err) + return err; + + fs->umask = fi.cpt_umask; + + pos = obj->o_pos + fi.cpt_hdrlen; + d[0] = d[1] = d[2] = NULL; + m[0] = m[1] = m[2] = NULL; + i = 0; + while (pos < obj->o_pos + fi.cpt_next && i<3) { + err = cpt_get_dentry(d+i, m+i, &pos, ctx); + if (err) { + eprintk_ctx("cannot get_dir: %d", err); + for (--i; i >= 0; i--) { + if (d[i]) + dput(d[i]); + if (m[i]) + mntput(m[i]); + } + return err; + } + i++; + } + if (d[0]) + __set_fs_root(fs, m[0], d[0]); + if (d[1]) + __set_fs_pwd(fs, m[1], d[1]); + if (d[2]) + wprintk_ctx("altroot arrived...\n"); + } + return err; +} + +int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, + unsigned long flags, unsigned long mnt_flags, + struct cpt_context *ctx) +{ + int err; + + if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) + mntbind = NULL; + + if (mntbind) + flags |= MS_BIND; + /* Join per-mountpoint flags with global flags */ + if (mnt_flags & MNT_NOSUID) + flags |= MS_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= MS_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= MS_NOEXEC; + + err = sc_mount(mntbind, mntpnt, mnttype, flags); + if (err < 0) { + eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); + return err; + } + return 0; +} + +static int undumptmpfs(void *arg) +{ + int i; + int *pfd = arg; + int fd1, fd2, err; + char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; + + if (pfd[0] != 0) + sc_dup2(pfd[0], 0); + + set_fs(KERNEL_DS); + fd1 = sc_open("/dev/null", O_WRONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); +try: + if (fd1 < 0 || fd2 < 0) { + if (fd1 == -ENOENT && fd2 == -ENOENT) { + err = sc_mknod("/dev/null", S_IFCHR|0666, + new_encode_dev((MEM_MAJOR<files->fdt->max_fds; i++) + sc_close(i); + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int 
rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) +{ + int err; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + int n; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = *pos + v.cpt_hdrlen; + end = *pos + v.cpt_next; + *pos += v.cpt_next; + do { + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n = sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx) +{ + struct mnt_namespace *n; + struct list_head *p; + struct vfsmount *t; + char *path, *path_buf; + int ret; + + n = current->nsproxy->mnt_ns; + ret = -ENOENT; + path_buf = cpt_get_buf(ctx); + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct path pt; + t = list_entry(p, struct vfsmount, mnt_list); + pt.dentry = t->mnt_root; + pt.mnt = t; + path = d_path(&pt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + if (!strcmp(path, mntpnt) && + !strcmp(t->mnt_sb->s_type->name, mnttype)) { + ret = 0; + break; + } + } + up_read(&namespace_sem); + __cpt_release_buf(ctx); + return ret; +} + +int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t endpos; + + endpos = pos + mi->cpt_next; + pos += mi->cpt_hdrlen; + + while (pos < endpos) { + char *mntdev; + char *mntpnt; + char *mnttype; + char *mntbind; + + mntdev = __rst_get_name(&pos, ctx); + mntpnt = __rst_get_name(&pos, ctx); + mnttype = __rst_get_name(&pos, ctx); + mntbind = NULL; + if (mi->cpt_mntflags & CPT_MNT_BIND) + mntbind = __rst_get_name(&pos, ctx); + err = -EINVAL; + if (mnttype && mntpnt) { + err = 0; + if (!(mi->cpt_mntflags & CPT_MNT_EXT) && + strcmp(mntpnt, "/")) { + err = do_one_mount(mntpnt, mnttype, mntbind, + mi->cpt_flags, + mi->cpt_mntflags, ctx); + if (!err && + strcmp(mnttype, "tmpfs") == 0 && + !(mi->cpt_mntflags & (CPT_MNT_BIND))) + err = rst_restore_tmpfs(&pos, ctx); + } else if (mi->cpt_mntflags & CPT_MNT_EXT) { + err = check_ext_mount(mntpnt, mnttype, ctx); + if (err) + eprintk_ctx("mount point is missing: %s\n", mntpnt); + } + } + if (mntdev) + rst_put_name(mntdev, ctx); + if (mntpnt) + rst_put_name(mntpnt, ctx); + if (mnttype) + rst_put_name(mnttype, ctx); + if (mntbind) + rst_put_name(mntbind, ctx); + if (err) + return err; + } + return 0; +} + +int restore_one_namespace(loff_t pos, 
loff_t endpos, struct cpt_context *ctx) +{ + int err; + struct cpt_vfsmount_image mi; + + while (pos < endpos) { + err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); + if (err) + return err; + err = restore_one_vfsmount(&mi, pos, ctx); + if (err) + return err; + pos += mi.cpt_next; + } + return 0; +} + +int rst_root_namespace(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr sbuf; + int done = 0; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); + if (err) + return err; + if (done) { + eprintk_ctx("multiple namespaces are not supported\n"); + break; + } + done++; + err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + + return 0; +} + +int rst_stray_files(struct cpt_context *ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_FILES]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_object_hdr sbuf; + cpt_object_t *obj; + + err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); + if (err) + break; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); + if (!obj) { + struct file *file; + + dprintk_ctx("stray file %Ld\n", sec); + + file = rst_sysv_shm_itself(sec, ctx); + + if (IS_ERR(file)) { + eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } else { + fput(file); + } + } + sec += sbuf.cpt_next; + } + + return err; +} diff --git a/kernel/cpt/rst_inotify.c b/kernel/cpt/rst_inotify.c new file mode 100644 index 0000000..0dcaf47 --- /dev/null +++ b/kernel/cpt/rst_inotify.c @@ -0,0 +1,196 @@ +/* + * + * kernel/cpt/rst_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations inotify_fops; + +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int fd; + + fd = sys_inotify_init(); + if (fd < 0) + return ERR_PTR(fd); + + file = fget(fd); + sys_close(fd); + return file; +} + +static int restore_one_inotify(cpt_object_t *obj, + loff_t pos, + struct cpt_inotify_image *ibuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct inotify_device *dev; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + dev = file->private_data; + + if (unlikely(dev == NULL)) { + eprintk_ctx("bad inotify device\n"); + return -EINVAL; + } + + endpos = pos + ibuf->cpt_next; + pos += ibuf->cpt_hdrlen; + while (pos < endpos) { + union { + struct cpt_inotify_wd_image wi; + struct cpt_inotify_ev_image ei; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + eprintk_ctx("rst_get_object: %d\n", err); + return err; + } + if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) { + struct path p; + loff_t fpos = pos + u.wi.cpt_hdrlen; + + err = cpt_get_dentry(&p.dentry, &p.mnt, &fpos, ctx); + if (err) { + eprintk_ctx("cpt_get_dentry: %d\n", err); + return err; + } + + mutex_lock(&dev->up_mutex); + dev->ih->last_wd = u.wi.cpt_wd - 1; + err = inotify_create_watch(dev, &p, u.wi.cpt_mask); + dev->ih->last_wd = ibuf->cpt_last_wd; + if (err != u.wi.cpt_wd) { + eprintk_ctx("wrong inotify descriptor %u %u\n", err, u.wi.cpt_wd); + if (err >= 0) + err = -EINVAL; + } else + err = 0; + mutex_unlock(&dev->up_mutex); + path_put(&p); + if (err) + break; + } else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) { + struct inotify_user_watch dummy_watch; + struct inotify_watch *w; + char *name = NULL; + + if (u.ei.cpt_namelen) { + name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL); + if (name == NULL) { + err = -ENOMEM; + break; + } + name[u.ei.cpt_namelen] = 0; + err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen); + if (err) { + kfree(name); + break; + } + } + + w = &dummy_watch.wdata; + dummy_watch.dev = dev; + atomic_set(&w->count, 2); + + /* Trick to avoid destruction due to exit event */ + if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT)) + atomic_inc(&w->count); + dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask, + u.ei.cpt_cookie, name, NULL); + if (name) + kfree(name); + } else { + eprintk_ctx("bad object: %u\n", u.wi.cpt_object); + err = -EINVAL; + break; + } + pos += u.wi.cpt_next; + } + return err; +} + +int rst_inotify(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_INOTIFY]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_inotify_image ibuf; + + err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx); + if (err) + return err; + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, 
ibuf.cpt_file, ctx); + if (obj == NULL) { + eprintk_ctx("cannot find inotify file object\n"); + return -EINVAL; + } + err = restore_one_inotify(obj, sec, &ibuf, ctx); + if (err) + return err; + sec += ibuf.cpt_next; + } + + return 0; + +} diff --git a/kernel/cpt/rst_mm.c b/kernel/cpt/rst_mm.c new file mode 100644 index 0000000..377e2e8 --- /dev/null +++ b/kernel/cpt/rst_mm.c @@ -0,0 +1,1151 @@ +/* + * + * kernel/cpt/rst_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#include +#endif +#include +#include +#include +#include + +#ifdef CONFIG_VE +#include +#include +#endif + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_ubc.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif + +#include "cpt_syscalls.h" + +#define __PAGE_NX (1ULL<<63) + +static unsigned long make_prot(struct cpt_vma_image *vmai) +{ + unsigned long prot = 0; + + if (vmai->cpt_flags&VM_READ) + prot |= PROT_READ; + if (vmai->cpt_flags&VM_WRITE) + prot |= PROT_WRITE; + if (vmai->cpt_flags&VM_EXEC) + prot |= PROT_EXEC; + if (vmai->cpt_flags&VM_GROWSDOWN) + prot |= PROT_GROWSDOWN; + if (vmai->cpt_flags&VM_GROWSUP) + prot |= PROT_GROWSUP; + return prot; +} + +static unsigned long make_flags(struct cpt_vma_image *vmai) +{ + unsigned long flags = MAP_FIXED; + + if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) + flags |= MAP_SHARED; + else + flags |= MAP_PRIVATE; + + if (vmai->cpt_file == CPT_NULL) + flags |= MAP_ANONYMOUS; + if (vmai->cpt_flags&VM_GROWSDOWN) + flags |= MAP_GROWSDOWN; +#ifdef MAP_GROWSUP + if (vmai->cpt_flags&VM_GROWSUP) + flags |= MAP_GROWSUP; +#endif + if (vmai->cpt_flags&VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vmai->cpt_flags&VM_EXECUTABLE) + flags |= MAP_EXECUTABLE; + if (!(vmai->cpt_flags&VM_ACCOUNT)) + flags |= MAP_NORESERVE; + return flags; +} + +#ifdef CONFIG_X86 +#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \ + && !defined(CONFIG_XEN) +static int __alloc_ldt(mm_context_t *pc, int mincount) +{ + int oldsize, newsize, nr; + + if (mincount <= pc->size) + return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ + oldsize = pc->size; + mincount = (mincount+511)&(~511); + newsize = mincount*LDT_ENTRY_SIZE; + for (nr = 0; nr * PAGE_SIZE < newsize; nr++) { + BUG_ON(nr * PAGE_SIZE >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); + if (!pc->ldt_pages[nr]) + goto nomem; + clear_highpage(pc->ldt_pages[nr]); + } + } + pc->size = mincount; + return 0; + +nomem: + while (--nr >= 0) + __free_page(pc->ldt_pages[nr]); + pc->size = 0; + return -ENOMEM; +} + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int i; + int err; + int size; + + err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); + if (err) + return err; + + size = mm->context.size*LDT_ENTRY_SIZE; + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); + kunmap(mm->context.ldt_pages[nr]); + if (err) + return err; + } + + load_LDT(&mm->context); + return 0; +} + +#else + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int oldsize = mm->context.size; + void *oldldt; + void *newldt; + int err; + + if (li->cpt_size > PAGE_SIZE) + newldt = vmalloc(li->cpt_size); + else + newldt = kmalloc(li->cpt_size, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); + if (err) + return err; + + oldldt = mm->context.ldt; + mm->context.ldt = newldt; + mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; + + load_LDT(&mm->context); + + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} +#endif +#endif + +static int +restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) +{ + struct aio_ring_info *info = &aio_ctx->ring_info; + unsigned nr_events = aio_ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* We recalculate parameters of the ring exactly like + * fs/aio.c does and then compare calculated values + * with ones, stored in dump. They must be the same. */ + + nr_events += 2; + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages != aimg->cpt_ring_pages) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + if (nr_events != aimg->cpt_nr) + return -EINVAL; + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + + /* This piece of shit is not entirely my fault. Kernel aio.c makes + * something odd mmap()ping some pages and then pinning them. + * I guess it is just some mud remained of failed attempt to show ring + * to user space. The result is odd. :-) Immediately after + * creation of AIO context, kernel shares those pages with user + * and user can read and even write there. But after the first + * fork, pages are marked COW with evident consequences. 
+ * I remember, I did the same mistake in the first version + * of mmapped packet socket, luckily that crap never reached + * mainstream. + * + * So, what are we going to do? I can simulate this odd behaviour + * exactly, but I am not insane yet. For now just take the pages + * from user space. Alternatively, we could keep kernel copy + * in AIO context image, which would be more correct. + * + * What is wrong now? If the pages are COWed, ring is transferred + * incorrectly. + */ + down_read(&current->mm->mmap_sem); + info->mmap_base = aimg->cpt_mmap_base; + info->nr_pages = get_user_pages(current, current->mm, + info->mmap_base, nr_pages, + 1, 0, info->ring_pages, NULL); + up_read(&current->mm->mmap_sem); + + if (unlikely(info->nr_pages != nr_pages)) { + int i; + + for (i=0; i<info->nr_pages; i++) + put_page(info->ring_pages[i]); + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + return -EFAULT; + } + + aio_ctx->user_id = info->mmap_base; + + info->nr = nr_events; + info->tail = aimg->cpt_tail; + + return 0; +} + +static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx) +{ + int err; + struct kioctx *aio_ctx; + extern spinlock_t aio_nr_lock; + + aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!aio_ctx) + return -ENOMEM; + + memset(aio_ctx, 0, sizeof(*aio_ctx)); + aio_ctx->max_reqs = aimg->cpt_max_reqs; + + if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) { + kmem_cache_free(kioctx_cachep, aio_ctx); + eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err); + return err; + } + + aio_ctx->mm = current->mm; + atomic_inc(&aio_ctx->mm->mm_count); + atomic_set(&aio_ctx->users, 1); + spin_lock_init(&aio_ctx->ctx_lock); + spin_lock_init(&aio_ctx->ring_info.ring_lock); + init_waitqueue_head(&aio_ctx->wait); + INIT_LIST_HEAD(&aio_ctx->active_reqs); + INIT_LIST_HEAD(&aio_ctx->run_list); + INIT_WORK(&aio_ctx->wq.work, aio_kick_handler); + + spin_lock(&aio_nr_lock); + aio_nr += aio_ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + write_lock(&aio_ctx->mm->ioctx_list_lock); + aio_ctx->next = aio_ctx->mm->ioctx_list; + aio_ctx->mm->ioctx_list = aio_ctx; + write_unlock(&aio_ctx->mm->ioctx_list_lock); + + return 0; +} + +struct anonvma_map +{ + struct hlist_node list; + struct anon_vma *avma; + __u64 id; +}; + +static int verify_create_anonvma(struct mm_struct *mm, + struct cpt_vma_image *vmai, + cpt_context_t *ctx) +{ + struct anon_vma *avma = NULL; + struct anon_vma *new_avma; + struct vm_area_struct *vma; + int h; + + if (!ctx->anonvmas) { + if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE) + return -EINVAL; + if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL) + return -ENOMEM; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) + INIT_HLIST_HEAD(&ctx->anonvmas[h]); + } else { + struct anonvma_map *map; + struct hlist_node *elem; + + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) { + if (map->id == vmai->cpt_anonvmaid) { + avma = map->avma; + break; + } + } + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + return -ESRCH; + } + if (vma->vm_start != vmai->cpt_start) { + up_read(&mm->mmap_sem); + eprintk_ctx("vma start mismatch\n"); + return -EINVAL; + } + if (vma->vm_pgoff != vmai->cpt_pgoff) { + dprintk_ctx("vma pgoff mismatch, fixing\n"); + if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) { + eprintk_ctx("cannot fixup vma pgoff\n"); + up_read(&mm->mmap_sem); + 
return -EINVAL; + } + vma->vm_pgoff = vmai->cpt_pgoff; + } + + if (!vma->anon_vma) { + if (avma) { + vma->anon_vma = avma; + anon_vma_link(vma); + } else { + int err; + + err = anon_vma_prepare(vma); + + if (err) { + up_read(&mm->mmap_sem); + return err; + } + } + } else { + /* Note, we _can_ arrive to the situation, when two + * different anonvmaid's point to one anon_vma, this happens + * f.e. when mmap() merged new area to previous one and + * they will share one anon_vma even if they did not on + * original host. + * + * IT IS OK. To all that I understand, we may merge all + * the anon_vma's and rmap can scan all the huge list of vmas + * searching for page. It is just "suboptimal". + * + * Real disaster would happen, if vma already got an anon_vma + * with different id. It is very rare case, kernel does the + * best efforts to merge anon_vmas when some attributes are + * different. In this case we will fall to copying memory. + */ + if (avma && vma->anon_vma != avma) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch\n"); + return 0; + } + } + + new_avma = vma->anon_vma; + up_read(&mm->mmap_sem); + + if (!avma) { + struct anonvma_map *map; + + if (!new_avma) + return -EINVAL; + + if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) + return -ENOMEM; + + map->id = vmai->cpt_anonvmaid; + map->avma = new_avma; + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_add_head(&map->list, &ctx->anonvmas[h]); + } + return 0; +} + +static int copy_mm_pages(struct mm_struct *src, unsigned long start, + unsigned long end) +{ + int err; + + for (; start < end; start += PAGE_SIZE) { + struct page *page; + struct page *spage; + void *maddr, *srcaddr; + + err = get_user_pages(current, current->mm, + start, 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) + return err; + + err = get_user_pages(current, src, + start, 1, 0, 1, &spage, NULL); + + if (err == 0) + err = -EFAULT; + if (err < 0) { + page_cache_release(page); + return err; + } + + srcaddr = kmap(spage); + maddr = kmap(page); + memcpy(maddr, srcaddr, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + kunmap(spage); + page_cache_release(page); + page_cache_release(spage); + } + return 0; +} + +#include + +static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) +{ + int err = 0; + unsigned long addr; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct file *file = NULL; + unsigned long prot; + int checked = 0; + + if (vmai->cpt_type == CPT_VMA_VDSO) { + if (ctx->vdso == NULL) { +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + err = arch_setup_additional_pages(NULL, 0, + vmai->cpt_start); +#endif + goto out; + } + } + + prot = make_prot(vmai); + + if (vmai->cpt_file != CPT_NULL) { + if (vmai->cpt_type == CPT_VMA_TYPE_0) { + file = rst_file(vmai->cpt_file, -1, ctx); + if (IS_ERR(file)) { + eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file); + return PTR_ERR(file); + } + } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { + file = rst_sysv_shm_vma(vmai, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + } + } + + down_write(&mm->mmap_sem); + + if ((make_flags(vmai) & VM_EXECUTABLE) && mm->exe_file != file) + set_mm_exe_file(mm, file); + + addr = do_mmap_pgoff(file, vmai->cpt_start, + vmai->cpt_end-vmai->cpt_start, + prot, make_flags(vmai), + vmai->cpt_pgoff); + + if (addr != vmai->cpt_start) { + up_write(&mm->mmap_sem); + + err = -EINVAL; + if (IS_ERR((void*)addr)) + err = addr; + goto out; 
+ } + + vma = find_vma(mm, vmai->cpt_start); + if (vma == NULL) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot find mmapped vma\n"); + err = -ESRCH; + goto out; + } + + /* do_mmap_pgoff() can merge new area to previous one (not to the next, + * we mmap in order, the rest of mm is still unmapped). This can happen + * f.e. if flags are to be adjusted later, or if we had different + * anon_vma on two adjacent regions. Split it by brute force. */ + if (vma->vm_start != vmai->cpt_start) { + dprintk_ctx("vma %Ld merged, split\n", vmapos); + err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); + if (err) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot split vma\n"); + goto out; + } + } + up_write(&mm->mmap_sem); + + if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { + err = verify_create_anonvma(mm, vmai, ctx); + if (err) { + eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); + goto out; + } + } + + if (vmai->cpt_type == CPT_VMA_VDSO) { + struct page *page; + void *maddr; + + err = get_user_pages(current, current->mm, + (unsigned long)vmai->cpt_start, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("can't get vdso: get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + memcpy(maddr, ctx->vdso, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + goto out; + } + + if (vmai->cpt_next > vmai->cpt_hdrlen) { + loff_t offset = vmapos + vmai->cpt_hdrlen; + + do { + union { + struct cpt_page_block pb; + struct cpt_remappage_block rpb; + struct cpt_copypage_block cpb; + struct cpt_lazypage_block lpb; + struct cpt_iterpage_block ipb; + } u; + loff_t pos; + + err = rst_get_object(-1, offset, &u, ctx); + if (err) { + eprintk_ctx("vma fix object: %d\n", err); + goto out; + } + if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { + err = sc_remap_file_pages(u.rpb.cpt_start, + u.rpb.cpt_end-u.rpb.cpt_start, + 0, u.rpb.cpt_pgoff, 0); + if (err < 0) { + eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, + (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), + (__u32)u.rpb.cpt_pgoff); + goto out; + } + offset += u.rpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + unsigned long ptr = u.lpb.cpt_start; + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE, + ptr, ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { + struct vm_area_struct *vma, *vma1; + struct mm_struct *src; + struct anon_vma *src_anon; + cpt_object_t *mobj; + + if (!vmai->cpt_anonvmaid) { + err = -EINVAL; + eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); + if (!mobj) { + eprintk_ctx("lost mm_struct to clone pages from\n"); + err = -ESRCH; + goto out; + } + src = mobj->o_obj; + + down_read(&src->mmap_sem); + src_anon = NULL; + vma1 = find_vma(src, u.cpb.cpt_start); + if (vma1) + src_anon = vma1->anon_vma; + up_read(&src->mmap_sem); + + if (!vma1) { + eprintk_ctx("lost src vm_area_struct\n"); 
+ err = -ESRCH; + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + if (!src_anon || + !vma->anon_vma || + vma->anon_vma != src_anon || + vma->vm_start - vma1->vm_start != + (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); + err = copy_mm_pages(mobj->o_obj, + u.cpb.cpt_start, + u.cpb.cpt_end); + } else { + err = __copy_page_range(vma, vma1, + u.cpb.cpt_start, + u.cpb.cpt_end-u.cpb.cpt_start); + up_read(&mm->mmap_sem); + } + if (err) { + eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, + (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), + (long)u.cpb.cpt_source); + goto out; + } + + offset += u.cpb.cpt_next; + continue; + } else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES || + u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES + ) { +#ifdef CONFIG_VZ_CHECKPOINT_ITER + unsigned long ptr = u.lpb.cpt_start; + u64 page_pos[16]; + pos = offset + sizeof(u.pb); + + err = ctx->pread(&page_pos, + 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE, + ctx, + pos); + if (err) { + eprintk_ctx("Oops\n"); + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_iter(vma, + page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE], + ptr, + ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) { + make_pages_present((unsigned long)u.lpb.cpt_start, + (unsigned long)u.lpb.cpt_end); + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } + if (u.pb.cpt_object != CPT_OBJ_PAGES) { + eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); + err = -EINVAL; + goto out; + } + pos = offset + sizeof(u.pb); + if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { + /* I guess this is get_user_pages() messed things, + * this happens f.e. when gdb inserts breakpoints. 
+ */ + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { + struct page *page; + void *maddr; + err = get_user_pages(current, current->mm, + (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + memset(maddr, 0, PAGE_SIZE); + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + err = ctx->pread(maddr, PAGE_SIZE, + ctx, pos + i*PAGE_SIZE); + if (err) { + kunmap(page); + goto out; + } + } else { + err = -EINVAL; + kunmap(page); + goto out; + } + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + } + } else { + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { + err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); + if (err) { + eprintk_ctx("__put_user 2 %d\n", err); + goto out; + } + } + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + loff_t tpos = pos; + err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), + u.pb.cpt_end-u.pb.cpt_start, + &tpos); + if (err != u.pb.cpt_end-u.pb.cpt_start) { + if (err >= 0) + err = -EIO; + goto out; + } + } else { + err = -EINVAL; + goto out; + } + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + } + err = 0; + offset += u.pb.cpt_next; + } while (offset < vmapos + vmai->cpt_next); + } + +check: + do { + struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (vma) { + if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { + VM_ClearReadHint(vma); + vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; + } + if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { + dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); + up_read(&mm->mmap_sem); + if (vma->vm_flags&VM_LOCKED) + err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + else + err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + /* When mlock fails with EFAULT, it means + * that it could not bring in pages. + * It can happen after mlock() on unreadable + * VMAs. But VMA is correctly locked, + * so that this error can be ignored. */ + if (err == -EFAULT) + err = 0; + if (err) + goto out; + goto check; + } + if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (unsigned long long)vma->vm_page_prot.pgprot, + (unsigned long long)vmai->cpt_pgprot); +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && + (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); +#endif + if (vma->vm_flags != vmai->cpt_flags) { + unsigned long x = vma->vm_flags ^ vmai->cpt_flags; + if (x & VM_EXEC) { + /* Crap. On i386 this is OK. + * It is impossible to make via mmap/mprotect + * exec.c clears VM_EXEC on stack. 
*/ + vma->vm_flags &= ~VM_EXEC; + } else if ((x & VM_ACCOUNT) && !checked) { + checked = 1; + if (!(prot&PROT_WRITE)) { + up_read(&mm->mmap_sem); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + goto check; + } + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } else { + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } + } + } else { + wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); + } + up_read(&mm->mmap_sem); + } while (0); + +out: + if (file) + fput(file); + return err; +} + +#ifndef CONFIG_IA64 +#define TASK_UNMAP_START 0 +#else +/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping + * used to accelerate speculative dereferences of NULL pointer. */ +#define TASK_UNMAP_START PAGE_SIZE +#endif + +static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) +{ + int err = 0; + unsigned int def_flags; + struct mm_struct *mm = current->mm; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + down_write(&mm->mmap_sem); + do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START); + +#ifdef CONFIG_BEANCOUNTERS + /* + * MM beancounter is usually correct from the fork time, + * but not for init, for example. + * Luckily, mm_ub can be changed for a completely empty MM. + */ + bc = rst_lookup_ubc(vmi->cpt_mmub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc); + if (err & NOTIFY_FAIL) { + up_write(&mm->mmap_sem); + return -ECHRNG; + } + if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) { + struct user_beancounter *old_bc; + + old_bc = mm->mm_ub; + mm->mm_ub = bc; + bc = old_bc; + } + err = 0; + put_beancounter(bc); +#endif + + mm->start_code = vmi->cpt_start_code; + mm->end_code = vmi->cpt_end_code; + mm->start_data = vmi->cpt_start_data; + mm->end_data = vmi->cpt_end_data; + mm->start_brk = vmi->cpt_start_brk; + mm->brk = vmi->cpt_brk; + mm->start_stack = vmi->cpt_start_stack; + mm->arg_start = vmi->cpt_start_arg; + mm->arg_end = vmi->cpt_end_arg; + mm->env_start = vmi->cpt_start_env; + mm->env_end = vmi->cpt_end_env; + mm->def_flags = 0; + def_flags = vmi->cpt_def_flags; + + mm->flags = vmi->cpt_dumpable; + if (ctx->image_version < CPT_VERSION_24) + mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS; + + mm->vps_dumpable = vmi->cpt_vps_dumpable; +#ifndef CONFIG_IA64 + if (ctx->image_version >= CPT_VERSION_9) { + mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso); + current_thread_info()->sysenter_return = + VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN); + } +#endif + +#if 0 /* def CONFIG_HUGETLB_PAGE*/ +/* NB: ? */ + int used_hugetlb; +#endif + up_write(&mm->mmap_sem); + + if (vmi->cpt_next > vmi->cpt_hdrlen) { + loff_t offset = pos + vmi->cpt_hdrlen; + do { + union { + struct cpt_vma_image vmai; + struct cpt_aio_ctx_image aioi; + struct cpt_obj_bits bits; + } u; + err = rst_get_object(-1, offset, &u, ctx); + if (err) + goto out; + if (u.vmai.cpt_object == CPT_OBJ_VMA) { +#ifdef CONFIG_IA64 + //// Later... 
+ if (u.vmai.cpt_start) +#endif + err = do_rst_vma(&u.vmai, offset, pos, ctx); + if (err) + goto out; +#ifdef CONFIG_X86 + } else if (u.bits.cpt_object == CPT_OBJ_BITS && + u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { + err = do_rst_ldt(&u.bits, offset, ctx); + if (err) + goto out; +#endif + } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { + err = do_rst_aio(&u.aioi, offset, ctx); + if (err) + goto out; + } else { + eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); + err = -EINVAL; + goto out; + } + offset += u.vmai.cpt_next; + } while (offset < pos + vmi->cpt_next); + } + + down_write(&mm->mmap_sem); + mm->def_flags = def_flags; + up_write(&mm->mmap_sem); + + +out: + return err; +} + +extern void exit_mm(struct task_struct * tsk); + +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err = 0; + cpt_object_t *mobj; + void *tmp = (void*)__get_free_page(GFP_KERNEL); + struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; + + if (!tmp) + return -ENOMEM; + + if (ti->cpt_mm == CPT_NULL) { + if (current->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + current); + exit_mm(current); + } + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + if (current->mm != mobj->o_obj) BUG(); + goto out; + } + + if (current->mm == NULL) { + struct mm_struct *mm = mm_alloc(); + if (mm == NULL) { + err = -ENOMEM; + goto out; + } + err = init_new_context(current, mm); + if (err) { + mmdrop(mm); + goto out; + } + current->mm = mm; + } + + if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) + goto out; + if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) { + eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm); + goto out; + } + err = -ENOMEM; + mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); + if (mobj != NULL) { + err = 0; + cpt_obj_setpos(mobj, ti->cpt_mm, ctx); + } + +out: + if (tmp) + free_page((unsigned long)tmp); + return err; +} + +/* This is part of mm setup, made in parent context. Mostly, it is the place, + * where we graft mm of another process to child. + */ + +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + cpt_object_t *mobj; + + /* Task without mm. Just get rid of this. */ + if (ti->cpt_mm == CPT_NULL) { + if (tsk->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + tsk); + mmput(tsk->mm); + tsk->mm = NULL; + } + return 0; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + struct mm_struct *newmm = mobj->o_obj; + /* Good, the MM is already created. */ + if (newmm == tsk->mm) { + /* Already done by clone(). */ + return 0; + } + mmput(tsk->mm); + atomic_inc(&newmm->mm_users); + tsk->mm = newmm; + tsk->active_mm = newmm; + } + return 0; +} + +/* We use CLONE_VM when mm of child is going to be shared with parent. + * Otherwise mm is copied. + */ + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + if (ti->cpt_mm == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) + return CLONE_VM; + return 0; +} diff --git a/kernel/cpt/rst_net.c b/kernel/cpt/rst_net.c new file mode 100644 index 0000000..699a052 --- /dev/null +++ b/kernel/cpt/rst_net.c @@ -0,0 +1,741 @@ +/* + * + * kernel/cpt/rst_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_net.h" +#include "cpt_files.h" + +#include "cpt_syscalls.h" + +extern struct in_ifaddr *inet_alloc_ifa(void); +extern int inet_insert_ifa(struct in_ifaddr *ifa); +extern struct in_device *inetdev_init(struct net_device *dev); + +int rst_restore_ifaddr(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_netns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ifaddr_image di; + struct net_device *dev; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int cindex = -1; + int err; + err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); + if (err) + return err; + cindex = di.cpt_index; + rtnl_lock(); + dev = __dev_get_by_index(net, cindex); + if (dev && di.cpt_family == AF_INET) { + struct in_device *in_dev; + struct in_ifaddr *ifa; + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + in_dev = inetdev_init(dev); + ifa = inet_alloc_ifa(); + if (ifa) { + ifa->ifa_local = di.cpt_address[0]; + ifa->ifa_address = di.cpt_peer[0]; + ifa->ifa_broadcast = di.cpt_broadcast[0]; + ifa->ifa_prefixlen = di.cpt_masklen; + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + ifa->ifa_flags = di.cpt_flags; + ifa->ifa_scope = di.cpt_scope; + memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + err = inet_insert_ifa(ifa); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + } else if (dev && di.cpt_family == AF_INET6) { + __u32 prefered_lft; + __u32 valid_lft; + struct net *net = get_exec_env()->ve_ns->net_ns; + prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ? + 0 : di.cpt_prefered_lft; + valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ? 
+ 0xFFFFFFFF : di.cpt_valid_lft; + err = inet6_addr_add(net, dev->ifindex, + (struct in6_addr *)di.cpt_address, + di.cpt_masklen, 0, + prefered_lft, + valid_lft); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } +#endif + } else { + rtnl_unlock(); + eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); + return -EINVAL; + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +} + +static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) +{ + int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + struct rtmsg *rtm = NLMSG_DATA(nlh); + __u32 prefix0 = 0; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(rta, attrlen)) { + if (rta->rta_type == RTA_DST) { + prefix0 = *(__u32*)RTA_DATA(rta); + } + rta = RTA_NEXT(rta, attrlen); + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (rtm->rtm_family == AF_INET6) { + if (rtm->rtm_type == RTN_LOCAL) + return 2; + if (rtm->rtm_flags & RTM_F_CLONED) + return 2; + if (rtm->rtm_protocol == RTPROT_UNSPEC || + rtm->rtm_protocol == RTPROT_RA || + rtm->rtm_protocol == RTPROT_REDIRECT || + rtm->rtm_protocol == RTPROT_KERNEL) + return 2; + if (rtm->rtm_protocol == RTPROT_BOOT && + ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || + (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) + return 2; + } +#endif + return rtm->rtm_protocol == RTPROT_KERNEL; +} + +int rst_restore_route(struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct sockaddr_nl nladdr; + mm_segment_t oldfs; + loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr v; + char *pg; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen >= h.cpt_next) + return 0; + + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); + if (err < 0) + return err; + + err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + endsec = sec + v.cpt_next; + sec += v.cpt_hdrlen; + + while (sec < endsec) { + struct nlmsghdr *n; + struct nlmsghdr nh; + int kernel_flag; + + if (endsec - sec < sizeof(nh)) + break; + + err = ctx->pread(&nh, sizeof(nh), ctx, sec); + if (err) + goto out_sock_pg; + if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE || + endsec - sec < nh.nlmsg_len) { + err = -EINVAL; + goto out_sock_pg; + } + err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); + if (err) + goto out_sock_pg; + + n = (struct nlmsghdr*)pg; + n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; + + err = rewrite_rtmsg(n, ctx); + if (err < 0) + goto out_sock_pg; + kernel_flag = err; + + if (kernel_flag == 2) + goto do_next; + + iov.iov_base=n; + iov.iov_len=nh.nlmsg_len; + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, nh.nlmsg_len); + set_fs(oldfs); + + if 
(err < 0) + goto out_sock_pg; + err = 0; + + iov.iov_base=pg; + iov.iov_len=PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + if (err != -EAGAIN) { + if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) && + n->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = NLMSG_DATA(n); + if (e->error != -EEXIST || !kernel_flag) + eprintk_ctx("NLMERR: %d\n", e->error); + } else { + eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); + } + } +do_next: + err = 0; + sec += NLMSG_ALIGN(nh.nlmsg_len); + } + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +int rst_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) +extern unsigned int tun_net_id; +#endif + +/* We do not restore skb queue, just reinit it */ +static int rst_restore_tuntap(loff_t start, struct cpt_netdev_image *di, + struct cpt_context *ctx) +{ + int err = -ENODEV; +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + struct cpt_tuntap_image ti; + struct net_device *dev; + struct file *bind_file = NULL; + struct net *net; + struct tun_struct *tun; + struct tun_net *tn; + loff_t pos; + + pos = start + di->cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_TUNTAP, pos, &ti, ctx); + if (err) + return err; + + pos += ti.cpt_next; + if (ti.cpt_bindfile) { + bind_file = rst_file(ti.cpt_bindfile, -1, ctx); + if (IS_ERR(bind_file)) { + eprintk_ctx("rst_restore_tuntap:" + "rst_file: %Ld\n", + (unsigned long long)ti.cpt_bindfile); + return PTR_ERR(bind_file); + } + } + + rtnl_lock(); + err = -ENOMEM; + dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup); + if (!dev) + goto out; + + tun = netdev_priv(dev); + + tun->dev = dev; + tun->owner = ti.cpt_owner; + tun->flags = ti.cpt_flags; + tun->attached = ti.cpt_attached; + tun_net_init(dev); + + tun->txflt.count = 0; + + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + eprintk_ctx("failed to register tun/tap net device\n"); + goto out; + } + if (pos < start + di->cpt_next) { + struct cpt_hwaddr_image hw; + /* Restore hardware address */ + err = rst_get_object(CPT_OBJ_NET_HWADDR, pos, + &hw, ctx); + if (err) + goto out; + BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr)); + memcpy(dev->dev_addr, hw.cpt_dev_addr, + sizeof(hw.cpt_dev_addr)); + } + net = get_exec_env()->ve_ns->net_ns; + tn = net_generic(net, tun_net_id); + list_add(&tun->list, &tn->dev_list); + + bind_file->private_data = tun; + tun->bind_file = bind_file; + +out: + fput(bind_file); + rtnl_unlock(); +#endif + return err; +} + +static int rst_restore_veth(loff_t pos, struct net_device *dev, + struct cpt_context *ctx) +{ + int err = -ENODEV; +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + struct cpt_veth_image vi; + struct veth_struct *veth; + + if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open)) { + eprintk_ctx("Module vzethdev is not loaded, " + "or device %s is not a veth device\n", dev->name); + return -EINVAL; + } + err = rst_get_object(CPT_OBJ_NET_VETH, pos, &vi, ctx); + if (err) + return err; + veth = veth_from_netdev(dev); + veth->allow_mac_change = vi.cpt_allow_mac_change; +#endif + return err; +} + +static int rst_restore_netstats(loff_t pos, struct net_device *dev, + struct cpt_context * ctx) +{ + struct cpt_netstats_image *n; + struct 
net_device_stats *stats = NULL; + struct net_device *lo = get_exec_env()->ve_netns->loopback_dev; + int err; + + if (!dev->get_stats) + return 0; + + n = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx); + if (err) + goto out; + BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen); + preempt_disable(); + if (dev == lo) + stats = &lo->stats; +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + else if (KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) + stats = veth_stats(dev, smp_processor_id()); +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + else if (dev == get_exec_env()->_venet_dev) + stats = venet_stats(dev, smp_processor_id()); +#endif +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + if (dev->open == tun_net_open) + stats = &dev->stats; +#endif + if (!stats) { + err = -ENODEV; + eprintk_ctx("Network device %s is not supported\n", dev->name); + goto out; + } + + stats->rx_packets = n->cpt_rx_packets; + stats->tx_packets = n->cpt_tx_packets; + stats->rx_bytes = n->cpt_rx_bytes; + stats->tx_bytes = n->cpt_tx_bytes; + stats->rx_errors = n->cpt_rx_errors; + stats->tx_errors = n->cpt_tx_errors; + stats->rx_dropped = n->cpt_rx_dropped; + stats->tx_dropped = n->cpt_tx_dropped; + stats->multicast = n->cpt_multicast; + stats->collisions = n->cpt_collisions; + stats->rx_length_errors = n->cpt_rx_length_errors; + stats->rx_over_errors = n->cpt_rx_over_errors; + stats->rx_crc_errors = n->cpt_rx_crc_errors; + stats->rx_frame_errors = n->cpt_rx_frame_errors; + stats->rx_fifo_errors = n->cpt_rx_fifo_errors; + stats->rx_missed_errors = n->cpt_rx_missed_errors; + stats->tx_aborted_errors = n->cpt_tx_aborted_errors; + stats->tx_carrier_errors = n->cpt_tx_carrier_errors; + stats->tx_fifo_errors = n->cpt_tx_fifo_errors; + stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors; + stats->tx_window_errors = n->cpt_tx_window_errors; + stats->rx_compressed = n->cpt_rx_compressed; + stats->tx_compressed = n->cpt_tx_compressed; + +out: + preempt_enable(); + cpt_release_buf(ctx); + return err; +} + +int rst_restore_netdev(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_netns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_netdev_image di; + struct net_device *dev; + + get_exec_env()->disable_net = 1; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + loff_t pos; + struct net_device *dev_new; + err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); + if (err) + return err; + + pos = sec + di.cpt_hdrlen; + if (di.cpt_next > sizeof(di)) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), + ctx, sec + di.cpt_hdrlen); + if (err) + return err; + if (hdr.cpt_object == CPT_OBJ_NET_TUNTAP) { + err = rst_restore_tuntap(sec, &di, ctx); + if (err) { + eprintk_ctx("restore tuntap %s: %d\n", + di.cpt_name, err); + return err; + } + pos += hdr.cpt_next; + } + } + + rtnl_lock(); + dev = __dev_get_by_name(net, di.cpt_name); + if (dev) { + if (dev->ifindex != di.cpt_index) { + dev_new = __dev_get_by_index(net, di.cpt_index); + if (!dev_new) { + write_lock_bh(&dev_base_lock); + hlist_del(&dev->index_hlist); + if (dev->iflink == dev->ifindex) + dev->iflink = di.cpt_index; + dev->ifindex = 
di.cpt_index; + hlist_add_head(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + } else { + write_lock_bh(&dev_base_lock); + hlist_del(&dev->index_hlist); + hlist_del(&dev_new->index_hlist); + if (dev_new->iflink == dev_new->ifindex) + dev_new->iflink = dev->ifindex; + dev_new->ifindex = dev->ifindex; + if (dev->iflink == dev->ifindex) + dev->iflink = di.cpt_index; + dev->ifindex = di.cpt_index; + hlist_add_head(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + hlist_add_head(&dev_new->index_hlist, + dev_index_hash(net, dev_new->ifindex)); + write_unlock_bh(&dev_base_lock); + } + } + if (di.cpt_flags^dev->flags) { + err = dev_change_flags(dev, di.cpt_flags); + if (err) + eprintk_ctx("dev_change_flags err: %d\n", err); + } + while (pos < sec + di.cpt_next) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), + ctx, pos); + if (err) + goto out; + if (hdr.cpt_object == CPT_OBJ_NET_VETH) { + err = rst_restore_veth(pos, dev, ctx); + if (err) { + eprintk_ctx("restore veth %s: %d\n", + di.cpt_name, err); + goto out; + } + } else if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) { + /* Restore hardware address */ + struct cpt_hwaddr_image hw; + err = rst_get_object(CPT_OBJ_NET_HWADDR, + pos, &hw, ctx); + if (err) + goto out; + BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != + sizeof(dev->dev_addr)); + memcpy(dev->dev_addr, hw.cpt_dev_addr, + sizeof(hw.cpt_dev_addr)); + } else if (hdr.cpt_object == CPT_OBJ_NET_STATS) { + err = rst_restore_netstats(pos, dev, ctx); + if (err) { + eprintk_ctx("rst stats %s: %d\n", + di.cpt_name, err); + goto out; + } + } + pos += hdr.cpt_next; + } + } else { + eprintk_ctx("unknown interface 2 %s\n", di.cpt_name); + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +out: + rtnl_unlock(); + return err; +} + +static int dumpfn(void *arg) +{ + int i; + int *pfd = arg; + char *argv[] = { "iptables-restore", "-c", NULL }; + + if (pfd[0] != 0) + sc_dup2(pfd[0], 0); + + for (i=1; i<current->files->fdt->max_fds; i++) + sc_close(i); + + module_put(THIS_MODULE); + + set_fs(KERNEL_DS); + i = sc_execve("/sbin/iptables-restore", argv, NULL); + if (i == -ENOENT) + i = sc_execve("/usr/sbin/iptables-restore", argv, NULL); + eprintk("failed to exec iptables-restore: %d\n", i); + return 255 << 8; +} + +static int rst_restore_iptables(struct cpt_context * ctx) +{ + int err; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + int n; + struct cpt_section_hdr h; + loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES]; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen == h.cpt_next) + return 0; + if (h.cpt_hdrlen > h.cpt_next) + return -EINVAL; + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("iptables local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = sec + v.cpt_hdrlen; + end = sec + v.cpt_next; + do { + char *p; + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n =
sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + if ((p = memchr(buf, 0, n)) != NULL) + n = p - buf; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("iptables-restore exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-restore terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +int rst_restore_net(struct cpt_context *ctx) +{ + int err; + + err = rst_restore_netdev(ctx); + if (!err) + err = rst_restore_ifaddr(ctx); + if (!err) + err = rst_restore_route(ctx); + if (!err) + err = rst_restore_iptables(ctx); + if (!err) + err = rst_restore_ip_conntrack(ctx); + return err; +} diff --git a/kernel/cpt/rst_proc.c b/kernel/cpt/rst_proc.c new file mode 100644 index 0000000..189649f --- /dev/null +++ b/kernel/cpt/rst_proc.c @@ -0,0 +1,580 @@ +/* + * + * kernel/cpt/rst_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + len += pagein_info_printf(buffer+len, ctx); +#endif + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void rst_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + rst_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + + rst_close_dumpfile(ctx); + + if (ctx->anonvmas) { + int h; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { + while (!hlist_empty(&ctx->anonvmas[h])) { + struct hlist_node *elem = ctx->anonvmas[h].first; + hlist_del(elem); + kfree(elem); + } + } + free_page((unsigned long)ctx->anonvmas); + } + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } +#ifdef CONFIG_VZ_CHECKPOINT_ITER + rst_drop_iter_dir(ctx); +#endif 
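/*
 * A minimal sketch of the section-walking pattern shared by the restore
 * entry points above (rst_root_namespace, rst_stray_files, rst_inotify,
 * rst_restore_ifaddr, rst_restore_netdev, rst_restore_iptables): read the
 * cpt_section_hdr, validate it, then iterate the objects it covers by their
 * own cpt_next lengths. The helpers and header fields (ctx->pread,
 * rst_get_object, ctx->sections[], cpt_object_hdr) are the ones used in this
 * patch; the generic callback and its arguments are illustrative assumptions,
 * not an interface defined here.
 */
static int rst_walk_section(int section, int objtype, void *buf,
			    int (*cb)(loff_t pos, void *buf, struct cpt_context *ctx),
			    struct cpt_context *ctx)
{
	struct cpt_section_hdr h;
	loff_t sec, endsec;
	int err;

	sec = ctx->sections[section];
	if (sec == CPT_NULL)			/* section absent from the image */
		return 0;

	err = ctx->pread(&h, sizeof(h), ctx, sec);
	if (err)
		return err;
	if (h.cpt_section != section || h.cpt_hdrlen < sizeof(h))
		return -EINVAL;			/* damaged or mismatched section header */

	endsec = sec + h.cpt_next;		/* cpt_next spans the whole section */
	sec += h.cpt_hdrlen;			/* objects start right after the header */
	while (sec < endsec) {
		struct cpt_object_hdr *o = buf;	/* every image begins with an object header */

		err = rst_get_object(objtype, sec, buf, ctx);
		if (err)
			return err;
		err = cb(sec, buf, ctx);	/* per-object restore work */
		if (err)
			return err;
		sec += o->cpt_next;		/* each object records its own length */
	}
	return 0;
}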
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); +#endif + if (ctx->filejob_queue) + rst_flush_filejobs(ctx); + if (ctx->vdso) + free_page((unsigned long)ctx->vdso); + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + rst_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * rst_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + rst_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +void rst_report_error(int err, cpt_context_t *ctx) +{ + if (ctx->statusfile) { + mm_segment_t oldfs; + int status = 7 /* VZ_ENVCREATE_ERROR */; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) + ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); + set_fs(oldfs); + fput(ctx->statusfile); + ctx->statusfile = NULL; + } +} + + +static cpt_context_t * cpt_context_lookup(unsigned int ctxid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == ctxid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + + unlock_kernel(); + + if (cmd == CPT_TEST_CAPS) { + err = test_cpu_caps(); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = rst_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + err = -EINVAL; + if (ctx->contextid && ctx->contextid != contextid) + goto out_nosem; + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + 
goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->read == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_PAGEIND: + err = rst_pageind(ctx); + break; +#endif +#ifdef CONFIG_VZ_CHECKPOINT_ITER + case CPT_ITER: + err = rst_iteration(ctx); + break; +#endif + case CPT_SET_LOCKFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->lockfile) + fput(ctx->lockfile); + ctx->lockfile = dfile; + break; + case CPT_SET_STATUSFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->statusfile) + fput(ctx->statusfile); + ctx->statusfile = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_UNDUMP: + if (ctx->ctx_state > 0) { + err = -ENOENT; + break; + } + ctx->ctx_state = CPT_CTX_UNDUMPING; + err = vps_rst_undump(ctx); + if (err) { + rst_report_error(err, ctx); + if (rst_kill(ctx) == 0) + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_UNDUMPED; + } + break; + case CPT_RESUME: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int rst_open(struct inode * inode, struct file * file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int rst_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); 
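+	/*
+	 * Editor's note, not part of the original patch: an illustrative
+	 * user-space sequence for the ioctl interface above, assuming a
+	 * descriptor opened on the /proc/rst entry created in init_rst()
+	 * below (the exact flow used by the restore tools may differ):
+	 *
+	 *	fd = open("/proc/rst", O_RDWR);
+	 *	ioctl(fd, CPT_SET_VEID, veid);
+	 *	ioctl(fd, CPT_SET_DUMPFD, dump_fd);	(image to restore)
+	 *	ioctl(fd, CPT_SET_ERRORFD, err_fd);	(optional error log)
+	 *	err = ioctl(fd, CPT_UNDUMP, 0);
+	 *	ioctl(fd, err ? CPT_KILL : CPT_RESUME, 0);
+	 *	close(fd);	(drops the context reference right here)
+	 */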
+ + + module_put(THIS_MODULE); + return 0; +} + +static struct file_operations rst_fops = +{ + .owner = THIS_MODULE, + .ioctl = rst_ioctl, + .open = rst_open, + .release = rst_release, +}; + + +static struct proc_dir_entry *proc_ent; +extern void *schedule_tail_p; +extern void schedule_tail_hook(void); + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .procname = "rst", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_rst(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = proc_create("rst", 0600, NULL, NULL); + if (!proc_ent) + goto err_out; + + rst_fops.read = proc_ent->proc_fops->read; + rst_fops.write = proc_ent->proc_fops->write; + rst_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &rst_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_rst); + +static void __exit exit_rst(void) +{ + remove_proc_entry("rst", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_rst); diff --git a/kernel/cpt/rst_process.c b/kernel/cpt/rst_process.c new file mode 100644 index 0000000..38e0c38 --- /dev/null +++ b/kernel/cpt/rst_process.c @@ -0,0 +1,1641 @@ +/* + * + * kernel/cpt/rst_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + + +#define HOOK_RESERVE 256 + +struct resume_info +{ + asmlinkage void (*hook)(struct resume_info *); + unsigned long hooks; +#define HOOK_TID 0 +#define HOOK_CONT 1 +#define HOOK_LSI 2 +#define HOOK_RESTART 3 + unsigned long tid_ptrs[2]; + siginfo_t last_siginfo; +}; + +#ifdef CONFIG_X86_32 + +#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) +#define IN_ERROR(regs) ((long)(regs)->ax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) +#define SYSCALL_RETVAL(regs) ((regs)->ax) +#define SYSCALL_NR(regs) ((regs)->orig_ax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ + (regs)->ip -= 2; } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +/* In new kernels task_pt_regs() is define to something inappropriate */ +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1) + +#elif defined(CONFIG_X86_64) + +#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) +#define IN_ERROR(regs) ((long)(regs)->ax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) +#define SYSCALL_RETVAL(regs) ((regs)->ax) +#define SYSCALL_NR(regs) ((regs)->orig_ax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ + (regs)->ip -= 2; } while (0) + +#define __NR32_restart_syscall 0 +#define __NR32_rt_sigtimedwait 177 +#define __NR32_pause 29 +#define __NR32_futex 240 + +#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR_##name) || \ + ((task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR32_##name)) + +#elif defined (CONFIG_IA64) + +#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0) +#define IN_ERROR(regs) ((long)(regs)->r10 == -1) +#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0) +#define SYSCALL_RETVAL(regs) ((regs)->r8) +#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? (regs)->r15 : -1) + +#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \ + (regs)->r10 = 0; \ + ia64_decrement_ip(regs); } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +#else + +#error This arch is not supported + +#endif + +#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs)) + +pid_t vpid_to_pid(pid_t nr) +{ + pid_t vnr; + struct pid *pid; + + rcu_read_lock(); + pid = find_vpid(nr); + vnr = (pid == NULL ? 
-1 : pid->numbers[0].nr); + rcu_read_unlock(); + return vnr; +} + +static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) +{ + memset(info, 0, sizeof(*info)); + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + info->si_tid = si->cpt_pid; + info->si_overrun = si->cpt_uid; + info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); + info->si_sys_private = si->cpt_utime; + break; + case __SI_POLL: + info->si_band = si->cpt_pid; + info->si_fd = si->cpt_uid; + break; + case __SI_FAULT: + info->si_addr = cpt_ptr_import(si->cpt_sigval); +#ifdef __ARCH_SI_TRAPNO + info->si_trapno = si->cpt_pid; +#endif + break; + case __SI_CHLD: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_status = si->cpt_sigval; + info->si_stime = si->cpt_stime; + info->si_utime = si->cpt_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_ptr = cpt_ptr_import(si->cpt_sigval); + break; + } + info->si_signo = si->cpt_signo; + info->si_errno = si->cpt_errno; + info->si_code = si->cpt_code; +} + +static int restore_sigqueue(struct task_struct *tsk, + struct sigpending *queue, unsigned long start, + unsigned long end) +{ + while (start < end) { + struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; + if (si->cpt_object == CPT_OBJ_SIGINFO) { + struct sigqueue *q = NULL; + struct user_struct *up; + + up = alloc_uid(get_exec_env()->ve_ns->user_ns, si->cpt_user); + if (!up) + return -ENOMEM; + q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); + if (!q) { + free_uid(up); + return -ENOMEM; + } + if (ub_siginfo_charge(q, get_exec_ub())) { + kmem_cache_free(sigqueue_cachep, q); + free_uid(up); + return -ENOMEM; + } + + INIT_LIST_HEAD(&q->list); + /* Preallocated elements (posix timers) are not + * supported yet. It is safe to replace them with + * a private one. 
*/ + q->flags = 0; + q->user = up; + atomic_inc(&q->user->sigpending); + + decode_siginfo(&q->info, si); + list_add_tail(&q->list, &queue->list); + } + start += si->cpt_next; + } + return 0; +} + +int rst_process_linkage(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (tsk == NULL) { + eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EINVAL; + } + + if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) { + struct pid *pid; + + rcu_read_lock(); + pid = find_vpid(ti->cpt_pgrp); + if (!pid) { + eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (task_pgrp_nr(tsk) != pid_nr(pid)) { + detach_pid(tsk, PIDTYPE_PGID); + set_task_pgrp(tsk, pid_nr(pid)); + if (thread_group_leader(tsk)) + attach_pid(tsk, PIDTYPE_PGID, pid); + } + write_unlock_irq(&tasklist_lock); + if (task_pgrp_nr(tsk) != pid_nr(pid)) { + eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + rcu_read_unlock(); + } + if (task_session_vnr(tsk) != ti->cpt_session) { + struct pid *pid; + + rcu_read_lock(); + pid = find_vpid(ti->cpt_session); + if (!pid) { + eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (task_session_nr(tsk) != pid_nr(pid)) { + detach_pid(tsk, PIDTYPE_SID); + set_task_session(tsk, pid_nr(pid)); + if (thread_group_leader(tsk)) + attach_pid(tsk, PIDTYPE_SID, pid); + } + write_unlock_irq(&tasklist_lock); + if (task_session_nr(tsk) != pid_nr(pid)) { + eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + rcu_read_unlock(); + } + if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) { + struct pid *pid; + + rcu_read_lock(); + pid = get_pid(find_vpid(ti->cpt_old_pgrp)); + if (!pid) { + eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + tsk->signal->tty_old_pgrp = pid; + rcu_read_unlock(); + } + } + + return 0; +} + +struct pid *alloc_vpid_safe(pid_t vnr) +{ + struct pid *pid; + + pid = alloc_pid(current->nsproxy->pid_ns, vnr); + if (!pid) + pid = find_vpid(vnr); + return pid; +} + +static int +restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx) +{ + int err; + struct cpt_signal_image *si = cpt_get_buf(ctx); + + current->signal->tty = NULL; + + err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (task_pgrp_vnr(current) != si->cpt_pgrp) { + struct pid * pid = NULL, *free = NULL; + + rcu_read_lock(); + if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_pgrp)) { + eprintk_ctx("external process group " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_pgrp); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid != NULL) { + if (task_pgrp_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_PGID); + set_task_pgrp(current, pid_nr(pid)); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_PGID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + rcu_read_unlock(); + } + + current->signal->tty_old_pgrp = NULL; + if ((int)si->cpt_old_pgrp > 0) { + if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { + current->signal->tty_old_pgrp = + alloc_pid(current->nsproxy->pid_ns, 0); + if 
(!current->signal->tty_old_pgrp) { + eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } else { + rcu_read_lock(); + current->signal->tty_old_pgrp = + get_pid(alloc_vpid_safe(si->cpt_old_pgrp)); + rcu_read_unlock(); + if (!current->signal->tty_old_pgrp) { + dprintk_ctx("forward old tty PGID\n"); + current->signal->tty_old_pgrp = NULL; + } + } + } + + if (task_session_vnr(current) != si->cpt_session) { + struct pid * pid = NULL, *free = NULL; + + rcu_read_lock(); + if (si->cpt_session_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_session)) { + eprintk_ctx("external process session " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_session); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid == NULL) + pid = find_vpid(si->cpt_session); + if (pid != NULL) { + if (task_session_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_SID); + set_task_session(current, pid_nr(pid)); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_SID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + rcu_read_unlock(); + } + + cpt_sigset_import(¤t->signal->shared_pending.signal, si->cpt_sigpending); + current->signal->leader = si->cpt_leader; + if (si->cpt_ctty != CPT_NULL) { + cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); + if (obj) { + struct tty_struct *tty = obj->o_obj; + if (!tty->session || tty->session == + task_session(current)) { + tty->session = task_session(current); + current->signal->tty = tty; + } else { + wprintk_ctx("tty session mismatch\n"); + } + } + } + + if (si->cpt_curr_target) + current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target); + current->signal->flags = 0; + *exiting = si->cpt_group_exit; + current->signal->group_exit_code = si->cpt_group_exit_code; + if (si->cpt_group_exit_task) { + current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task); + if (current->signal->group_exit_task == NULL) { + eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); + cpt_release_buf(ctx); + return -EINVAL; + } + } + current->signal->notify_count = si->cpt_notify_count; + current->signal->group_stop_count = si->cpt_group_stop_count; + + if (si->cpt_next > si->cpt_hdrlen) { + char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL); + if (buf == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx, + ti->cpt_signal + si->cpt_hdrlen); + if (err) { + kfree(buf); + cpt_release_buf(ctx); + return err; + } + restore_sigqueue(current, + ¤t->signal->shared_pending, (unsigned long)buf, + (unsigned long)buf + si->cpt_next - si->cpt_hdrlen); + kfree(buf); + } + cpt_release_buf(ctx); + return 0; +} + +int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err; + struct cpt_sighand_image si; + int i; + loff_t pos, endpos; + + err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx); + if (err) + return err; + + for (i=0; i<_NSIG; i++) { + current->sighand->action[i].sa.sa_handler = SIG_DFL; +#ifndef CONFIG_IA64 + current->sighand->action[i].sa.sa_restorer = 0; +#endif + current->sighand->action[i].sa.sa_flags = 0; + memset(¤t->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t)); + } + + pos = ti->cpt_sighand + si.cpt_hdrlen; + endpos = ti->cpt_sighand + si.cpt_next; + while (pos < endpos) { 
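+		/*
+		 * Editor's note, not part of the original patch: image records
+		 * are chained through their headers - cpt_hdrlen is the size of
+		 * the fixed part, cpt_next the offset to the next record - so
+		 * the reader loops in this file all share the shape:
+		 *
+		 *	pos = parent_pos + parent.cpt_hdrlen;
+		 *	end = parent_pos + parent.cpt_next;
+		 *	while (pos < end) {
+		 *		rst_get_object(TYPE, pos, &obj, ctx);
+		 *		... consume obj ...
+		 *		pos += obj.cpt_next;
+		 *	}
+		 */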
+ struct cpt_sighandler_image shi; + + err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx); + if (err) + return err; + current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler; +#ifndef CONFIG_IA64 + current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer; +#endif + current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags; + cpt_sigset_import(¤t->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask); + pos += shi.cpt_next; + } + + return 0; +} + + +__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) + flag |= CLONE_THREAD; + if (ti->cpt_sighand == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) + flag |= CLONE_SIGHAND; + return flag; +} + +int +rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx) +{ + int err; + cpt_object_t *obj; + + if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); + if (obj) { + struct sighand_struct *sig = current->sighand; + if (obj->o_obj != sig) { + return -EINVAL; + } + } else { + obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_sighand, ctx); + err = restore_one_sighand_struct(ti, ctx); + if (err) + return err; + } + + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); + if (obj) { + struct signal_struct *sig = current->signal; + if (obj->o_obj != sig) { + return -EINVAL; + } +/* if (current->signal) { + pid_t session; + + session = process_session(current); + set_process_vgroup(current, session); + set_signal_vsession(current->signal, session); + }*/ + } else { + obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_signal, ctx); + err = restore_one_signal_struct(ti, exiting, ctx); + if (err) + return err; + } + + return 0; +} + +#ifdef CONFIG_X86 +static u32 decode_segment(u32 segid) +{ + if (segid == CPT_SEG_ZERO) + return 0; + + /* TLS descriptors */ + if (segid <= CPT_SEG_TLS3) + return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; + + /* LDT descriptor, it is just an index to LDT array */ + if (segid >= CPT_SEG_LDT) + return ((segid - CPT_SEG_LDT) << 3) | 7; + + /* Check for one of standard descriptors */ +#ifdef CONFIG_X86_64 + if (segid == CPT_SEG_USER32_DS) + return __USER32_DS; + if (segid == CPT_SEG_USER32_CS) + return __USER32_CS; + if (segid == CPT_SEG_USER64_DS) + return __USER_DS; + if (segid == CPT_SEG_USER64_CS) + return __USER_CS; +#else + if (segid == CPT_SEG_USER32_DS) + return __USER_DS; + if (segid == CPT_SEG_USER32_CS) + return __USER_CS; +#endif + wprintk("Invalid segment reg %d\n", segid); + return 0; +} +#endif + +#if defined (CONFIG_IA64) +void ia64_decrement_ip (struct pt_regs *regs) +{ + unsigned long w0, ri = ia64_psr(regs)->ri - 1; + + if (ia64_psr(regs)->ri == 0) { + regs->cr_iip -= 16; + ri = 2; + get_user(w0, (char __user *) regs->cr_iip + 0); + if (((w0 >> 1) & 0xf) == 2) { + /* + * rfi'ing to slot 2 of an MLX bundle causes + * an illegal operation fault. We don't want + * that to happen... 
+ */ + ri = 1; + } + } + ia64_psr(regs)->ri = ri; +} +#endif + +static void rst_child_tid(unsigned long *child_tids) +{ + dprintk("rct: " CPT_FID "\n", CPT_TID(current)); + current->clear_child_tid = (void*)child_tids[0]; + current->set_child_tid = (void*)child_tids[1]; +} + +static void rst_last_siginfo(void) +{ + int signr; + siginfo_t *info = current->last_siginfo; + struct pt_regs *regs = task_pt_regs(current); + struct k_sigaction *ka; + int ptrace_id; + + dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); + + spin_lock_irq(¤t->sighand->siglock); + current->last_siginfo = NULL; + recalc_sigpending(); + + ptrace_id = current->pn_state; + clear_pn_state(current); + + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + /* frame_*signal */ + dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n", + task_pid_vnr(current), current->pid, current->comm, + info->si_signo, info->si_code, + current->exit_code, SYSCALL_NR(regs), + current->ptrace, current->ptrace_message); + goto out; + case PN_STOP_ENTRY: + case PN_STOP_LEAVE: + /* do_syscall_trace */ + spin_unlock_irq(¤t->sighand->siglock); + dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + if (IN_SYSCALL(regs)) { + if (ptrace_id == PN_STOP_ENTRY +#ifdef CONFIG_X86 + && SYSCALL_ERRNO(regs) == ENOSYS +#endif + ) + SYSCALL_RESTART(regs); + else if (IN_ERROR(regs) && + syscall_is(current, regs, rt_sigtimedwait) && + (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR)) + SYSCALL_RESTART(regs); + } + return; + case PN_STOP_FORK: + /* fork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_VFORK: + /* after vfork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_SIGNAL: + /* normal case : dequeue signal */ + break; + case PN_STOP_EXIT: + dprintk("ptrace exit caught\n"); + current->ptrace &= ~PT_TRACE_EXIT; + spin_unlock_irq(¤t->sighand->siglock); + module_put(THIS_MODULE); + complete_and_exit(NULL, current->ptrace_message); + BUG(); + case PN_STOP_EXEC: + eprintk("ptrace after exec caught: must not happen\n"); + BUG(); + default: + eprintk("ptrace with unknown identity %d\n", ptrace_id); + BUG(); + } + + signr = current->exit_code; + if (signr == 0) { + dprintk("rlsi: canceled signal %d\n", info->si_signo); + goto out; + } + current->exit_code = 0; + + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + dprintk("going to requeue signal %d\n", signr); + goto out_resend_sig; + } + + ka = ¤t->sighand->action[signr-1]; + if (ka->sa.sa_handler == SIG_IGN) { + dprintk("going to resend signal %d (ignored)\n", signr); + goto out; + } + if (ka->sa.sa_handler != SIG_DFL) { + dprintk("going to resend signal %d (not SIG_DFL)\n", signr); + goto out_resend_sig; + } + if (signr == SIGCONT || + signr == SIGCHLD || + signr == SIGWINCH || + signr == SIGURG || + current->pid == 1) + goto out; + + /* All the rest, which we cannot handle are requeued. 
*/ + dprintk("going to resend signal %d (sigh)\n", signr); +out_resend_sig: + spin_unlock_irq(¤t->sighand->siglock); + send_sig_info(signr, info, current); + return; + +out: + spin_unlock_irq(¤t->sighand->siglock); +} + +static void rst_finish_stop(void) +{ + /* ... + * do_signal() -> + * get_signal_to_deliver() -> + * do_signal_stop() -> + * finish_stop() + * + * Normally after SIGCONT it will dequeue the next signal. If no signal + * is found, do_signal restarts syscall unconditionally. + * Otherwise signal handler is pushed on user stack. + */ + + dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); + + clear_stop_state(current); + current->exit_code = 0; +} + +static void rst_restart_sys(void) +{ + struct pt_regs *regs = task_pt_regs(current); + + /* This hook is supposed to be executed, when we have + * to complete some interrupted syscall. + */ + dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); + + if (!IN_SYSCALL(regs) || !IN_ERROR(regs)) + return; + +#ifdef __NR_pause + if (syscall_is(current,regs,pause)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#else + /* On this arch pause() is simulated with sigsuspend(). */ + if (syscall_is(current,regs,rt_sigsuspend)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#endif + if (syscall_is(current,regs,rt_sigtimedwait)) { + if (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR) { + SYSCALL_RESTART(regs); + } + } else if (syscall_is(current,regs,futex)) { + if (SYSCALL_ERRNO(regs) == EINTR && + !signal_pending(current)) { + SYSCALL_RESTART(regs); + } + } + + if (!signal_pending(current) && + !current_thread_info()->status & TS_RESTORE_SIGMASK) { + if (SYSCALL_ERRNO(regs) == ERESTARTSYS || + SYSCALL_ERRNO(regs) == ERESTARTNOINTR || + SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + SYSCALL_RESTART(regs); + } else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) { + int new = __NR_restart_syscall; +#ifdef CONFIG_X86_64 + if (task_thread_info(current)->flags&_TIF_IA32) + new = __NR32_restart_syscall; +#endif + SYSCALL_RESTART2(regs, new); + } + } +} + +#ifdef CONFIG_X86_32 + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_x86_regs *b, + struct resume_info **rip, struct cpt_context *ctx) +{ + extern char i386_ret_from_resume; + + if (b->cpt_object != CPT_OBJ_X86_REGS) + return -EINVAL; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + tsk->thread.ip = (unsigned long) &i386_ret_from_resume; + + tsk->thread.gs = decode_segment(b->cpt_gs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + regs->bx = b->cpt_ebx; + regs->cx = b->cpt_ecx; + regs->dx = b->cpt_edx; + regs->si = b->cpt_esi; + regs->di = b->cpt_edi; + regs->bp = b->cpt_ebp; + regs->ax = b->cpt_eax; + regs->ds = b->cpt_xds; + regs->es = b->cpt_xes; + regs->orig_ax = b->cpt_orig_eax; + regs->ip = b->cpt_eip; + regs->cs = b->cpt_xcs; + regs->flags = b->cpt_eflags; + regs->sp = b->cpt_esp; + regs->ss = b->cpt_xss; + + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + regs->ds = decode_segment(b->cpt_xds); + regs->es = decode_segment(b->cpt_xes); + regs->fs = decode_segment(b->cpt_fs); + 
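+	/*
+	 * Editor's note, not part of the original patch: the HOOK_RESERVE
+	 * area carved out below lives on the kernel stack just under
+	 * pt_regs and holds the struct resume_info that rst_restore_process()
+	 * fills in; the i386_ret_from_resume stub (assembly, not shown here)
+	 * is expected to call ri->hook(ri), i.e. rst_resume_work(), before
+	 * the first return to user space.  Resulting layout, top of the
+	 * kernel stack downwards:
+	 *
+	 *	thread.sp0			(== regs + 1)
+	 *	[ struct pt_regs      ]		regs = task_pt_regs(tsk)
+	 *	[ HOOK_RESERVE bytes  ]		*rip, struct resume_info
+	 *	thread.sp			(== (char *)regs - HOOK_RESERVE)
+	 */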
+ tsk->thread.sp -= HOOK_RESERVE; + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.sp; + + return 0; +} + +#elif defined(CONFIG_X86_64) + +static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) +{ + memset(d, 0, sizeof(struct pt_regs)); + d->bp = s->cpt_ebp; + d->bx = s->cpt_ebx; + d->ax = (s32)s->cpt_eax; + d->cx = s->cpt_ecx; + d->dx = s->cpt_edx; + d->si = s->cpt_esi; + d->di = s->cpt_edi; + d->orig_ax = (s32)s->cpt_orig_eax; + d->ip = s->cpt_eip; + d->cs = s->cpt_xcs; + d->flags = s->cpt_eflags; + d->sp = s->cpt_esp; + d->ss = s->cpt_xss; +} + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_obj_bits *hdr, + struct resume_info **rip, struct cpt_context *ctx) +{ + if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { + struct cpt_x86_64_regs *b = (void*)hdr; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + + tsk->thread.fs = b->cpt_fsbase; + tsk->thread.gs = b->cpt_gsbase; + tsk->thread.fsindex = decode_segment(b->cpt_fsindex); + tsk->thread.gsindex = decode_segment(b->cpt_gsindex); + tsk->thread.ds = decode_segment(b->cpt_ds); + tsk->thread.es = decode_segment(b->cpt_es); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); + + tsk->thread.usersp = regs->sp; + regs->cs = decode_segment(b->cpt_cs); + regs->ss = decode_segment(b->cpt_ss); + } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { + struct cpt_x86_regs *b = (void*)hdr; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + + tsk->thread.fs = 0; + tsk->thread.gs = 0; + tsk->thread.fsindex = decode_segment(b->cpt_fs); + tsk->thread.gsindex = decode_segment(b->cpt_gs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + xlate_ptregs_32_to_64(regs, b); + + tsk->thread.usersp = regs->sp; + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + tsk->thread.ds = decode_segment(b->cpt_xds); + tsk->thread.es = decode_segment(b->cpt_xes); + } else { + return -EINVAL; + } + + tsk->thread.sp -= HOOK_RESERVE; + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.sp; + return 0; +} + +#elif defined(CONFIG_IA64) + +#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ + +#define PUT_BITS(first, last, nat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotl(nat & mask, dist); \ + }) + +unsigned long +ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. If the register order in + * struct_pt_regs changes, this code MUST be updated. 
+ */ + scratch_unat = PUT_BITS( 1, 1, nat); + scratch_unat |= PUT_BITS( 2, 3, nat); + scratch_unat |= PUT_BITS(12, 13, nat); + scratch_unat |= PUT_BITS(14, 14, nat); + scratch_unat |= PUT_BITS(15, 15, nat); + scratch_unat |= PUT_BITS( 8, 11, nat); + scratch_unat |= PUT_BITS(16, 31, nat); + + return scratch_unat; + +} + +static unsigned long +ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + scratch_unat = PUT_BITS( 4, 7, nat); + + return scratch_unat; + +} + +#undef PUT_BITS + + +static int restore_registers(struct task_struct *tsk, struct pt_regs *pt, + struct cpt_task_image *ti, + struct cpt_ia64_regs *r, + struct resume_info **rip, + struct cpt_context *ctx) +{ + extern char ia64_ret_from_resume; + struct switch_stack *sw; + struct resume_info *ri; + struct ia64_psr *psr = ia64_psr(pt); + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (r->cpt_object != CPT_OBJ_IA64_REGS) + return -EINVAL; + + if (r->num_regs > 96) { + eprintk(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + *rip = ri = ((void*)pt) - HOOK_RESERVE; + sw = ((struct switch_stack *) ri) - 1; + + memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack)); + memset(ri, 0, HOOK_RESERVE); + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&pt->r1, &r->gr[1], 8*(2-1)); + memcpy(&pt->r2, &r->gr[2], 8*(4-2)); + memcpy(&pt->r8, &r->gr[8], 8*(12-8)); + memcpy(&pt->r12, &r->gr[12], 8*(14-12)); + memcpy(&pt->r14, &r->gr[14], 8*(15-14)); + memcpy(&pt->r15, &r->gr[15], 8*(16-15)); + memcpy(&pt->r16, &r->gr[16], 8*(32-16)); + + pt->b0 = r->br[0]; + pt->b6 = r->br[6]; + pt->b7 = r->br[7]; + + pt->ar_bspstore = r->ar_bspstore; + pt->ar_unat = r->ar_unat; + pt->ar_pfs = r->ar_pfs; + pt->ar_ccv = r->ar_ccv; + pt->ar_fpsr = r->ar_fpsr; + pt->ar_csd = r->ar_csd; + pt->ar_ssd = r->ar_ssd; + pt->ar_rsc = r->ar_rsc; + + pt->cr_iip = r->cr_iip; + pt->cr_ipsr = r->cr_ipsr; + + pt->pr = r->pr; + + pt->cr_ifs = r->cfm; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&pt->f6, &r->fr[2*6], 16*(10-6)); + memcpy(&pt->f10, &r->fr[2*10], 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&sw->f12, &r->fr[2*12], 16*(16-12)); + /* fpregs 32...127 */ + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32)); + ia64_drop_fpu(tsk); + psr->dfh = 1; + + memcpy(&sw->r4, &r->gr[4], 8*(8-4)); + memcpy(&sw->b1, &r->br[1], 8*(6-1)); + sw->ar_lc = r->ar_lc; + + memcpy(&sw->f2, &r->fr[2*2], 16*(6-2)); + memcpy(&sw->f16, &r->fr[2*16], 16*(32-16)); + + sw->caller_unat = 0; + sw->ar_fpsr = pt->ar_fpsr; + sw->ar_unat = 0; + if (r->nat[0] & 0xFFFFFF0FUL) + sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]); + if (r->nat[0] & 0xF0) + sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]); + + sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs); + memset(krbs, 0, (void*)sw->ar_bspstore - krbs); + sw->ar_rnat = 0; + sw->ar_pfs = 0; + + /* This is tricky. When we are in syscall, we have frame + * of output register (sometimes, plus one input reg sometimes). + * It is not so easy to restore such frame, RSE optimizes + * and does not fetch those regs from backstore. So, we restore + * the whole frame as local registers, and then repartition it + * in ia64_ret_from_resume(). 
+ */ + if ((long)pt->cr_ifs >= 0) { + unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F); + sw->ar_pfs = out | (out<<7); + } + if (r->ar_ec) + sw->ar_pfs |= (r->ar_ec & 0x3F) << 52; + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp; + unsigned long set_rnat = 0; + + *ptr = r->gr[32+reg]; + + if (reg < 32) + set_rnat = (r->nat[0] & (1UL<<(reg+32))); + else + set_rnat = (r->nat[1] & (1UL<<(reg-32))); + + if (set_rnat) { + rnatp = ia64_rse_rnat_addr(ptr); + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = &sw->ar_rnat; + *rnatp |= (1UL<b0 = (unsigned long) &ia64_ret_from_resume; + tsk->thread.ksp = (unsigned long) sw - 16; + +#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ +#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ +#define PRED_USER_STACK 3 /* returning to user-stacks? */ +#define PRED_SYSCALL 4 /* inside a system call? */ +#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ + + pt->loadrs = r->loadrs; + sw->pr = 0; + sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL); + sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL)); + sw->pr &= ~(1UL << PRED_KERNEL_STACK); + sw->pr |= (1UL << PRED_USER_STACK); + if ((long)pt->cr_ifs < 0) { + sw->pr |= (1UL << PRED_NON_SYSCALL); + } else { + sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); + } + + return 0; +} +#endif + +asmlinkage void rst_resume_work(struct resume_info *ri) +{ + if (ri->hooks & (1<tid_ptrs); + if (ri->hooks & (1<hooks & (1<hooks & (1<thread.xstate->fxsave.mxcsr &= 0x0000ffbf; +#endif +} + +#ifdef CONFIG_X86 +#include +#endif + +int rst_restore_process(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + struct pt_regs * regs; + struct cpt_object_hdr *b; + struct cpt_siginfo_image *lsi = NULL; + struct group_info *gids, *ogids; + struct resume_info *ri = NULL; + int i; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + struct user_beancounter *new_bc, *old_bc; +#endif + + if (tsk == NULL) { + eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EFAULT; + } + + wait_task_inactive(tsk, 0); +#ifdef CONFIG_BEANCOUNTERS + tbc = &tsk->task_bc; + new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTTSK, new_bc); + if (err & NOTIFY_FAIL) { + put_beancounter(new_bc); + return -ECHRNG; + } + old_bc = tbc->exec_ub; + if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) { + dprintk(" *** replacing ub %p by %p for %p (%d %s)\n", + old_bc, new_bc, tsk, + tsk->pid, tsk->comm); + tbc->exec_ub = new_bc; + new_bc = old_bc; + } + put_beancounter(new_bc); +#endif + regs = task_pt_regs(tsk); + + if (!tsk->exit_state) { + tsk->lock_depth = -1; +#ifdef CONFIG_PREEMPT + task_thread_info(tsk)->preempt_count--; +#endif + } + + if (tsk->static_prio != ti->cpt_static_prio) + set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio)); + + cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); + cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); + cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked); + cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); + + tsk->uid = ti->cpt_uid; + tsk->euid = ti->cpt_euid; + tsk->suid = ti->cpt_suid; + tsk->fsuid = ti->cpt_fsuid; + tsk->gid = ti->cpt_gid; + tsk->egid = ti->cpt_egid; + tsk->sgid = ti->cpt_sgid; + tsk->fsgid = 
ti->cpt_fsgid; +#ifdef CONFIG_IA64 + SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac); + SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu); +#endif + memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective)); + memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable)); + memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted)); + if (ctx->image_version < CPT_VERSION_26) + tsk->securebits = (ti->cpt_keepcap != 0) ? + issecure_mask(SECURE_KEEP_CAPS) : 0; + else + tsk->securebits = ti->cpt_keepcap; + tsk->did_exec = (ti->cpt_did_exec != 0); + gids = groups_alloc(ti->cpt_ngids); + ogids = tsk->group_info; + if (gids) { + int i; + for (i=0; i<32; i++) + gids->small_block[i] = ti->cpt_gids[i]; + tsk->group_info = gids; + } + if (ogids) + put_group_info(ogids); + tsk->utime = ti->cpt_utime; + tsk->stime = ti->cpt_stime; + if (ctx->image_version == CPT_VERSION_8) + tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); + else + cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); + _set_normalized_timespec(&tsk->start_time, + tsk->start_time.tv_sec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec, + tsk->start_time.tv_nsec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec); + + tsk->nvcsw = ti->cpt_nvcsw; + tsk->nivcsw = ti->cpt_nivcsw; + tsk->min_flt = ti->cpt_min_flt; + tsk->maj_flt = ti->cpt_maj_flt; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) + tsk->cutime = ti->cpt_cutime; + tsk->cstime = ti->cpt_cstime; + tsk->cnvcsw = ti->cpt_cnvcsw; + tsk->cnivcsw = ti->cpt_cnivcsw; + tsk->cmin_flt = ti->cpt_cmin_flt; + tsk->cmaj_flt = ti->cpt_cmaj_flt; + + BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS); + + for (i=0; irlim[i].rlim_cur = ti->cpt_rlim_cur[i]; + tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; + } +#else + if (thread_group_leader(tsk) && tsk->signal) { + tsk->signal->utime = ti->cpt_utime; + tsk->signal->stime = ti->cpt_stime; + tsk->signal->cutime = ti->cpt_cutime; + tsk->signal->cstime = ti->cpt_cstime; + tsk->signal->nvcsw = ti->cpt_nvcsw; + tsk->signal->nivcsw = ti->cpt_nivcsw; + tsk->signal->cnvcsw = ti->cpt_cnvcsw; + tsk->signal->cnivcsw = ti->cpt_cnivcsw; + tsk->signal->min_flt = ti->cpt_min_flt; + tsk->signal->maj_flt = ti->cpt_maj_flt; + tsk->signal->cmin_flt = ti->cpt_cmin_flt; + tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; + + for (i=0; isignal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; + tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; + } + } +#endif + +#ifdef CONFIG_X86 + for (i=0; i<3; i++) { + if (i >= GDT_ENTRY_TLS_ENTRIES) { + eprintk_ctx("too many tls descs\n"); + } else { + tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; + tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; + } + } +#endif + + clear_stopped_child_used_math(tsk); + + b = (void *)(ti+1); + while ((void*)b < ((void*)ti) + ti->cpt_next) { + /* Siginfo objects are at the end of obj array */ + if (b->cpt_object == CPT_OBJ_SIGINFO) { + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); + set_exec_env(env); + break; + } + + switch (b->cpt_object) { +#ifdef CONFIG_X86 + case CPT_OBJ_BITS: + if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && + cpu_has_fxsr) { + if (init_fpu(tsk)) + return -ENOMEM; + memcpy(tsk->thread.xstate, + (void*)b + b->cpt_hdrlen, + sizeof(struct i387_fxsave_struct)); + rst_apply_mxcsr_mask(tsk); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#ifndef CONFIG_X86_64 + else if (b->cpt_content == 
CPT_CONTENT_X86_FPUSTATE_OLD && + !cpu_has_fxsr) { + if (init_fpu(tsk)) + return -ENOMEM; + memcpy(tsk->thread.xstate, + (void*)b + b->cpt_hdrlen, + sizeof(struct i387_fsave_struct)); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#endif + break; +#endif + case CPT_OBJ_LASTSIGINFO: + lsi = (void*)b; + break; + case CPT_OBJ_X86_REGS: + case CPT_OBJ_X86_64_REGS: + case CPT_OBJ_IA64_REGS: + if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) { + eprintk_ctx("cannot restore registers: image is corrupted\n"); + return -EINVAL; + } + break; + case CPT_OBJ_SIGALTSTACK: { + struct cpt_sigaltstack_image *sas; + sas = (struct cpt_sigaltstack_image *)b; + tsk->sas_ss_sp = sas->cpt_stack; + tsk->sas_ss_size = sas->cpt_stacksize; + break; + } + case CPT_OBJ_TASK_AUX: { + struct cpt_task_aux_image *ai; + ai = (struct cpt_task_aux_image *)b; + tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list); +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags&_TIF_IA32) { + tsk->robust_list = (void __user *)NULL; + tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list); + } +#endif +#endif + break; + } + } + b = ((void*)b) + b->cpt_next; + } + + if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + eprintk_ctx("missing register info\n"); + return -EINVAL; + } + + if (ti->cpt_ppid != ti->cpt_rppid) { + struct task_struct *parent; + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + write_lock_irq(&tasklist_lock); + parent = find_task_by_vpid(ti->cpt_ppid); + if (parent && parent != tsk->parent) { + list_add(&tsk->ptrace_entry, &tsk->parent->ptraced); + /* + * Ptraced kids are no longer in the parent children + * remove_parent(tsk); + * tsk->parent = parent; + * add_parent(tsk); + */ + } + write_unlock_irq(&tasklist_lock); + set_exec_env(env); + } + + tsk->ptrace_message = ti->cpt_ptrace_message; + tsk->pn_state = ti->cpt_pn_state; + tsk->stopped_state = ti->cpt_stopped_state; + task_thread_info(tsk)->flags = ti->cpt_thrflags; + + /* The image was created with kernel < 2.6.16, while + * task hanged in sigsuspend -> do_signal. + * + * FIXME! This needs more brain efforts... 
+ */ + if (ti->cpt_sigsuspend_state) { + set_restore_sigmask(); + } + +#ifdef CONFIG_X86_64 + task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME; + if (!ti->cpt_64bit) + task_thread_info(tsk)->flags |= _TIF_IA32; +#endif + +#ifdef CONFIG_X86_32 + do { + if (regs->orig_ax == __NR__newselect && regs->di) { + struct timeval tv; + if (access_process_vm(tsk, regs->di, &tv, + sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + regs->di); + break; + } + dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, regs->di, &tv, + sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di); + } + + } else if (regs->orig_ax == __NR_select && regs->di) { + struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; + } a; + struct timeval tv; + if (access_process_vm(tsk, regs->bx, &a, + sizeof(a), 0) != sizeof(a)) { + wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); + break; + } + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); + break; + } + dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); + } + } + } while (0); +#endif + + if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) { + switch (SYSCALL_ERRNO(regs)) { + case ERESTARTSYS: + case ERESTARTNOINTR: + case ERESTARTNOHAND: + case ERESTART_RESTARTBLOCK: + case EAGAIN: + case EINTR: + ri->hooks |= (1<pn_state)) { + /* ... -> ptrace_notify() + * or + * ... 
-> do_signal() -> get_signal_to_deliver() -> + * ptrace stop + */ + tsk->last_siginfo = &ri->last_siginfo; + ri->hooks |= (1<last_siginfo, lsi); + } + + tsk->ptrace = ti->cpt_ptrace; + tsk->flags = ti->cpt_flags & ~PF_FROZEN; + clear_tsk_thread_flag(tsk, TIF_FREEZE); + tsk->exit_signal = ti->cpt_exit_signal; + + if (ri && tsk->stopped_state) { + dprintk_ctx("finish_stop\n"); + if (ti->cpt_state != TASK_STOPPED) + eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); + ri->hooks |= (1<cpt_set_tid || ti->cpt_clear_tid)) { + ri->hooks |= (1<tid_ptrs[0] = ti->cpt_clear_tid; + ri->tid_ptrs[1] = ti->cpt_set_tid; + dprintk_ctx("settids\n"); + } + + if (ri && ri->hooks && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + if (try_module_get(THIS_MODULE)) + ri->hook = rst_resume_work; + } + + if (ti->cpt_state == TASK_TRACED) + tsk->state = TASK_TRACED; + else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { + tsk->signal->it_virt_expires = 0; + tsk->signal->it_prof_expires = 0; + if (tsk->state != TASK_DEAD) + eprintk_ctx("oops, schedule() did not make us dead\n"); + } + + if (thread_group_leader(tsk) && + ti->cpt_it_real_value && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + ktime_t val; + s64 nsec; + + nsec = ti->cpt_it_real_value; + val.tv64 = 0; + + if (ctx->image_version < CPT_VERSION_9) + nsec *= TICK_NSEC; + + val = ktime_add_ns(val, nsec - ctx->delta_nsec); + if (val.tv64 <= 0) + val.tv64 = NSEC_PER_USEC; + dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk), + (long long)val.tv64, + (unsigned long long)ti->cpt_it_real_value); + + spin_lock_irq(&tsk->sighand->siglock); + if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { + /* FIXME. Check!!!! */ + hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL); + } else { + wprintk_ctx("Timer clash. Impossible?\n"); + } + spin_unlock_irq(&tsk->sighand->siglock); + + dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), + (unsigned long long)val.tv64); + } + + module_put(THIS_MODULE); + } + return 0; +} diff --git a/kernel/cpt/rst_socket.c b/kernel/cpt/rst_socket.c new file mode 100644 index 0000000..22e1d1b --- /dev/null +++ b/kernel/cpt/rst_socket.c @@ -0,0 +1,918 @@ +/* + * + * kernel/cpt/rst_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +#include "cpt_syscalls.h" + + +static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + struct timeval tmptv; + + if (sk->sk_socket) { + sk->sk_socket->flags = si->cpt_ssflags; + sk->sk_socket->state = si->cpt_sstate; + } + sk->sk_reuse = si->cpt_reuse; + sk->sk_shutdown = si->cpt_shutdown; + sk->sk_userlocks = si->cpt_userlocks; + sk->sk_no_check = si->cpt_no_check; + sock_reset_flag(sk, SOCK_DBG); + if (si->cpt_debug) + sock_set_flag(sk, SOCK_DBG); + sock_reset_flag(sk, SOCK_RCVTSTAMP); + if (si->cpt_rcvtstamp) + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_LOCALROUTE); + if (si->cpt_localroute) + sock_set_flag(sk, SOCK_LOCALROUTE); + sk->sk_protocol = si->cpt_protocol; + sk->sk_err = si->cpt_err; + sk->sk_err_soft = si->cpt_err_soft; + sk->sk_priority = si->cpt_priority; + sk->sk_rcvlowat = si->cpt_rcvlowat; + sk->sk_rcvtimeo = si->cpt_rcvtimeo; + if (si->cpt_rcvtimeo == CPT_NULL) + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = si->cpt_sndtimeo; + if (si->cpt_sndtimeo == CPT_NULL) + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_rcvbuf = si->cpt_rcvbuf; + sk->sk_sndbuf = si->cpt_sndbuf; + sk->sk_bound_dev_if = si->cpt_bound_dev_if; + sk->sk_flags = si->cpt_flags; + sk->sk_lingertime = si->cpt_lingertime; + if (si->cpt_lingertime == CPT_NULL) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + sk->sk_peercred.pid = si->cpt_peer_pid; + sk->sk_peercred.uid = si->cpt_peer_uid; + sk->sk_peercred.gid = si->cpt_peer_gid; + cpt_timeval_import(&tmptv, si->cpt_stamp); + sk->sk_stamp = timeval_to_ktime(tmptv); + return 0; +} + +static struct file *sock_mapfile(struct socket *sock) +{ + int fd = sock_map_fd(sock, 0); + + if (fd >= 0) { + struct file *file = sock->file; + get_file(file); + sc_close(fd); + return file; + } + return ERR_PTR(fd); +} + +/* Assumption is that /tmp exists and writable. + * In previous versions we assumed that listen() will autobind + * the socket. It does not do this for AF_UNIX by evident reason: + * socket in abstract namespace is accessible, unlike socket bound + * to deleted FS object. 
+ */ + +static int +select_deleted_name(char * name, cpt_context_t *ctx) +{ + int i; + + for (i=0; i<100; i++) { + struct nameidata nd; + unsigned int rnd = net_random(); + + sprintf(name, "/tmp/SOCK.%08x", rnd); + + if (path_lookup(name, 0, &nd) != 0) + return 0; + + path_put(&nd.path); + } + + eprintk_ctx("failed to allocate deleted socket inode\n"); + return -ELOOP; +} + +static int +bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, + cpt_context_t *ctx) +{ + int err; + char *name; + struct sockaddr* addr; + int addrlen; + struct sockaddr_un sun; + struct nameidata nd; + + if ((addrlen = si->cpt_laddrlen) <= 2) + return 0; + + nd.path.dentry = NULL; + name = ((char*)si->cpt_laddr) + 2; + addr = (struct sockaddr *)si->cpt_laddr; + + if (name[0]) { + if (path_lookup(name, 0, &nd)) + nd.path.dentry = NULL; + + if (si->cpt_deleted) { + if (nd.path.dentry == NULL && + sock->ops->bind(sock, addr, addrlen) == 0) { + sc_unlink(name); + return 0; + } + + addr = (struct sockaddr*)&sun; + addr->sa_family = AF_UNIX; + name = ((char*)addr) + 2; + err = select_deleted_name(name, ctx); + if (err) + goto out; + addrlen = 2 + strlen(name); + } else if (nd.path.dentry) { + if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) { + eprintk_ctx("bind_unix_socket: not a socket dentry\n"); + err = -EINVAL; + goto out; + } + sc_unlink(name); + } + } + + err = sock->ops->bind(sock, addr, addrlen); + + if (!err && name[0]) { + if (nd.path.dentry) { + sc_chown(name, nd.path.dentry->d_inode->i_uid, + nd.path.dentry->d_inode->i_gid); + sc_chmod(name, nd.path.dentry->d_inode->i_mode); + } + if (si->cpt_deleted) + sc_unlink(name); + } + +out: + if (nd.path.dentry) + path_put(&nd.path); + return err; +} + +static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + struct sock *sk = sock->sk; + cpt_object_t *obj; + struct sock *parent; + + if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) + return 0; + + if (si->cpt_parent == -1) + return bind_unix_socket(sock, si, ctx); + + obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (!obj) + return 0; + + parent = obj->o_obj; + if (unix_sk(parent)->addr) { + if (unix_sk(sk)->addr && + atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) + kfree(unix_sk(sk)->addr); + atomic_inc(&unix_sk(parent)->addr->refcnt); + unix_sk(sk)->addr = unix_sk(parent)->addr; + } + return 0; +} + +static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else { + wprintk_ctx("strange socket queue type %u\n", type); + kfree_skb(skb); + } + } + return 0; +} + +static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct socket *sock2 = NULL; + struct file *file; + cpt_object_t *fobj; + cpt_object_t *pobj = NULL; + + err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) + return err; + + if (si->cpt_socketpair) { + err = sock_create(si->cpt_family, si->cpt_type, + si->cpt_protocol, &sock2); + if 
(err) + goto err_out; + + err = sock->ops->socketpair(sock, sock2); + if (err < 0) + goto err_out; + + /* Socketpair with a peer outside our environment. + * So, we create real half-open pipe and do not worry + * about dead end anymore. */ + if (si->cpt_peer == -1) { + sock_release(sock2); + sock2 = NULL; + } + } + + cpt_obj_setobj(obj, sock->sk, ctx); + + if (si->cpt_file != CPT_NULL) { + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto err_out; + + err = -ENOMEM; + + obj->o_parent = file; + + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + } + + if (sock2) { + struct file *file2; + + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); + if (!pobj) BUG(); + if (pobj->o_obj) BUG(); + cpt_obj_setobj(pobj, sock2->sk, ctx); + + if (pobj->o_ppos != CPT_NULL) { + file2 = sock_mapfile(sock2); + err = PTR_ERR(file2); + if (IS_ERR(file2)) + goto err_out; + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, pobj->o_ppos, ctx); + cpt_obj_setindex(fobj, si->cpt_peer, ctx); + + pobj->o_parent = file2; + } + } + + setup_sock_common(sock->sk, si, obj->o_pos, ctx); + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { + int saved_reuse = sock->sk->sk_reuse; + + inet_sk(sock->sk)->freebind = 1; + sock->sk->sk_reuse = 2; + if (si->cpt_laddrlen) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + dprintk_ctx("binding failed: %d, do not worry\n", err); + } + } + sock->sk->sk_reuse = saved_reuse; + rst_socket_in(si, obj->o_pos, sock->sk, ctx); + } else if (sock->sk->sk_family == AF_NETLINK) { + struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr; + if (nl->nl_pid) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + eprintk_ctx("AF_NETLINK binding failed: %d\n", err); + } + } + if (si->cpt_raddrlen && nl->nl_pid) { + err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); + if (err) { + eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } else if (sock->sk->sk_family == PF_PACKET) { + struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr; + if (ll->sll_protocol || ll->sll_ifindex) { + int alen = si->cpt_laddrlen; + if (alen < sizeof(struct sockaddr_ll)) + alen = sizeof(struct sockaddr_ll); + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen); + if (err) { + eprintk_ctx("AF_PACKET binding failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } + fixup_unix_address(sock, si, ctx); + + if (sock2) { + err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); + if (err) + return err; + setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); + fixup_unix_address(sock2, si, ctx); + } + + if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + && (int)si->cpt_parent != -1) { + cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) + sock->sk = NULL; + } + + + if (si->cpt_file == CPT_NULL && sock->sk && + sock->sk->sk_family == AF_INET) { + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + 
eprintk_ctx("oops, sock is locked by user\n"); + + sock_hold(sk); + sock_orphan(sk); + ub_inc_orphan_count(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + dprintk_ctx("orphaning socket %p\n", sk); + } + } + + if (si->cpt_file == CPT_NULL && sock->sk == NULL) + sock_release(sock); + + return 0; + +err_out: + if (sock2) + sock_release(sock2); + sock_release(sock); + return err; +} + +static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct file *file; + cpt_object_t *obj, *fobj; + + err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) { + eprintk_ctx("open_listening_socket: sock_create: %d\n", err); + return err; + } + + sock->sk->sk_reuse = 2; + sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; + + if (sock->sk->sk_family == AF_UNIX) { + err = bind_unix_socket(sock, si, ctx); + } else if (si->cpt_laddrlen) { + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + inet_sk(sock->sk)->freebind = 1; + + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + + if (err) { + eprintk_ctx("open_listening_socket: bind: %d\n", err); + goto err_out; + } + } + + err = sock->ops->listen(sock, si->cpt_max_ack_backlog); + if (err) { + eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); + goto err_out; + } + + /* Now we may access socket body directly and fixup all the things. */ + + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) { + eprintk_ctx("open_listening_socket: map: %d\n", err); + goto err_out; + } + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(obj, pos, ctx); + cpt_obj_setindex(obj, si->cpt_index, ctx); + obj->o_parent = file; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + + setup_sock_common(sock->sk, si, pos, ctx); + + if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) + rst_restore_synwait_queue(sock->sk, si, pos, ctx); + + return 0; + +err_out: + sock_release(sock); + return err; +} + +static int +rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + struct cpt_sockmc_image v; + + err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_family == AF_INET) + return rst_sk_mcfilter_in(sk, &v, pos, ctx); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (v.cpt_family == AF_INET6) + return rst_sk_mcfilter_in6(sk, &v, pos, ctx); +#endif + else + return -EAFNOSUPPORT; +} + + +static int +rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + struct sk_filter *fp, *old_fp; + loff_t pos = *pos_p; + struct cpt_obj_bits v; + + err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_size % sizeof(struct sock_filter)) + return -EINVAL; + + fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); + if (fp == NULL) + return -ENOMEM; + atomic_set(&fp->refcnt, 1); + fp->len = v.cpt_size/sizeof(struct sock_filter); + + err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); + if (err) { + sk_filter_uncharge(sk, fp); + return err; + } + + old_fp = sk->sk_filter; + sk->sk_filter = fp; + if (old_fp) + 
sk_filter_uncharge(sk, old_fp); + return 0; +} + + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + + err = rst_sock_attr_skfilter(pos_p, sk, ctx); + if (err && pos == *pos_p) + err = rst_sock_attr_mcfilter(pos_p, sk, ctx); + return err; +} + +struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) +{ + int err; + struct sk_buff *skb; + struct cpt_skb_image v; + loff_t pos = *pos_p; + struct scm_fp_list *fpl = NULL; + struct timeval tmptv; + + err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); + if (err) + return ERR_PTR(err); + *pos_p = pos + v.cpt_next; + + if (owner) + *owner = v.cpt_owner; + if (queue) + *queue = v.cpt_queue; + + skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOMEM); + skb_reserve(skb, v.cpt_hspace); + skb_put(skb, v.cpt_len); +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->transport_header = v.cpt_h; + skb->network_header = v.cpt_nh; + skb->mac_header = v.cpt_mac; +#else + skb->transport_header = skb->head + v.cpt_h; + skb->network_header = skb->head + v.cpt_nh; + skb->mac_header = skb->head + v.cpt_mac; +#endif + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); + skb->mac_len = v.cpt_mac_len; + + skb->csum = v.cpt_csum; + skb->local_df = v.cpt_local_df; + skb->pkt_type = v.cpt_pkt_type; + skb->ip_summed = v.cpt_ip_summed; + skb->priority = v.cpt_priority; + skb->protocol = v.cpt_protocol; + cpt_timeval_import(&tmptv, v.cpt_stamp); + skb->tstamp = timeval_to_ktime(tmptv); + + skb_shinfo(skb)->gso_segs = v.cpt_gso_segs; + skb_shinfo(skb)->gso_size = v.cpt_gso_size; + if (ctx->image_version == 0) { + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + } + + if (v.cpt_next > v.cpt_hdrlen) { + pos = pos + v.cpt_hdrlen; + while (pos < *pos_p) { + union { + struct cpt_obj_bits b; + struct cpt_fd_image f; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + if (u.b.cpt_object == CPT_OBJ_BITS) { + if (u.b.cpt_size != v.cpt_hspace + skb->len) { + eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); + kfree_skb(skb); + return ERR_PTR(-EINVAL); + } + + err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { + if (!fpl) { + fpl = kmalloc(sizeof(struct scm_fp_list), + GFP_KERNEL_UBC); + if (!fpl) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + fpl->count = 0; + UNIXCB(skb).fp = fpl; + } + fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); + if (!IS_ERR(fpl->fp[fpl->count])) + fpl->count++; + } + pos += u.b.cpt_next; + } + } + + return skb; +} + +static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + struct sock *owner_sk; + __u32 owner; + + skb = rst_skb(&pos, &owner, NULL, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + owner_sk = unix_peer(sk); + if (owner != -1) { + cpt_object_t *pobj; + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); + if (pobj == NULL) { + eprintk_ctx("orphan af_unix skb?\n"); + kfree_skb(skb); + continue; + } 
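+			/* cpt_owner holds the image index of the socket that
+			 * queued this skb; point the skb at the restored sender
+			 * so skb_set_owner_w() below charges its write buffer,
+			 * not the receiving socket's. */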
+ owner_sk = pobj->o_obj; + } + if (owner_sk == NULL) { + dprintk_ctx("orphan af_unix skb 2?\n"); + kfree_skb(skb); + continue; + } + skb_set_owner_w(skb, owner_sk); + if (UNIXCB(skb).fp) + skb->destructor = unix_destruct_fds; + skb_queue_tail(&sk->sk_receive_queue, skb); + if (sk->sk_state == TCP_LISTEN) { + struct socket *sock = skb->sk->sk_socket; + if (sock == NULL) BUG(); + if (sock->file) BUG(); + skb->sk->sk_socket = NULL; + skb->sk->sk_sleep = NULL; + sock->sk = NULL; + sock_release(sock); + } + } + return 0; +} + + +/* All the sockets are created before we start to open files */ + +int rst_sockets(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SOCKET]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) { + eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); + return err; + } + if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { + eprintk_ctx("rst_sockets: hdr err\n"); + return -EINVAL; + } + + /* The first pass: we create socket index and open listening sockets. */ + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) { + err = open_listening_socket(sec, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); + return err; + } + } else { + cpt_release_buf(ctx); + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setindex(obj, sbuf->cpt_index, ctx); + cpt_obj_setpos(obj, sec, ctx); + obj->o_ppos = sbuf->cpt_file; + intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); + } + sec += sbuf->cpt_next; + } + + /* Pass 2: really restore sockets */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + if (obj->o_obj != NULL) + continue; + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) BUG(); + err = open_socket(obj, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_socket: %d\n", err); + return err; + } + } + + return 0; +} + +int rst_orphans(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + obj->o_pos = sec; + obj->o_ppos = sbuf->cpt_file; + err = open_socket(obj, sbuf, ctx); + dprintk_ctx("Restoring orphan: %d\n", err); + free_cpt_object(obj, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += sbuf->cpt_next; + } + + return 0; +} + + +/* Pass 3: I understand, this is not funny already :-), + 
* but we have to do another pass to establish links between + * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX + * skb queues with proper skb->sk links. + * + * This could be made at the end of rst_sockets(), but we defer + * restoring af_unix queues up to the end of restoring files to + * make restoring passed FDs cleaner. + */ + +int rst_sockets_complete(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + struct sock *sk = obj->o_obj; + struct sock *peer; + + if (!sk) BUG(); + + if (sk->sk_family != AF_UNIX) + continue; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_next > sbuf->cpt_hdrlen) + restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); + + cpt_release_buf(ctx); + + if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { + cpt_object_t *pobj; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_peer != -1) { + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); + if (pobj) { + peer = pobj->o_obj; + sock_hold(peer); + unix_peer(sk) = peer; + } + } + cpt_release_buf(ctx); + } + } + + rst_orphans(ctx); + + return 0; +} + diff --git a/kernel/cpt/rst_socket_in.c b/kernel/cpt/rst_socket_in.c new file mode 100644 index 0000000..ddc2d5a --- /dev/null +++ b/kernel/cpt/rst_socket_in.c @@ -0,0 +1,489 @@ +/* + * + * kernel/cpt/rst_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline unsigned long jiffies_import(__u32 tmo) +{ + __s32 delta = tmo; + return jiffies + (long)delta; +} + +static inline __u32 tcp_jiffies_import(__u32 tmo) +{ + return ((__u32)jiffies) + tmo; +} + + +static int restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (sk->sk_type == SOCK_STREAM) { + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_OFOQ) { + struct tcp_sock *tp = tcp_sk(sk); + skb_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&tp->out_of_order_queue, skb); + } else if (type == CPT_SKB_WQ) { + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; + ub_tcpsndbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + wprintk_ctx("strange stream queue type %u\n", type); + kfree_skb(skb); + } + } else { + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_WQ) { + struct inet_sock *inet 
= inet_sk(sk); + if (inet->cork.fragsize) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + eprintk_ctx("cork skb is dropped\n"); + kfree_skb(skb); + } + } else { + wprintk_ctx("strange dgram queue type %u\n", type); + kfree_skb(skb); + } + } + } + return 0; +} + +static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) +{ + cpt_object_t *obj; + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && + sk->sk_state == TCP_LISTEN && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + inet_sk(sk)->sport == sport) + return sk; + } + return NULL; +} + +static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + tp->pred_flags = si->cpt_pred_flags; + tp->rcv_nxt = si->cpt_rcv_nxt; + tp->snd_nxt = si->cpt_snd_nxt; + tp->snd_una = si->cpt_snd_una; + tp->snd_sml = si->cpt_snd_sml; + tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); + tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); + tp->tcp_header_len = si->cpt_tcp_header_len; + inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; + inet_csk(sk)->icsk_ack.quick = si->cpt_quick; + inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; + inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; + inet_csk(sk)->icsk_ack.ato = si->cpt_ato; + inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); + inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; + inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; + tp->snd_wl1 = si->cpt_snd_wl1; + tp->snd_wnd = si->cpt_snd_wnd; + tp->max_window = si->cpt_max_window; + inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; + tp->mss_cache = si->cpt_mss_cache; + tp->rx_opt.mss_clamp = si->cpt_mss_clamp; + inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; + inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; + inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; + tp->reordering = si->cpt_reordering; + tp->frto_counter = si->cpt_frto_counter; + tp->frto_highmark = si->cpt_frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + // // tp->adv_cong = si->cpt_adv_cong; +#endif + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; + inet_csk(sk)->icsk_backoff = si->cpt_backoff; + tp->srtt = si->cpt_srtt; + tp->mdev = si->cpt_mdev; + tp->mdev_max = si->cpt_mdev_max; + tp->rttvar = si->cpt_rttvar; + tp->rtt_seq = si->cpt_rtt_seq; + inet_csk(sk)->icsk_rto = si->cpt_rto; + tp->packets_out = si->cpt_packets_out; + tp->retrans_out = si->cpt_retrans_out; + tp->lost_out = si->cpt_lost_out; + tp->sacked_out = si->cpt_sacked_out; + tp->fackets_out = si->cpt_fackets_out; + tp->snd_ssthresh = si->cpt_snd_ssthresh; + tp->snd_cwnd = si->cpt_snd_cwnd; + tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; + tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; + tp->snd_cwnd_used = si->cpt_snd_cwnd_used; + tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); + inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); + tp->rcv_wnd = si->cpt_rcv_wnd; + tp->rcv_wup = si->cpt_rcv_wup; + tp->write_seq = si->cpt_write_seq; + tp->pushed_seq = si->cpt_pushed_seq; + tp->copied_seq = si->cpt_copied_seq; + tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; + tp->rx_opt.wscale_ok = si->cpt_wscale_ok; + tp->rx_opt.sack_ok = si->cpt_sack_ok; + tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; + tp->rx_opt.snd_wscale = si->cpt_snd_wscale; + 
tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; + tp->nonagle = si->cpt_nonagle; + tp->keepalive_probes = si->cpt_keepalive_probes; + tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; + tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; + tp->rx_opt.ts_recent = si->cpt_ts_recent; + tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; + tp->rx_opt.user_mss = si->cpt_user_mss; + tp->rx_opt.dsack = si->cpt_dsack; + tp->rx_opt.eff_sacks = si->cpt_num_sacks; + tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; + tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; + tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; + tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; + tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; + tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; + tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; + tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; + tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; + tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; + + tp->window_clamp = si->cpt_window_clamp; + tp->rcv_ssthresh = si->cpt_rcv_ssthresh; + inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; + tp->rx_opt.num_sacks = si->cpt_num_sacks; + tp->advmss = si->cpt_advmss; + inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; + tp->ecn_flags = si->cpt_ecn_flags; + tp->prior_ssthresh = si->cpt_prior_ssthresh; + tp->high_seq = si->cpt_high_seq; + tp->retrans_stamp = si->cpt_retrans_stamp; + tp->undo_marker = si->cpt_undo_marker; + tp->undo_retrans = si->cpt_undo_retrans; + tp->urg_seq = si->cpt_urg_seq; + tp->urg_data = si->cpt_urg_data; + inet_csk(sk)->icsk_pending = si->cpt_pending; + tp->urg_mode = si->cpt_urg_mode; + tp->snd_up = si->cpt_snd_up; + tp->keepalive_time = si->cpt_keepalive_time; + tp->keepalive_intvl = si->cpt_keepalive_intvl; + tp->linger2 = si->cpt_linger2; + + sk->sk_send_head = NULL; + for (skb = skb_peek(&sk->sk_write_queue); + skb && skb != (struct sk_buff*)&sk->sk_write_queue; + skb = skb->next) { + if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { + sk->sk_send_head = skb; + break; + } + } + + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { + struct inet_sock *inet = inet_sk(sk); + if (inet->num == 0) { + cpt_object_t *lobj = NULL; + + if ((int)si->cpt_parent != -1) + lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + + if (lobj && lobj->o_obj) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(lobj->o_obj, sk); + local_bh_enable(); + dprintk_ctx("port inherited from parent\n"); + } else { + struct sock *lsk = find_parent(inet->sport, ctx); + if (lsk) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(lsk, sk); + local_bh_enable(); + dprintk_ctx("port inherited\n"); + } else { + eprintk_ctx("we are kinda lost...\n"); + } + } + } + + sk->sk_prot->hash(sk); + + if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) + sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); + if (inet_csk(sk)->icsk_pending) + sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, + inet_csk(sk)->icsk_timeout); + if (sock_flag(sk, SOCK_KEEPOPEN)) { + unsigned long expires = jiffies_import(si->cpt_ka_timeout); + if (time_after(jiffies, expires)) + expires = jiffies + HZ; + sk_reset_timer(sk, &sk->sk_timer, expires); + } + } + + return 0; +} + + +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct net *net = 
get_exec_env()->ve_ns->net_ns; + + lock_sock(sk); + + sk->sk_state = si->cpt_state; + + inet->daddr = si->cpt_daddr; + inet->dport = si->cpt_dport; + inet->saddr = si->cpt_saddr; + inet->rcv_saddr = si->cpt_rcv_saddr; + inet->sport = si->cpt_sport; + inet->uc_ttl = si->cpt_uc_ttl; + inet->tos = si->cpt_tos; + inet->cmsg_flags = si->cpt_cmsg_flags; + inet->mc_index = si->cpt_mc_index; + inet->mc_addr = si->cpt_mc_addr; + inet->hdrincl = si->cpt_hdrincl; + inet->mc_ttl = si->cpt_mc_ttl; + inet->mc_loop = si->cpt_mc_loop; + inet->pmtudisc = si->cpt_pmtudisc; + inet->recverr = si->cpt_recverr; + inet->freebind = si->cpt_freebind; + inet->id = si->cpt_idcounter; + + inet->cork.flags = si->cpt_cork_flags; + inet->cork.fragsize = si->cpt_cork_fragsize; + inet->cork.length = si->cpt_cork_length; + inet->cork.addr = si->cpt_cork_addr; + inet->cork.fl.fl4_src = si->cpt_cork_saddr; + inet->cork.fl.fl4_dst = si->cpt_cork_daddr; + inet->cork.fl.oif = si->cpt_cork_oif; + if (inet->cork.fragsize) { + if (ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) { + eprintk_ctx("failed to restore cork route\n"); + inet->cork.fragsize = 0; + } + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + up->pending = si->cpt_udp_pending; + up->corkflag = si->cpt_udp_corkflag; + up->encap_type = si->cpt_udp_encap; + up->len = si->cpt_udp_len; + } + + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + memcpy(&np->saddr, si->cpt_saddr6, 16); + memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); + memcpy(&np->daddr, si->cpt_daddr6, 16); + np->flow_label = si->cpt_flow_label6; + np->frag_size = si->cpt_frag_size6; + np->hop_limit = si->cpt_hop_limit6; + np->mcast_hops = si->cpt_mcast_hops6; + np->mcast_oif = si->cpt_mcast_oif6; + np->rxopt.all = si->cpt_rxopt6; + np->mc_loop = si->cpt_mc_loop6; + np->recverr = si->cpt_recverr6; + np->sndflow = si->cpt_sndflow6; + np->pmtudisc = si->cpt_pmtudisc6; + np->ipv6only = si->cpt_ipv6only6; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (si->cpt_mapped) { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) { + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + } + } +#endif + } + + restore_queues(sk, si, pos, ctx); + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + rst_socket_tcp(si, pos, sk, ctx); + + release_sock(sk); + return 0; +} + +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) +{ + struct request_sock *req; + + if (lsk->sk_state != TCP_LISTEN) + return -EINVAL; + + req = reqsk_alloc(&tcp_request_sock_ops); + if (!req) + return -ENOMEM; + + sk->sk_socket = NULL; + sk->sk_sleep = NULL; + inet_csk_reqsk_queue_add(lsk, req, sk); + return 0; +} + +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end = si->cpt_next; + + pos += si->cpt_hdrlen; + while (pos < end) { + struct cpt_openreq_image oi; + + err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); + if (err) { + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + continue; + } + + if (oi.cpt_object == CPT_OBJ_OPENREQ) { + struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); + if (req == NULL) + return -ENOMEM; + + memset(req, 0, sizeof(*req)); + tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; + tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; + 
inet_rsk(req)->rmt_port = oi.cpt_rmt_port; + req->mss = oi.cpt_mss; + req->retrans = oi.cpt_retrans; + inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; + inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; + inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; + inet_rsk(req)->sack_ok = oi.cpt_sack_ok; + inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; + inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; + inet_rsk(req)->acked = oi.cpt_acked; + req->window_clamp = oi.cpt_window_clamp; + req->rcv_wnd = oi.cpt_rcv_wnd; + req->ts_recent = oi.cpt_ts_recent; + req->expires = jiffies_import(oi.cpt_expires); + + if (oi.cpt_family == AF_INET) { + memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); + memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); + memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); + inet6_rsk(req)->iif = oi.cpt_iif; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); +#endif + } + } + pos += oi.cpt_next; + } + return 0; +} + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + struct ip_mreqn imr; + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + memset(&imr, 0, sizeof(imr)); + imr.imr_ifindex = v->cpt_ifindex; + imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; + return ip_mc_join_group(sk, &imr); +} + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + return ipv6_sock_mc_join(sk, v->cpt_ifindex, + (struct in6_addr*)v->cpt_mcaddr); +} +#endif diff --git a/kernel/cpt/rst_sysvipc.c b/kernel/cpt/rst_sysvipc.c new file mode 100644 index 0000000..40127d7 --- /dev/null +++ b/kernel/cpt/rst_sysvipc.c @@ -0,0 +1,633 @@ +/* + * + * kernel/cpt/rst_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int fixup_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + + if (shp->shm_file != warg->file) + return 0; + if (shp->shm_nattch) + return -EEXIST; + + shp->shm_perm.uid = warg->v->cpt_uid; + shp->shm_perm.gid = warg->v->cpt_gid; + shp->shm_perm.cuid = warg->v->cpt_cuid; + shp->shm_perm.cgid = warg->v->cpt_cgid; + shp->shm_perm.mode = warg->v->cpt_mode; + + shp->shm_atim = warg->v->cpt_atime; + shp->shm_dtim = warg->v->cpt_dtime; + shp->shm_ctim = warg->v->cpt_ctime; + shp->shm_cprid = warg->v->cpt_creator; + shp->shm_lprid = warg->v->cpt_last; + + /* TODO: fix shp->mlock_user? 
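+	 * mlock_user identifies the user who SHM_LOCKed the segment; this
+	 * fixup restores permissions, times and creator/last pids only and
+	 * leaves mlock_user untouched.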
*/ + return 1; +} + +static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) +{ + struct _warg warg; + + warg.file = file; + warg.v = v; + + return sysvipc_walk_shm(fixup_one_shm, &warg); +} + +static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_dentry->d_inode->i_fop->write; + if (do_write == NULL) { + eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n"); + return -EINVAL; + } + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + int err; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + return err; + dprintk_ctx("restoring SHM block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + return err; + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + eprintk_ctx("write() failure\n"); + if (err >= 0) + err = -EIO; + return err; + } + count -= copy; + } + pos += pgb.cpt_next; + } + return 0; +} + +struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx) +{ + struct file *file; + int err; + loff_t dpos, epos; + union { + struct cpt_file_image fi; + struct cpt_sysvshm_image shmi; + struct cpt_inode_image ii; + } u; + + err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); + if (err < 0) + goto err_out; + pos = u.fi.cpt_inode; + err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); + if (err < 0) + goto err_out; + dpos = pos + u.ii.cpt_hdrlen; + epos = pos + u.ii.cpt_next; + err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); + if (err < 0) + goto err_out; + dpos += u.shmi.cpt_next; + + file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, + u.shmi.cpt_segsz, u.shmi.cpt_mode); + if (!IS_ERR(file)) { + err = fixup_shm(file, &u.shmi); + if (err != -EEXIST && dpos < epos) + err = fixup_shm_data(file, dpos, epos, ctx); + } else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) { + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + struct shmid_kernel *shp; + + shp = shm_lock(ipc_ns, u.shmi.cpt_id); + BUG_ON(IS_ERR(shp)); + get_file(shp->shm_file); + file = shp->shm_file; + shm_unlock(shp); + } + return file; + +err_out: + return ERR_PTR(err); +} + +struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx) +{ + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + struct file *file; + union { + struct cpt_file_image fi; + struct cpt_inode_image ii; + struct cpt_sysvshm_image shmi; + } u; + struct shmid_kernel *shp; + struct shm_file_data *sfd; + struct path path; + mode_t f_mode; + loff_t pos; + int err; + + pos = vmai->cpt_file; + file = rst_sysv_shm_itself(pos, ctx); + if (IS_ERR(file) && PTR_ERR(file) != -EEXIST) + return file; + fput(file); + + err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); + if (err < 0) + goto err_out; + pos = u.fi.cpt_inode; + err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); + if (err < 0) + goto err_out; + err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + 
u.ii.cpt_hdrlen, &u.shmi, ctx); + if (err < 0) + goto err_out; + + shp = shm_lock(ipc_ns, u.shmi.cpt_id); + BUG_ON(IS_ERR(shp)); + path.dentry = dget(shp->shm_file->f_path.dentry); + path.mnt = shp->shm_file->f_path.mnt; + shm_unlock(shp); + + err = -ENOMEM; + sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); + if (!sfd) + goto out_put_dentry; + + f_mode = 0; + if (vmai->cpt_flags & VM_READ) + f_mode |= FMODE_READ; + if (vmai->cpt_flags & VM_WRITE) + f_mode |= FMODE_WRITE; + if (vmai->cpt_flags & VM_EXEC) + f_mode |= FMODE_EXEC; + + err = -ENOMEM; + file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); + if (!file) + goto out_free; + + file->private_data = sfd; + file->f_mapping = shp->shm_file->f_mapping; + sfd->id = shp->shm_perm.id; + sfd->ns = get_ipc_ns(ipc_ns); + sfd->file = shp->shm_file; + sfd->vm_ops = NULL; + + return file; + +out_free: + kfree(sfd); +out_put_dentry: + dput(path.dentry); +err_out: + return ERR_PTR(err); +} + +static int attach_one_undo(int semid, struct sem_array *sma, void *arg) +{ + struct sem_undo *su = arg; + struct sem_undo_list *undo_list = current->sysvsem.undo_list; + + if (semid != su->semid) + return 0; + + list_add(&su->list_proc, &undo_list->list_proc); + list_add(&su->list_id, &sma->list_id); + + return 1; +} + +static int attach_undo(struct sem_undo *su) +{ + return sysvipc_walk_sem(attach_one_undo, su); +} + +static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *undo_list; + + if (current->sysvsem.undo_list) { + eprintk_ctx("Funny undo_list\n"); + return 0; + } + + undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC); + if (undo_list == NULL) + return -ENOMEM; + + atomic_set(&undo_list->refcnt, 1); + spin_lock_init(&undo_list->lock); + current->sysvsem.undo_list = undo_list; + + if (sui->cpt_next > sui->cpt_hdrlen) { + loff_t offset = pos + sui->cpt_hdrlen; + do { + struct sem_undo *new; + struct cpt_sysvsem_undo_image spi; + err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); + if (err) + goto out; + new = kmalloc(sizeof(struct sem_undo) + + sizeof(short)*spi.cpt_nsem, + GFP_KERNEL_UBC); + if (!new) { + err = -ENOMEM; + goto out; + } + + memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); + new->semadj = (short *) &new[1]; + new->semid = spi.cpt_id; + err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); + if (err) { + kfree(new); + goto out; + } + err = attach_undo(new); + if (err <= 0) { + if (err == 0) + err = -ENOENT; + kfree(new); + goto out; + } + offset += spi.cpt_next; + } while (offset < pos + sui->cpt_next); + } + err = 0; + +out: + return err; +} + +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + +#if 0 + if (ti->cpt_sysvsem_undo == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) + flag |= CLONE_SYSVSEM; +#endif + return flag; +} + +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *f = current->sysvsem.undo_list; + cpt_object_t *obj; + struct cpt_object_hdr sui; + + if (ti->cpt_sysvsem_undo == CPT_NULL) { + exit_sem(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_sem(current); + f = obj->o_obj; + atomic_inc(&f->refcnt); + current->sysvsem.undo_list = f; + } + return 0; + } + + if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, 
ti->cpt_sysvsem_undo, &sui, ctx)) != 0) + goto out; + + if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) + goto out; + + err = -ENOMEM; + obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); + if (obj) { + err = 0; + cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); + } + + return 0; + +out: + return err; +} + +struct _sarg { + int semid; + struct cpt_sysvsem_image *v; + __u32 *arr; +}; + +static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) +{ + struct _sarg *warg = arg; + + if (semid != warg->semid) + return 0; + + sma->sem_perm.uid = warg->v->cpt_uid; + sma->sem_perm.gid = warg->v->cpt_gid; + sma->sem_perm.cuid = warg->v->cpt_cuid; + sma->sem_perm.cgid = warg->v->cpt_cgid; + sma->sem_perm.mode = warg->v->cpt_mode; + sma->sem_perm.seq = warg->v->cpt_seq; + + sma->sem_ctime = warg->v->cpt_ctime; + sma->sem_otime = warg->v->cpt_otime; + memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); + return 1; +} + +static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) +{ + struct _sarg warg; + + warg.semid = semid; + warg.v = v; + warg.arr = arr; + + return sysvipc_walk_sem(fixup_one_sem, &warg); +} + + +static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, + struct cpt_context *ctx) +{ + int err; + __u32 *arr; + int nsems = (si->cpt_next - si->cpt_hdrlen)/8; + + arr = kmalloc(nsems*8, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); + if (err) + goto out; + err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); + if (err < 0) { + eprintk_ctx("SEM 3\n"); + goto out; + } + err = fixup_sem(si->cpt_id, si, arr); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + kfree(arr); + return err; +} + +static int rst_sysv_sem(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvsem_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); + if (err) + return err; + err = restore_sem(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + +struct _marg { + int msqid; + struct cpt_sysvmsg_image *v; + struct msg_queue *m; +}; + +static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg) +{ + struct _marg *warg = arg; + + if (msqid != warg->msqid) + return 0; + + msq->q_perm.uid = warg->v->cpt_uid; + msq->q_perm.gid = warg->v->cpt_gid; + msq->q_perm.cuid = warg->v->cpt_cuid; + msq->q_perm.cgid = warg->v->cpt_cgid; + msq->q_perm.mode = warg->v->cpt_mode; + msq->q_perm.seq = warg->v->cpt_seq; + + msq->q_stime = warg->v->cpt_stime; + msq->q_rtime = warg->v->cpt_rtime; + msq->q_ctime = warg->v->cpt_ctime; + msq->q_lspid = warg->v->cpt_last_sender; + msq->q_lrpid = warg->v->cpt_last_receiver; + msq->q_qbytes = warg->v->cpt_qbytes; + + warg->m = msq; + return 1; +} + +struct _larg +{ + cpt_context_t * ctx; + loff_t pos; +}; + +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + struct _larg * arg = data; + return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset); +} + +static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos, + cpt_context_t * ctx) +{ + int err; + struct _marg warg; + loff_t endpos = pos 
+ v->cpt_next; + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + pos += v->cpt_hdrlen; + + warg.msqid = msqid; + warg.v = v; + + err = sysvipc_walk_msg(fixup_one_msg, &warg); + if (err <= 0) + return err; + + while (pos < endpos) { + struct cpt_sysvmsg_msg_image mi; + struct msg_msg *m; + struct _larg data = { + .ctx = ctx + }; + + err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx); + if (err) + return err; + data.pos = pos + mi.cpt_hdrlen; + m = sysv_msg_load(do_load_msg, mi.cpt_size, &data); + if (IS_ERR(m)) + return PTR_ERR(m); + m->m_type = mi.cpt_type; + m->m_ts = mi.cpt_size; + list_add_tail(&m->m_list, &warg.m->q_messages); + warg.m->q_cbytes += m->m_ts; + warg.m->q_qnum++; + atomic_add(m->m_ts, &ns->msg_bytes); + atomic_inc(&ns->msg_hdrs); + + pos += mi.cpt_next; + } + return 1; +} + +static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si, + struct cpt_context *ctx) +{ + int err; + + err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode); + if (err < 0) { + eprintk_ctx("MSG 3\n"); + goto out; + } + err = fixup_msg(si->cpt_id, si, pos, ctx); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + return err; +} + +static int rst_sysv_msg(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvmsg_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx); + if (err) + return err; + err = restore_msg(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + + +int rst_sysv_ipc(struct cpt_context *ctx) +{ + int err; + + err = rst_sysv_sem(ctx); + if (!err) + err = rst_sysv_msg(ctx); + + return err; +} diff --git a/kernel/cpt/rst_tty.c b/kernel/cpt/rst_tty.c new file mode 100644 index 0000000..48bc4ce --- /dev/null +++ b/kernel/cpt/rst_tty.c @@ -0,0 +1,384 @@ +/* + * + * kernel/cpt/rst_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int pty_setup(struct tty_struct *stty, loff_t pos, + struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + unsigned long flags; + + stty->pgrp = NULL; + stty->session = NULL; + stty->packet = pi->cpt_packet; + stty->stopped = pi->cpt_stopped; + stty->hw_stopped = pi->cpt_hw_stopped; + stty->flow_stopped = pi->cpt_flow_stopped; +#define DONOT_CHANGE ((1<flags & DONOT_CHANGE; + stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); + stty->ctrl_status = pi->cpt_ctrl_status; + stty->winsize.ws_row = pi->cpt_ws_row; + stty->winsize.ws_col = pi->cpt_ws_col; + stty->winsize.ws_ypixel = pi->cpt_ws_prow; + stty->winsize.ws_xpixel = pi->cpt_ws_pcol; + stty->canon_column = pi->cpt_canon_column; + stty->column = pi->cpt_column; + stty->raw = pi->cpt_raw; + stty->real_raw = pi->cpt_real_raw; + stty->erasing = pi->cpt_erasing; + stty->lnext = pi->cpt_lnext; + stty->icanon = pi->cpt_icanon; + stty->closing = pi->cpt_closing; + stty->minimum_to_wake = pi->cpt_minimum_to_wake; + + stty->termios->c_iflag = pi->cpt_c_iflag; + stty->termios->c_oflag = pi->cpt_c_oflag; + stty->termios->c_lflag = pi->cpt_c_lflag; + stty->termios->c_cflag = pi->cpt_c_cflag; + memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); + memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags)); + + if (pi->cpt_next > pi->cpt_hdrlen) { + int err; + struct cpt_obj_bits b; + err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); + if (err) + return err; + if (b.cpt_size == 0) + return 0; + err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); + if (err) + return err; + + spin_lock_irq(&stty->read_lock); + stty->read_tail = 0; + stty->read_cnt = b.cpt_size; + stty->read_head = b.cpt_size; + stty->canon_head = stty->read_tail + pi->cpt_canon_head; + stty->canon_data = pi->cpt_canon_data; + spin_unlock_irq(&stty->read_lock); + } + + return 0; +} + +/* Find slave/master tty in image, when we already know master/slave. + * It might be optimized, of course. 
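+ * find_pty_pair() below rescans the whole CPT_SECT_TTY section for the
+ * entry that shares cpt_index and the TTY_DRIVER_DEVPTS_MEM setting with
+ * the given tty but sits at a different image position, applies
+ * pty_setup() to the peer tty and returns that entry's position.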
*/ +static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_tty_image *pibuf; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return CPT_NULL; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return CPT_NULL; + pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); + if (pibuf == NULL) { + eprintk_ctx("cannot allocate buffer\n"); + return CPT_NULL; + } + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) + return CPT_NULL; + if (pibuf->cpt_index == pi->cpt_index && + !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && + pos != sec) { + pty_setup(stty, sec, pibuf, ctx); + return sec; + } + sec += pibuf->cpt_next; + } + kfree(pibuf); + return CPT_NULL; +} + +static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, + struct cpt_context *ctx) +{ + int err; + struct iattr newattrs; + struct dentry *d = master->f_dentry; + + newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = ii->cpt_mode; + + mutex_lock(&d->d_inode->i_mutex); + err = notify_change(d, &newattrs); + mutex_unlock(&d->d_inode->i_mutex); + + return err; +} + +/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open + * /dev/ptmx until we get pty with desired index. + */ + +struct file *ptmx_open(int index, unsigned int flags) +{ + struct file *file; + struct file **stack = NULL; + int depth = 0; + + for (;;) { + struct tty_struct *tty; + + file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(file)) + break; + tty = file->private_data; + if (tty->index == index) + break; + + if (depth == PAGE_SIZE/sizeof(struct file *)) { + fput(file); + file = ERR_PTR(-EBUSY); + break; + } + if (stack == NULL) { + stack = (struct file **)__get_free_page(GFP_KERNEL); + if (!stack) { + fput(file); + file = ERR_PTR(-ENOMEM); + break; + } + } + stack[depth] = file; + depth++; + } + while (depth > 0) { + depth--; + fput(stack[depth]); + } + if (stack) + free_page((unsigned long)stack); + return file; +} + + +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, + unsigned flags, struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct file *master, *slave; + struct tty_struct *stty; + struct cpt_tty_image *pi; + static char *a = "pqrstuvwxyzabcde"; + static char *b = "0123456789abcdef"; + char pairname[16]; + unsigned master_flags, slave_flags; + + if (fi->cpt_priv == CPT_NULL) + return ERR_PTR(-EINVAL); + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); + if (obj && obj->o_parent) { + dprintk_ctx("obtained pty as pair to existing\n"); + master = obj->o_parent; + stty = master->private_data; + + if (stty->driver->subtype == PTY_TYPE_MASTER && + (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { + wprintk_ctx("cloning ptmx\n"); + get_file(master); + return master; + } + + master = dentry_open(dget(master->f_dentry), + mntget(master->f_vfsmnt), flags); + if (!IS_ERR(master)) { + stty = master->private_data; + if (stty->driver->subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, master, ctx); + } + return master; + } + + pi = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx); + if (err) { + cpt_release_buf(ctx); + return ERR_PTR(err); + } + + master_flags = 
slave_flags = 0; + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) + master_flags = flags; + else + slave_flags = flags; + + /* + * Open pair master/slave. + */ + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { + master = ptmx_open(pi->cpt_index, master_flags); + } else { + sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); + master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + } + if (IS_ERR(master)) { + eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master)); + cpt_release_buf(ctx); + return master; + } + stty = master->private_data; + clear_bit(TTY_PTY_LOCK, &stty->flags); + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) + sprintf(pairname, "/dev/pts/%d", stty->index); + else + sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); + slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(slave)) { + eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); + fput(master); + cpt_release_buf(ctx); + return slave; + } + + if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, slave, ctx); + + cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); + cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); + cpt_object_add(CPT_OBJ_FILE, master, ctx); + cpt_object_add(CPT_OBJ_FILE, slave, ctx); + + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(master); + cpt_release_buf(ctx); + return master; + } else { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty->link, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(slave); + cpt_release_buf(ctx); + return slave; + } +} + +int rst_tty_jobcontrol(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_tty_image *pibuf = cpt_get_buf(ctx); + + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { + cpt_release_buf(ctx); + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); + if (obj) { + struct tty_struct *stty = obj->o_obj; + if ((int)pibuf->cpt_pgrp > 0) { + rcu_read_lock(); + stty->pgrp = get_pid(alloc_vpid_safe(pibuf->cpt_pgrp)); + rcu_read_unlock(); + if (!stty->pgrp) + dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); + } else if (pibuf->cpt_pgrp) { + stty->pgrp = alloc_pid(current->nsproxy->pid_ns, + 0); + if (!stty->pgrp) { + eprintk_ctx("cannot allocate stray tty->pgrp"); + cpt_release_buf(ctx); + return -EINVAL; + 
} + } + if ((int)pibuf->cpt_session > 0) { + struct pid *sess; + + rcu_read_lock(); + sess = get_pid(alloc_vpid_safe(pibuf->cpt_session)); + rcu_read_unlock(); + if (!sess) { + dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); + } else if (!stty->session) { + stty->session = sess; + } + } + } + sec += pibuf->cpt_next; + cpt_release_buf(ctx); + } + return 0; +} diff --git a/kernel/cpt/rst_ubc.c b/kernel/cpt/rst_ubc.c new file mode 100644 index 0000000..a39ae28 --- /dev/null +++ b/kernel/cpt/rst_ubc.c @@ -0,0 +1,131 @@ +/* + * + * kernel/cpt/rst_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); + if (obj == NULL) { + eprintk("RST: unknown ub @%Ld\n", (long long)pos); + return get_beancounter(get_exec_ub()); + } + return get_beancounter(obj->o_obj); +} + +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id) +{ + to[bc_parm_id].barrier = from[bc_parm_id].barrier; + to[bc_parm_id].limit = from[bc_parm_id].limit; +} + +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id) +{ + ubprm[bc_parm_id].barrier = UB_MAXVALUE; + ubprm[bc_parm_id].limit = UB_MAXVALUE; +} + +static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier); + prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit); + if (held) + prm->held = dmp->held; + prm->maxheld = dmp->maxheld; + prm->minheld = dmp->minheld; + prm->failcnt = dmp->failcnt; +} + +static int restore_one_bc(struct cpt_beancounter_image *v, + cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + cpt_object_t *pobj; + int i; + + if (v->cpt_parent != CPT_NULL) { + pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); + if (pobj == NULL) + return -ESRCH; + bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); + } else { + bc = get_exec_ub(); + while (bc->parent) + bc = bc->parent; + get_beancounter(bc); + } + if (bc == NULL) + return -ENOMEM; + obj->o_obj = bc; + + if (ctx->image_version < CPT_VERSION_18 && + CPT_VERSION_MINOR(ctx->image_version) < 1) + goto out; + + for (i = 0; i < UB_RESOURCES; i++) { + restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + restore_one_bc_parm(v->cpt_parms + i * 2 + 1, + bc->ub_store + i, 1); + } + +out: + if (!bc->parent) + for (i = 0; i < UB_RESOURCES; i++) + copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i); + + return 0; +} + +int rst_undump_ubc(struct cpt_context *ctx) +{ + loff_t start, end; + struct cpt_beancounter_image *v; + cpt_object_t *obj; + int err; + + err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + v = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_UBC, obj, ctx); + + restore_one_bc(v, obj, ctx); + + cpt_release_buf(ctx); + start += v->cpt_next; + } + return 0; +} + +void rst_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_UBC) + put_beancounter(obj->o_obj); +} diff --git a/kernel/cpt/rst_undump.c 
b/kernel/cpt/rst_undump.c new file mode 100644 index 0000000..5c8977a --- /dev/null +++ b/kernel/cpt/rst_undump.c @@ -0,0 +1,1007 @@ +/* + * + * kernel/cpt/rst_undump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_socket.h" +#include "cpt_net.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + +static int rst_utsname(cpt_context_t *ctx); + + +struct thr_context { + struct completion init_complete; + struct completion task_done; + int error; + struct cpt_context *ctx; + cpt_object_t *tobj; +}; + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); + +static int vps_rst_veinfo(struct cpt_context *ctx) +{ + int err; + struct cpt_veinfo_image *i; + struct ve_struct *ve; + struct timespec delta; + loff_t start, end; + struct ipc_namespace *ns; + + err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); + if (err) + goto out; + + i = cpt_get_buf(ctx); + memset(i, 0, sizeof(*i)); + err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); + if (err) + goto out_rel; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + /* Damn. Fatal mistake, these two values are size_t! */ + ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU; + ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU; + ns->shm_ctlmni = i->shm_ctl_mni; + + ns->msg_ctlmax = i->msg_ctl_max; + ns->msg_ctlmni = i->msg_ctl_mni; + ns->msg_ctlmnb = i->msg_ctl_mnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + ns->sem_ctls[0] = i->sem_ctl_arr[0]; + ns->sem_ctls[1] = i->sem_ctl_arr[1]; + ns->sem_ctls[2] = i->sem_ctl_arr[2]; + ns->sem_ctls[3] = i->sem_ctl_arr[3]; + + cpt_timespec_import(&delta, i->start_timespec_delta); + _set_normalized_timespec(&ve->start_timespec, + ve->start_timespec.tv_sec - delta.tv_sec, + ve->start_timespec.tv_nsec - delta.tv_nsec); + ve->start_jiffies -= i->start_jiffies_delta; + // // FIXME: what??? 
+ // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; + + ctx->last_vpid = i->last_pid; + + err = 0; +out_rel: + cpt_release_buf(ctx); +out: + return err; +} + +static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err; + struct env_create_param3 param; + + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + do_gettimespec(&ctx->delta_time); + + _set_normalized_timespec(&ctx->delta_time, + ctx->delta_time.tv_sec - ctx->start_time.tv_sec, + ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec); + ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec; + if (ctx->delta_nsec < 0) { + wprintk_ctx("Wall time is behind source by %Ld ns, " + "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec); + } + + _set_normalized_timespec(&ctx->cpt_monotonic_time, + ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec, + ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec); + + memset(¶m, 0, sizeof(param)); + param.iptables_mask = ctx->iptables_mask; + param.feature_mask = ctx->features; + + /* feature_mask is set as required - pretend we know everything */ + param.known_features = (ctx->image_version < CPT_VERSION_18) ? + VE_FEATURES_OLD : ~(__u64)0; + + err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, + ¶m, sizeof(param)); + if (err < 0) + eprintk_ctx("real_env_create: %d\n", err); + + get_exec_env()->jiffies_fixup = + (ctx->delta_time.tv_sec < 0 ? + 0 : timespec_to_jiffies(&ctx->delta_time)) - + (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64); + dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup, + (long long)ctx->delta_nsec); + return err < 0 ? err : 0; +} + +static int hook(void *arg) +{ + struct thr_context *thr_ctx = arg; + struct cpt_context *ctx; + cpt_object_t *tobj; + struct cpt_task_image *ti; + int err = 0; + int exiting = 0; + + current->state = TASK_UNINTERRUPTIBLE; + complete(&thr_ctx->init_complete); + schedule(); + + ctx = thr_ctx->ctx; + tobj = thr_ctx->tobj; + ti = tobj->o_image; + + current->fs->umask = 0; + + if (ti->cpt_pid == 1) { +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + err = vps_rst_reparent_root(tobj, ctx); + + if (err) { + rst_report_error(err, ctx); + goto out; + } + + memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t)); + + if (ctx->statusfile) { + fput(ctx->statusfile); + ctx->statusfile = NULL; + } + + if (ctx->lockfile) { + char b; + mm_segment_t oldfs; + err = -EINVAL; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) + err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); + set_fs(oldfs); + fput(ctx->lockfile); + ctx->lockfile = NULL; + } + + if (err) { + eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err); + goto out; + } + err = vps_rst_veinfo(ctx); + if (err) { + eprintk_ctx("rst_veinfo: %d\n", err); + goto out; + } + + err = rst_utsname(ctx); + if (err) { + eprintk_ctx("rst_utsname: %d\n", err); + goto out; + } + + err = rst_root_namespace(ctx); + if (err) { + eprintk_ctx("rst_namespace: %d\n", err); + goto out; + } + + if ((err = rst_restore_net(ctx)) != 0) { + eprintk_ctx("rst_restore_net: %d\n", err); + goto out; + } + + err = rst_sockets(ctx); + if (err) { + eprintk_ctx("rst_sockets: %d\n", err); + goto out; + } + err = rst_sysv_ipc(ctx); + if (err) { + eprintk_ctx("rst_sysv_ipc: %d\n", err); + goto out; + } +#ifdef CONFIG_BEANCOUNTERS + bc = get_exec_ub(); + 
set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE); + set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE); +#endif + } + + do { + if (current->user->uid != ti->cpt_user) { + struct user_struct *u; + + u = alloc_uid(get_exec_env()->ve_ns->user_ns, ti->cpt_user); + if (!u) { + eprintk_ctx("alloc_user\n"); + } else { + switch_uid(u); + } + } + } while (0); + + if ((err = rst_mm_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_mm: %d\n", err); + goto out; + } + + if ((err = rst_files_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_files: %d\n", err); + goto out; + } + + if ((err = rst_fs_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_fs: %d\n", err); + goto out; + } + + if ((err = rst_semundo_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_semundo: %d\n", err); + goto out; + } + + if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) { + eprintk_ctx("rst_signal: %d\n", err); + goto out; + } + + if (ti->cpt_personality != 0) + __set_personality(ti->cpt_personality); + +#ifdef CONFIG_X86_64 + /* 32bit app from 32bit OS, won't have PER_LINUX32 set... :/ */ + if (!ti->cpt_64bit) + __set_personality(PER_LINUX32); +#endif + + current->set_child_tid = NULL; + current->clear_child_tid = NULL; + current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); + current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); + current->exit_code = ti->cpt_exit_code; + current->pdeath_signal = ti->cpt_pdeath_signal; + + if (ti->cpt_restart.fn != CPT_RBL_0) { + if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP +#ifdef CONFIG_COMPAT + || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP +#endif + ) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + + if (ctx->image_version >= CPT_VERSION_20) + e = ktime_add_ns(e, ti->cpt_restart.arg2); + else if (ctx->image_version >= CPT_VERSION_9) + e = ktime_add_ns(e, ti->cpt_restart.arg0); + else + e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC); + if (e.tv64 < 0) + e.tv64 = TICK_NSEC; + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + + rb = &task_thread_info(current)->restart_block; + if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP) + rb->fn = hrtimer_nanosleep_restart; +#ifdef CONFIG_COMPAT + else + rb->fn = compat_nanosleep_restart; +#endif + if (ctx->image_version >= CPT_VERSION_20) { + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else if (ctx->image_version >= CPT_VERSION_9) { + rb->arg0 = ti->cpt_restart.arg2; + rb->arg1 = ti->cpt_restart.arg3; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else { + rb->arg0 = ti->cpt_restart.arg1; + rb->arg1 = CLOCK_MONOTONIC; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } + } else if (ti->cpt_restart.fn == CPT_RBL_POLL) { + struct restart_block *rb; + ktime_t e; + struct timespec ts; + unsigned long timeout_jiffies; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = ktime_sub(e, timespec_to_ktime(ctx->delta_time)); + ts = ns_to_timespec(ktime_to_ns(e)); + timeout_jiffies = timespec_to_jiffies(&ts); + + rb = &task_thread_info(current)->restart_block; + rb->fn = do_restart_poll; + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = timeout_jiffies & 0xFFFFFFFF; + rb->arg3 = (u64)timeout_jiffies >> 32; + } else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = 
ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + + rb = &task_thread_info(current)->restart_block; + rb->fn = futex_wait_restart; + rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0; + rb->futex.val = ti->cpt_restart.arg1; + rb->futex.time = e.tv64; + rb->futex.flags = ti->cpt_restart.arg3; + } else + eprintk_ctx("unknown restart block\n"); + } + + if (thread_group_leader(current)) { + current->signal->it_real_incr.tv64 = 0; + if (ctx->image_version >= CPT_VERSION_9) { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); + } else { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); + } + current->signal->it_prof_incr = ti->cpt_it_prof_incr; + current->signal->it_virt_incr = ti->cpt_it_virt_incr; + current->signal->it_prof_expires = ti->cpt_it_prof_value; + current->signal->it_virt_expires = ti->cpt_it_virt_value; + } + + err = rst_clone_children(tobj, ctx); + if (err) { + eprintk_ctx("rst_clone_children\n"); + goto out; + } + + if (exiting) + current->signal->flags |= SIGNAL_GROUP_EXIT; + + if (ti->cpt_pid == 1) { + if ((err = rst_process_linkage(ctx)) != 0) { + eprintk_ctx("rst_process_linkage: %d\n", err); + goto out; + } + if ((err = rst_do_filejobs(ctx)) != 0) { + eprintk_ctx("rst_do_filejobs: %d\n", err); + goto out; + } + if ((err = rst_eventpoll(ctx)) != 0) { + eprintk_ctx("rst_eventpoll: %d\n", err); + goto out; + } +#ifdef CONFIG_INOTIFY_USER + if ((err = rst_inotify(ctx)) != 0) { + eprintk_ctx("rst_inotify: %d\n", err); + goto out; + } +#endif + if ((err = rst_sockets_complete(ctx)) != 0) { + eprintk_ctx("rst_sockets_complete: %d\n", err); + goto out; + } + if ((err = rst_stray_files(ctx)) != 0) { + eprintk_ctx("rst_stray_files: %d\n", err); + goto out; + } + if ((err = rst_posix_locks(ctx)) != 0) { + eprintk_ctx("rst_posix_locks: %d\n", err); + goto out; + } + if ((err = rst_tty_jobcontrol(ctx)) != 0) { + eprintk_ctx("rst_tty_jobcontrol: %d\n", err); + goto out; + } + if ((err = rst_restore_fs(ctx)) != 0) { + eprintk_ctx("rst_restore_fs: %d\n", err); + goto out; + } + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) { + err = -ECHRNG; + eprintk_ctx("scp_restore failed\n"); + goto out; + } + if (ctx->last_vpid) + get_exec_env()->ve_ns->pid_ns->last_pid = + ctx->last_vpid; + } + +out: + thr_ctx->error = err; + complete(&thr_ctx->task_done); + + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + current->flags |= PF_EXIT_RESTART; + do_exit(ti->cpt_exit_code); + } else { + __set_current_state(TASK_UNINTERRUPTIBLE); + } + + schedule(); + + dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm); + + module_put(THIS_MODULE); + complete_and_exit(NULL, 0); + return 0; +} + +#if 0 +static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = task_bc(current); + + put_beancounter(tbc->fork_sub); + tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); + if (ti->cpt_mm_ub != CPT_NULL) { + put_beancounter(tbc->exec_ub); + tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); + } +} +#endif + +static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, + struct thr_context *thr_ctx) +{ + struct task_struct *tsk; + int pid; + + thr_ctx->ctx = ctx; + thr_ctx->error = 0; + init_completion(&thr_ctx->init_complete); + init_completion(&thr_ctx->task_done); +#if 0 + set_task_ubs(obj->o_image, ctx); +#endif + + pid 
= local_kernel_thread(hook, thr_ctx, 0, 0); + if (pid < 0) + return pid; + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(obj, tsk, ctx); + thr_ctx->tobj = obj; + return 0; +} + +static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); + rst_mm_basic(obj, ti, ctx); + return 0; +} + +static int make_baby(cpt_object_t *cobj, + struct cpt_task_image *pi, + struct cpt_context *ctx) +{ + unsigned long flags; + struct cpt_task_image *ci = cobj->o_image; + struct thr_context thr_ctx; + struct task_struct *tsk; + pid_t pid; + struct fs_struct *tfs = NULL; + + flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) + | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); + if (ci->cpt_rppid != pi->cpt_pid) { + flags |= CLONE_THREAD|CLONE_PARENT; + if (ci->cpt_signal != pi->cpt_signal || + !(flags&CLONE_SIGHAND) || + (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { + eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", + (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, + (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags + ); + return -EINVAL; + } + } + + thr_ctx.ctx = ctx; + thr_ctx.error = 0; + init_completion(&thr_ctx.init_complete); + init_completion(&thr_ctx.task_done); + thr_ctx.tobj = cobj; + +#if 0 + set_task_ubs(ci, ctx); +#endif + + if (current->fs == NULL) { + tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs; + if (tfs == NULL) + return -EINVAL; + atomic_inc(&tfs->count); + current->fs = tfs; + } + pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); + if (tfs) { + current->fs = NULL; + atomic_dec(&tfs->count); + } + if (pid < 0) + return pid; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(cobj, tsk, ctx); + thr_ctx.tobj = cobj; + wait_for_completion(&thr_ctx.init_complete); + wait_task_inactive(cobj->o_obj, 0); + rst_basic_init_task(cobj, ctx); + + /* clone() increases group_stop_count if it was not zero and + * CLONE_THREAD was asked. Undo. 
+ */ + if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { + if (tsk->signal != current->signal) BUG(); + current->signal->group_stop_count--; + } + + wake_up_process(tsk); + wait_for_completion(&thr_ctx.task_done); + wait_task_inactive(tsk, 0); + + return thr_ctx.error; +} + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_task_image *ti = obj->o_image; + cpt_object_t *cobj; + + for_each_object(cobj, CPT_OBJ_TASK) { + struct cpt_task_image *ci = cobj->o_image; + if (cobj == obj) + continue; + if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || + (ci->cpt_leader == ti->cpt_pid && + ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { + err = make_baby(cobj, ti, ctx); + if (err) { + eprintk_ctx("make_baby: %d\n", err); + return err; + } + } + } + return 0; +} + +static int read_task_images(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + + err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + cpt_object_t *obj; + struct cpt_task_image *ti = cpt_get_buf(ctx); + + err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } +#if 0 + if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { + eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); + if (obj->o_image == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + memcpy(obj->o_image, ti, sizeof(*ti)); + err = ctx->pread(obj->o_image + sizeof(*ti), + ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); + cpt_release_buf(ctx); + if (err) + return err; + start += ti->cpt_next; + } + return 0; +} + + +static int vps_rst_restore_tree(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct thr_context thr_ctx_root; + + err = read_task_images(ctx); + if (err) + return err; + + err = rst_undump_ubc(ctx); + if (err) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL) + return -ECHRNG; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); + if (err) + return err; +#endif + for_each_object(obj, CPT_OBJ_TASK) { + err = create_root_task(obj, ctx, &thr_ctx_root); + if (err) + return err; + + wait_for_completion(&thr_ctx_root.init_complete); + wait_task_inactive(obj->o_obj, 0); + rst_basic_init_task(obj, ctx); + + wake_up_process(obj->o_obj); + wait_for_completion(&thr_ctx_root.task_done); + wait_task_inactive(obj->o_obj, 0); + err = thr_ctx_root.error; + if (err) + return err; + break; + } + + return err; +} + +#ifndef CONFIG_IA64 +int rst_read_vdso(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + struct cpt_page_block *pgb; + + ctx->vdso = NULL; + err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end); + if (err) + return err; + if (start == CPT_NULL) + return 0; + if (end < start + sizeof(*pgb) + PAGE_SIZE) + return -EINVAL; + + pgb = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx); + if (err) { + goto err_buf; + } + ctx->vdso = (char*)__get_free_page(GFP_KERNEL); + if (ctx->vdso == NULL) { + err = -ENOMEM; + goto err_buf; + } + err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb)); + if (err) + goto err_page; + if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) { + free_page((unsigned 
long)ctx->vdso); + ctx->vdso = NULL; + } + + cpt_release_buf(ctx); + return 0; +err_page: + free_page((unsigned long)ctx->vdso); + ctx->vdso = NULL; +err_buf: + cpt_release_buf(ctx); + return err; +} +#endif + +int vps_rst_undump(struct cpt_context *ctx) +{ + int err; + unsigned long umask; + + err = rst_open_dumpfile(ctx); + if (err) + return err; + + if (ctx->tasks64) { +#if defined(CONFIG_IA64) + if (ctx->image_arch != CPT_OS_ARCH_IA64) +#elif defined(CONFIG_X86_64) + if (ctx->image_arch != CPT_OS_ARCH_EMT64) +#else + if (1) +#endif + { + eprintk_ctx("Cannot restore 64 bit container on this architecture\n"); + return -EINVAL; + } + } + + umask = current->fs->umask; + current->fs->umask = 0; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); +#endif +#ifndef CONFIG_IA64 + if (err == 0) + err = rst_read_vdso(ctx); +#endif + if (err == 0) + err = vps_rst_restore_tree(ctx); + + if (err == 0) + err = rst_restore_process(ctx); + + if (err) + virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTFAIL, ctx); + + current->fs->umask = umask; + + return err; +} + +static int rst_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int recalc_sigpending_tsk(struct task_struct *t); + +int rst_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + +#ifdef CONFIG_BEANCOUNTERS + bc = get_beancounter_byuid(ctx->ve_id, 0); + BUG_ON(!bc); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE); + put_beancounter(bc); +#endif + + rst_resume_network(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (!tsk) + continue; + + if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { + dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + + /* Weird... If a signal is sent to stopped task, + * nobody makes recalc_sigpending(). We have to do + * this by hands after wake_up_process(). + * if we did this before a signal could arrive before + * wake_up_process() and stall. 
+ */ + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_pending(tsk)) + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } else { + if (ti->cpt_state == TASK_STOPPED || + ti->cpt_state == TASK_TRACED) { + set_task_state(tsk, ti->cpt_state); + } + } + put_task_struct(tsk); + } + + rst_unlock_ve(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 0); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +int rst_kill(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk == NULL) + continue; + + if (tsk->exit_state == 0) { + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } + + put_task_struct(tsk); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 1); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +static int rst_utsname(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + int i; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + i = 0; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int len; + char *ptr; + err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); + if (err) + return err; + len = o.cpt_next - o.cpt_hdrlen; + if (len > __NEW_UTS_LEN + 1) + return -ENAMETOOLONG; + switch (i) { + case 0: + ptr = ns->name.nodename; break; + case 1: + ptr = ns->name.domainname; break; + default: + return -EINVAL; + } + err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); + if (err) + return err; + i++; + sec += o.cpt_next; + } + + return 0; +} diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e985..f489d1c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -176,7 +176,7 @@ static inline void check_for_tasks(int cpu) struct task_struct *p; write_lock_irq(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (task_cpu(p) == cpu && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) diff --git a/kernel/exit.c b/kernel/exit.c index 85a83c8..0760834 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -48,12 +51,15 @@ #include #include +#include +#include + #include #include #include #include -static void exit_mm(struct task_struct * tsk); +void exit_mm(struct task_struct * tsk); static inline int task_detached(struct task_struct *p) { @@ -69,6 +75,9 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_VE + list_del_rcu(&p->ve_task_info.vetask_list); +#endif __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); @@ -164,6 +173,8 @@ repeat: 
write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); + nr_zombie--; + atomic_inc(&nr_dead); /* * If we are the last non-leader member of the thread @@ -192,9 +203,12 @@ repeat: if (zap_leader) leader->exit_state = EXIT_DEAD; } + put_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); release_thread(p); + ub_task_uncharge(p); + pput_ve(p->ve_task_info.owner_env); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; @@ -523,6 +537,7 @@ void put_files_struct(struct files_struct *files) free_fdtable(fdt); } } +EXPORT_SYMBOL_GPL(put_files_struct); void reset_files_struct(struct files_struct *files) { @@ -666,7 +681,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -674,6 +689,10 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + mm->oom_killed = 1; + /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state @@ -718,6 +737,7 @@ static void exit_mm(struct task_struct * tsk) mm_update_next_owner(mm); mmput(mm); } +EXPORT_SYMBOL_GPL(exit_mm); /* * Return nonzero if @parent's children should reap themselves. @@ -845,7 +865,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) struct task_struct *thread; thread = father; - while_each_thread(father, thread) { + while_each_thread_ve(father, thread) { if (thread->flags & PF_EXITING) continue; if (unlikely(pid_ns->child_reaper == father)) @@ -942,11 +962,16 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; + if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper) + /* We dont want people slaying init. */ + tsk->exit_signal = SIGCHLD; + signal = tracehook_notify_death(tsk, &cookie, group_dead); if (signal >= 0) signal = do_notify_parent(tsk, signal); tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + nr_zombie++; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -1006,6 +1031,7 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); tracehook_report_exit(&code); + (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); /* * We're taking recursive faults here in do_exit. 
Safest is to just @@ -1055,12 +1081,14 @@ NORET_TYPE void do_exit(long code) } acct_collect(code, group_dead); #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); + if (!(tsk->flags & PF_EXIT_RESTART)) { + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); #endif + } #endif if (group_dead) tty_audit_exit(); @@ -1089,8 +1117,16 @@ NORET_TYPE void do_exit(long code) if (tsk->binfmt) module_put(tsk->binfmt->module); - proc_exit_connector(tsk); - exit_notify(tsk, group_dead); + if (!(tsk->flags & PF_EXIT_RESTART)) { + proc_exit_connector(tsk); + exit_notify(tsk, group_dead); + } else { + write_lock_irq(&tasklist_lock); + tsk->exit_state = EXIT_ZOMBIE; + nr_zombie++; + write_unlock_irq(&tasklist_lock); + exit_task_namespaces(tsk); + } #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; @@ -1821,6 +1857,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } +EXPORT_SYMBOL_GPL(sys_wait4); #ifdef __ARCH_WANT_SYS_WAITPID diff --git a/kernel/fairsched.c b/kernel/fairsched.c new file mode 100644 index 0000000..bfa5c33 --- /dev/null +++ b/kernel/fairsched.c @@ -0,0 +1,633 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include + +struct fairsched_node fairsched_init_node = { + .id = FAIRSCHED_INIT_NODE_ID, + .tg = &init_task_group, +#ifdef CONFIG_VE + .owner_env = get_ve0(), +#endif + .weight = 1, +}; + +static DEFINE_MUTEX(fairsched_mutex); + +/* list protected with fairsched_mutex */ +static LIST_HEAD(fairsched_node_head); +static int fairsched_nr_nodes; + +void __init fairsched_init_early(void) +{ + list_add(&fairsched_init_node.nodelist, &fairsched_node_head); + fairsched_nr_nodes++; +} + +#define FSCHWEIGHT_BASE 512000 + +/****************************************************************************** + * cfs group shares = FSCHWEIGHT_BASE / fairsched weight + * + * vzctl cpuunits default 1000 + * cfs shares default value is 1024 (see init_task_group_load in sched.c) + * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024 + * ^--- from vzctl + * weight in 1..65535 --> shares in 7..512000 + * shares should be >1 (see comment in sched_group_set_shares function) + *****************************************************************************/ + +static struct fairsched_node *fairsched_find(unsigned int id) +{ + struct fairsched_node *p; + list_for_each_entry(p, &fairsched_node_head, nodelist) { + if (p->id == id) + return p; + } + return NULL; +} + +/****************************************************************************** + * System calls + * + * All do_xxx functions are called under fairsched mutex and after + * capability check. + * + * The binary interfaces follow some other Fair Scheduler implementations + * (although some system call arguments are not needed for our implementation). 
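Editorial aside: the cpuunits -> weight -> shares arithmetic spelled out in the fairsched comment block above can be sanity-checked with a small stand-alone C program. FSCHWEIGHT_BASE and the 500000 divisor are taken from this file; the FSCHWEIGHT_MAX value of 65535 is only assumed from the stated "weight in 1..65535" range, and the program is an illustration, not part of the patch.

#include <stdio.h>

#define FSCHWEIGHT_BASE 512000          /* same constant as defined in kernel/fairsched.c above */
#define FSCHWEIGHT_MAX  65535           /* assumed from the "weight in 1..65535" note */

/* vzctl-style conversion: cpuunits -> fairsched weight ("weight = 500000 / cpuunits") */
static unsigned int cpuunits_to_weight(unsigned int cpuunits)
{
	unsigned int w = 500000 / cpuunits;

	if (w < 1)
		w = 1;
	if (w > FSCHWEIGHT_MAX)
		w = FSCHWEIGHT_MAX;
	return w;
}

int main(void)
{
	unsigned int units[] = { 250, 1000, 4000, 500000 };
	unsigned int i;

	for (i = 0; i < sizeof(units) / sizeof(units[0]); i++) {
		unsigned int w = cpuunits_to_weight(units[i]);
		unsigned int shares = FSCHWEIGHT_BASE / w;	/* as in do_fairsched_mknod()/do_fairsched_chwt() */

		printf("cpuunits=%6u  weight=%6u  shares=%6u\n", units[i], w, shares);
	}
	return 0;	/* cpuunits=1000 -> weight=500, shares=1024, matching the comment above */
}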
+ *****************************************************************************/ + +static int do_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + goto out; + if (newid < 0 || newid > INT_MAX) + goto out; + + retval = -EBUSY; + if (fairsched_find(newid) != NULL) + goto out; + + retval = -ENOMEM; + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node == NULL) + goto out; + + node->tg = sched_create_group(&init_task_group); + if (IS_ERR(node->tg)) + goto out_free; + + node->id = newid; + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); +#ifdef CONFIG_VE + node->owner_env = get_exec_env(); +#endif + list_add(&node->nodelist, &fairsched_node_head); + fairsched_nr_nodes++; + + retval = newid; +out: + return retval; + +out_free: + kfree(node); + return retval; +} + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mknod(parent, weight, newid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mknod); + +static int do_fairsched_rmnod(unsigned int id) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + node = fairsched_find(id); + if (node == NULL) + goto out; + if (node == &fairsched_init_node) + goto out; + + retval = -EBUSY; + if (node->refcnt) + goto out; + + list_del(&node->nodelist); + fairsched_nr_nodes--; + + sched_destroy_group(node->tg); + kfree(node); + retval = 0; +out: + return retval; +} + +asmlinkage int sys_fairsched_rmnod(unsigned int id) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rmnod(id); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_rmnod); + +static int do_fairsched_chwt(unsigned int id, unsigned weight) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); + + return 0; +} + +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_chwt(id, weight); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + return 0; +} + +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_vcpus(id, vcpus); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_vcpus); + +static int do_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + struct fairsched_node *node; + int retval; + + if (id == 0) + return -EINVAL; + if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31))) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + retval = -EINVAL; + switch (op) { + case 
FAIRSCHED_SET_RATE: + node->rate = rate; + node->rate_limited = 1; + retval = rate; + break; + case FAIRSCHED_DROP_RATE: + node->rate = 0; + node->rate_limited = 0; + retval = 0; + break; + case FAIRSCHED_GET_RATE: + if (node->rate_limited) + retval = node->rate; + else + retval = -ENODATA; + break; + } + return retval; +} + +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rate(id, op, rate); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + struct task_struct *p; + struct fairsched_node *node; + int retval; + + retval = -ENOENT; + node = fairsched_find(nodeid); + if (node == NULL) + goto out; + + write_lock_irq(&tasklist_lock); + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (p == NULL) + goto out_unlock; + + get_task_struct(p); + put_task_fairsched_node(p); + p->fsched_node = node; + get_task_fairsched_node(p); + write_unlock_irq(&tasklist_lock); + + smp_wmb(); + sched_move_task(p); + put_task_struct(p); + return 0; + +out_unlock: + write_unlock_irq(&tasklist_lock); +out: + return retval; +} + +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mvpr(pid, nodeid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mvpr); + +#ifdef CONFIG_PROC_FS + +/*********************************************************************/ +/* + * proc interface + */ +/*********************************************************************/ + +#include +#include +#include + +struct fairsched_node_dump { + int id; + unsigned weight; + unsigned rate; + int rate_limited; + int nr_pcpu; + int nr_tasks, nr_runtasks; +}; + +struct fairsched_dump { + int len; + struct fairsched_node_dump nodes[0]; +}; + +static struct fairsched_dump *fairsched_do_dump(int compat) +{ + int nr_nodes; + int len; + struct fairsched_dump *dump; + struct fairsched_node *node; + struct fairsched_node_dump *p; + + mutex_lock(&fairsched_mutex); + nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1); + len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); + dump = ub_vmalloc(len); + if (dump == NULL) + goto out; + + p = dump->nodes; + list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { + if ((char *)p - (char *)dump >= len) + break; + p->nr_tasks = 0; + p->nr_runtasks = 0; +#ifdef CONFIG_VE + if (!ve_accessible(node->owner_env, get_exec_env())) + continue; + p->nr_tasks = atomic_read(&node->owner_env->pcounter); + p->nr_runtasks = nr_running_ve(node->owner_env); +#endif + p->id = node->id; + p->weight = node->weight; + p->rate = node->rate; + p->rate_limited = node->rate_limited; + p->nr_pcpu = num_online_cpus(); + p++; + } + dump->len = p - dump->nodes; +out: + mutex_unlock(&fairsched_mutex); + return dump; +} + +#define FAIRSCHED_PROC_HEADLINES 2 + +#define FAIRSHED_DEBUG " debug" + +#ifdef CONFIG_VE +/* + * File format is dictated by compatibility reasons. 
+ */ +static int fairsched_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + unsigned vid, nid, pid, r; + + dump = m->private; + p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.6 debug\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " veid " + " id " + " parent " + "weight " + " rate " + "tasks " + " run " + "cpus" + " " + "flg " + "ready " + " start_tag " + " value " + " delay" + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + vid = nid = pid = 0; + r = (unsigned long)v & 3; + if (p == dump->nodes) { + if (r == 2) + nid = p->id; + } else { + if (!r) + nid = p->id; + else if (r == 1) + vid = pid = p->id; + else + vid = p->id, nid = 1; + } + seq_printf(m, + "%10u " + "%10u %10u %6u %5u %5u %5u %4u" + " " + " %c%c %5u %20Lu %20Lu %20Lu" + "\n", + vid, + nid, + pid, + p->weight, + p->rate, + p->nr_tasks, + p->nr_runtasks, + p->nr_pcpu, + p->rate_limited ? 'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll); + } + + return 0; +} + +static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + unsigned long l; + + dump = m->private; + if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) + return NULL; + if (*pos < FAIRSCHED_PROC_HEADLINES) + return dump->nodes + *pos; + /* guess why... */ + l = (unsigned long)(dump->nodes + + ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); + l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; + return (void *)l; +} +static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched_seq_start(m, pos); +} +#endif /* CONFIG_VE */ + +static int fairsched2_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + + dump = m->private; + p = v; + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " id " + "weight " + " rate " + " run " + "cpus" +#ifdef FAIRSHED_DEBUG + " " + "flg " + "ready " + " start_tag " + " value " + " delay" +#endif + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + seq_printf(m, + "%10u %6u %5u %5u %4u" +#ifdef FAIRSHED_DEBUG + " " + " %c%c %5u %20Lu %20Lu %20Lu" +#endif + "\n", + p->id, + p->weight, + p->rate, + p->nr_runtasks, + p->nr_pcpu +#ifdef FAIRSHED_DEBUG + , + p->rate_limited ? 
'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll +#endif + ); + } + + return 0; +} + +static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + + dump = m->private; + if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) + return NULL; + return dump->nodes + *pos; +} +static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched2_seq_start(m, pos); +} +static void fairsched2_seq_stop(struct seq_file *m, void *v) +{ +} + +#ifdef CONFIG_VE +static struct seq_operations fairsched_seq_op = { + .start = fairsched_seq_start, + .next = fairsched_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched_seq_show +}; +#endif +static struct seq_operations fairsched2_seq_op = { + .start = fairsched2_seq_start, + .next = fairsched2_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched2_seq_show +}; +static int fairsched_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + int compat; + +#ifdef CONFIG_VE + compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); + ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op); +#else + compat = 0; + ret = seq_open(file, &fairsched2_seq_op); +#endif + if (ret) + return ret; + m = file->private_data; + m->private = fairsched_do_dump(compat); + if (m->private == NULL) { + seq_release(inode, file); + ret = -ENOMEM; + } + return ret; +} +static int fairsched_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct fairsched_dump *dump; + + m = file->private_data; + dump = m->private; + m->private = NULL; + vfree(dump); + seq_release(inode, file); + return 0; +} +static struct file_operations proc_fairsched_operations = { + .open = fairsched_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fairsched_seq_release +}; + +void __init fairsched_init_late(void) +{ + proc_create("fairsched", S_IRUGO, &glob_proc_root, + &proc_fairsched_operations); + proc_create("fairsched2", S_IRUGO, &glob_proc_root, + &proc_fairsched_operations); +} + +#else + +void __init fairsched_init_late(void) { } + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe..fefefd3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #include @@ -66,17 +68,23 @@ #include #include +#include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. 
*/ +EXPORT_SYMBOL_GPL(nr_threads); int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); int nr_processes(void) { @@ -145,14 +153,20 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + ub_task_put(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); delayacct_tsk_free(tsk); +#ifdef CONFIG_VE + put_ve(VE_TASK_INFO(tsk)->owner_env); + atomic_dec(&nr_dead); +#endif if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround @@ -171,7 +185,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC|SLAB_UBC, NULL); #endif /* do the arch specific task caches init */ @@ -291,6 +305,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) continue; } charge = 0; + if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, + mpnt->vm_file, UB_HARD)) + goto fail_noch; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) @@ -345,7 +363,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -364,6 +382,9 @@ out: fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: + ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); +fail_noch: retval = -ENOMEM; vm_unacct_memory(charge); goto out; @@ -412,6 +433,15 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; mm_init_owner(mm, p); + /* + * This looks ugly, buy when we came from + * sys_execve -> mm_alloc -> here + * we need to get exec_ub, not task_ub. But when + * we're here like this + * sys_fork() -> dup_mm -> here + * we need task_ub, not the exec one... 
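Editorial aside on the set_mm_ub() comment above: set_mm_ub() itself is not part of this hunk, so the stand-alone sketch below only restates the rule the comment describes — which beancounter a new mm should be charged to depends on whether the mm is freshly allocated for execve (exec_ub) or duplicated for a forked child (task_ub). Every name in the sketch is a hypothetical illustration, not the real implementation.

#include <stdio.h>

/* Hypothetical stand-in for a user beancounter; illustration only. */
struct ub {
	const char *name;
};

static struct ub exec_ub = { "exec_ub (beancounter of the calling context)" };
static struct ub task_ub = { "task_ub (beancounter the new task was charged to)" };

enum mm_origin { MM_FOR_EXECVE, MM_FOR_FORK };

/* The selection rule the comment describes -- not the real set_mm_ub(). */
static struct ub *pick_mm_ub(enum mm_origin origin)
{
	return origin == MM_FOR_EXECVE ? &exec_ub : &task_ub;
}

int main(void)
{
	printf("sys_execve -> mm_alloc -> mm_init: charge mm to %s\n",
	       pick_mm_ub(MM_FOR_EXECVE)->name);
	printf("sys_fork   -> dup_mm   -> mm_init: charge mm to %s\n",
	       pick_mm_ub(MM_FOR_FORK)->name);
	return 0;
}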
xemul + */ + set_mm_ub(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -419,6 +449,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) return mm; } + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -437,6 +468,7 @@ struct mm_struct * mm_alloc(void) } return mm; } +EXPORT_SYMBOL_GPL(mm_alloc); /* * Called when the last reference to the mm @@ -449,6 +481,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); + put_mm_ub(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -470,6 +503,9 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm); + if (mm->oom_killed) + ub_oom_task_dead(current); mmdrop(mm); } } @@ -599,6 +635,7 @@ fail_nocontext: * because it calls destroy_context() */ mm_free_pgd(mm); + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -899,14 +936,20 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, + pid_t vpid, int trace) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; +#ifdef CONFIG_VE + if (clone_flags & CLONE_NAMESPACES_MASK) + return ERR_PTR(-EINVAL); +#else if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); +#endif /* * Thread groups must share signals as well, and detached threads @@ -934,6 +977,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); + if (ub_task_charge(current, p)) + goto bad_fork_charge; + #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1083,7 +1129,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(task_active_pid_ns(p), vpid); if (!pid) goto bad_fork_cleanup_io; @@ -1091,6 +1137,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = pid_ns_prepare_proc(task_active_pid_ns(p)); if (retval < 0) goto bad_fork_free_pid; + if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD) + task_active_pid_ns(p)->flags |= PID_NS_HIDDEN; } } @@ -1191,7 +1239,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * thread can't slip out of an OOM kill (or normal SIGKILL). 
*/ recalc_sigpending(); - if (signal_pending(current)) { + if (signal_pending(current) && !vpid) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; @@ -1233,14 +1281,24 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); +#ifdef CONFIG_VE + list_add_tail_rcu(&p->ve_task_info.vetask_list, + &p->ve_task_info.owner_env->vetask_lh); +#endif __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } + (void)get_ve(p->ve_task_info.owner_env); + pget_ve(p->ve_task_info.owner_env); +#ifdef CONFIG_VE + seqcount_init(&p->ve_task_info.wakeup_lock); +#endif total_forks++; spin_unlock(¤t->sighand->siglock); + get_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); @@ -1288,6 +1346,9 @@ bad_fork_cleanup_count: atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: + ub_task_uncharge(p); + ub_task_put(p); +bad_fork_charge: free_task(p); fork_out: return ERR_PTR(retval); @@ -1305,7 +1366,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); + &init_struct_pid, 0, 0); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1318,12 +1379,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long do_fork_pid(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + long vpid) { struct task_struct *p; int trace = 0; @@ -1346,6 +1408,10 @@ long do_fork(unsigned long clone_flags, } } + nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags); + if (nr) + return nr; + /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1353,7 +1419,7 @@ long do_fork(unsigned long clone_flags, trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, vpid, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1381,6 +1447,8 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; + (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p); + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. 
@@ -1404,6 +1472,8 @@ long do_fork(unsigned long clone_flags, } else { nr = PTR_ERR(p); } + + (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr); return nr; } @@ -1419,27 +1489,40 @@ static void sighand_ctor(void *data) init_waitqueue_head(&sighand->signalfd_wqh); } +EXPORT_SYMBOL(do_fork_pid); + +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return do_fork_pid(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr, 0); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|SLAB_UBC, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + SLAB_PANIC|SLAB_UBC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); } /* @@ -1577,6 +1660,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| CLONE_NEWNET)) goto bad_unshare_out; +#ifdef CONFIG_VE + if (unshare_flags & CLONE_NAMESPACES_MASK) + goto bad_unshare_out; +#endif /* * CLONE_NEWIPC must also detach from the undolist: after switching @@ -1595,9 +1682,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; +#ifndef CONFIG_VE if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; +#endif if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { @@ -1641,7 +1730,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_nsproxy) put_nsproxy(new_nsproxy); +#ifndef CONFIG_VE bad_unshare_cleanup_fd: +#endif if (new_fd) put_files_struct(new_fd); diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e..a02be16 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1198,8 +1198,6 @@ handle_fault: */ #define FLAGS_SHARED 1 -static long futex_wait_restart(struct restart_block *restart); - static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, u32 val, ktime_t *abs_time, u32 bitset) { @@ -1365,7 +1363,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, } -static long futex_wait_restart(struct restart_block *restart) +long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; struct rw_semaphore *fshared = NULL; @@ -1378,6 +1376,7 @@ static long futex_wait_restart(struct restart_block *restart) return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } +EXPORT_SYMBOL_GPL(futex_wait_restart); /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 
cdec83e..b1de384 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1523,6 +1523,7 @@ out: destroy_hrtimer_on_stack(&t.timer); return ret; } +EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart); long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a..7f97ec1 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -78,6 +78,10 @@ int request_module(const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* Don't allow request_module() inside VE. */ + if (!ve_is_super(get_exec_env())) + return -EPERM; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -452,6 +456,9 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + if (!ve_is_super(get_exec_env())) + return -EPERM; + helper_lock(); if (sub_info->path[0] == '\0') goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd..d4839ae 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -126,14 +126,14 @@ static int __kprobes check_safety(void) ret = freeze_processes(); if (ret == 0) { struct task_struct *p, *q; - do_each_thread(p, q) { + do_each_thread_all(p, q) { if (p != current && p->state == TASK_RUNNING && p->pid != 0) { printk("Check failed: %s is running\n",p->comm); ret = -1; goto loop_end; } - } while_each_thread(p, q); + } while_each_thread_all(p, q); } loop_end: thaw_processes(); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dbda475..055464e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3421,7 +3421,7 @@ retry: if (count != 10) printk(" locked it.\n"); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * It's not reliable to print a task's held locks * if it's not sleeping (or if it's not the current @@ -3434,7 +3434,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; - } while_each_thread(g, p); + } while_each_thread_all(g, p); printk("\n"); printk("=============================================\n\n"); diff --git a/kernel/module.c b/kernel/module.c index 9db1191..c3188b8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2531,6 +2531,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) static void *m_start(struct seq_file *m, loff_t *pos) { mutex_lock(&module_mutex); + if (!ve_is_super(get_exec_env())) + return NULL; return seq_list_start(&modules, *pos); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 1d3ef29..ef348c5 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,14 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + /* * creates a copy of "orig" with refcount 1. 
*/ @@ -133,10 +141,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) return 0; +#ifndef CONFIG_VE if (!capable(CAP_SYS_ADMIN)) { err = -EPERM; goto out; } +#endif /* * CLONE_NEWIPC must detach from the undolist: after switching @@ -162,6 +172,7 @@ out: put_nsproxy(old_ns); return err; } +EXPORT_SYMBOL(copy_namespaces); void free_nsproxy(struct nsproxy *ns) { @@ -178,6 +189,22 @@ void free_nsproxy(struct nsproxy *ns) put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } +EXPORT_SYMBOL(free_nsproxy); + +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk) +{ + struct mnt_namespace *mnt_ns = NULL; + + task_lock(tsk); + if (tsk->nsproxy) + mnt_ns = tsk->nsproxy->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); + task_unlock(tsk); + + return mnt_ns; +} +EXPORT_SYMBOL(get_task_mnt_ns); /* * Called from unshare. Unshare all the namespaces part of nsproxy. diff --git a/kernel/pid.c b/kernel/pid.c index 064e76a..6051d2f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -110,7 +111,7 @@ EXPORT_SYMBOL(is_container_init); * For now it is easier to be safe than to prove it can't happen. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); static void free_pidmap(struct upid *upid) { @@ -121,8 +122,9 @@ static void free_pidmap(struct upid *upid) clear_bit(offset, map->page); atomic_inc(&map->nr_free); } +EXPORT_SYMBOL_GPL(free_pidmap); -static int alloc_pidmap(struct pid_namespace *pid_ns) +int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; struct pidmap *map; @@ -182,6 +184,36 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } +int set_pidmap(struct pid_namespace *pid_ns, pid_t pid) +{ + int offset; + struct pidmap *map; + + offset = pid & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; + if (unlikely(!map->page)) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irq(&pidmap_lock); + if (map->page) + kfree(page); + else + map->page = page; + spin_unlock_irq(&pidmap_lock); + if (unlikely(!map->page)) + return -ENOMEM; + } + + if (test_and_set_bit(offset, map->page)) + return -EBUSY; + + atomic_dec(&map->nr_free); + return pid; +} + int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; @@ -227,25 +259,33 @@ void free_pid(struct pid *pid) /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; + struct upid *upid; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); - spin_unlock_irqrestore(&pidmap_lock, flags); + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + if (!hlist_unhashed(&upid->pid_chain)) + hlist_del_rcu(&upid->pid_chain); + } + spin_unlock(&pidmap_lock); + ub_kmemsize_uncharge(pid->ub, pid->numbers[pid->level].ns->pid_cachep->objuse); + local_irq_restore(flags); for (i = 0; i <= pid->level; i++) free_pidmap(pid->numbers + i); - + put_beancounter(pid->ub); call_rcu(&pid->rcu, delayed_put_pid); } +EXPORT_SYMBOL_GPL(free_pid); -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; + struct user_beancounter *ub; pid = 
kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) @@ -253,7 +293,10 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + if (vpid != 0 && i == ns->level) + nr = set_pidmap(tmp, vpid); + else + nr = alloc_pidmap(tmp); if (nr < 0) goto out_free; @@ -268,17 +311,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); +#ifdef CONFIG_BEANCOUNTERS + ub = get_exec_ub(); + local_irq_disable(); + if (ub_kmemsize_charge(ub, ns->pid_cachep->objuse, UB_HARD)) + goto out_enable; + pid->ub = get_beancounter(ub); + spin_lock(&pidmap_lock); +#else spin_lock_irq(&pidmap_lock); +#endif for (i = ns->level; i >= 0; i--) { upid = &pid->numbers[i]; hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + if (upid->ns->flags & PID_NS_HIDDEN) + while (i--) + INIT_HLIST_NODE(&pid->numbers[i].pid_chain); } spin_unlock_irq(&pidmap_lock); out: return pid; +out_enable: + local_irq_enable(); + put_pid_ns(ns); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -287,6 +345,7 @@ out_free: pid = NULL; goto out; } +EXPORT_SYMBOL_GPL(alloc_pid); struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { @@ -309,6 +368,45 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +void reattach_pid(struct task_struct *tsk, enum pid_type type, + struct pid *pid) +{ + int i; + struct pid *old_pid; + struct pid_link *link; + struct upid *upid; + + link = &tsk->pids[type]; + old_pid = link->pid; + + hlist_del_rcu(&link->node); + link->pid = pid; + hlist_add_head_rcu(&link->node, &pid->tasks[type]); + + if (type != PIDTYPE_PID) { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + return; + + for (i = 0; i < pid->level; i++) + hlist_del_rcu(&old_pid->numbers[i].pid_chain); + } else { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + BUG(); + + for (i = 0; i < pid->level; i++) + hlist_replace_rcu(&old_pid->numbers[i].pid_chain, + &pid->numbers[i].pid_chain); + + upid = &pid->numbers[pid->level]; + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + } + + call_rcu(&old_pid->rcu, delayed_put_pid); +} + /* * attach_pid() must be called with the tasklist_lock write-held. 
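 * The same rule applies to reattach_pid() above: callers must hold
 * tasklist_lock for writing, plus pidmap_lock, while the pid links are
 * rewritten (see __pid_ns_attach_task() in kernel/pid_namespace.c).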
*/ @@ -321,6 +419,7 @@ void attach_pid(struct task_struct *task, enum pid_type type, link->pid = pid; hlist_add_head_rcu(&link->node, &pid->tasks[type]); } +EXPORT_SYMBOL_GPL(attach_pid); static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) @@ -341,6 +440,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, free_pid(pid); } +EXPORT_SYMBOL_GPL(detach_pid); void detach_pid(struct task_struct *task, enum pid_type type) { @@ -431,6 +531,17 @@ struct pid *find_get_pid(pid_t nr) } EXPORT_SYMBOL_GPL(find_get_pid); +pid_t pid_to_vpid(pid_t nr) +{ + struct pid *pid; + + pid = find_pid_ns(nr, &init_pid_ns); + if (pid) + return pid->numbers[pid->level].nr; + return -1; +} +EXPORT_SYMBOL_GPL(pid_to_vpid); + pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fab8ea8..910d183 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,8 @@ #include #include +#include + #define BITS_PER_PAGE (PAGE_SIZE*8) struct pid_cache { @@ -148,6 +150,160 @@ void free_pid_ns(struct kref *kref) put_pid_ns(parent); } +/* + * this is a dirty ugly hack. + */ + +static int __pid_ns_attach_task(struct pid_namespace *ns, + struct task_struct *tsk, pid_t nr) +{ + struct pid *pid; + enum pid_type type; + unsigned long old_size, new_size; + + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); + if (!pid) + goto out; + + if (nr == 0) + nr = alloc_pidmap(ns); + else + nr = set_pidmap(ns, nr); + + if (nr < 0) + goto out_free; + + memcpy(pid, task_pid(tsk), + sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid)); + get_pid_ns(ns); + pid->level++; + BUG_ON(pid->level != ns->level); + pid->numbers[pid->level].nr = nr; + pid->numbers[pid->level].ns = ns; + atomic_set(&pid->count, 1); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + + old_size = pid->numbers[pid->level - 1].ns->pid_cachep->objuse; + new_size = pid->numbers[pid->level].ns->pid_cachep->objuse; + local_irq_disable(); + /* + * Depending on sizeof(struct foo), cache flags (redzoning, etc) + * and actual CPU (cacheline_size() jump from 64 to 128 bytes after + * CPU detection) new size can very well be smaller than old size. 
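+ * Handle both directions below: charge the beancounter for the difference
+ * when the new pid_cachep object is bigger, uncharge it when it is smaller.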
+ */
+	if (new_size > old_size) {
+		if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0)
+			goto out_enable;
+	} else
+		ub_kmemsize_uncharge(pid->ub, old_size - new_size);
+
+	write_lock(&tasklist_lock);
+
+	spin_lock(&pidmap_lock);
+	reattach_pid(tsk, PIDTYPE_SID, pid);
+	set_task_session(tsk, pid_nr(pid));
+	reattach_pid(tsk, PIDTYPE_PGID, pid);
+	tsk->signal->__pgrp = pid_nr(pid);
+	current->signal->tty_old_pgrp = NULL;
+
+	reattach_pid(tsk, PIDTYPE_PID, pid);
+	spin_unlock(&pidmap_lock);
+
+	write_unlock_irq(&tasklist_lock);
+
+	return 0;
+
+out_enable:
+	local_irq_enable();
+	put_pid_ns(ns);
+out_free:
+	kmem_cache_free(ns->pid_cachep, pid);
+out:
+	return -ENOMEM;
+}
+
+int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	return __pid_ns_attach_task(ns, tsk, 0);
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_task);
+
+int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	int err;
+
+	err = __pid_ns_attach_task(ns, tsk, 1);
+	if (err < 0)
+		return err;
+
+	ns->child_reaper = tsk;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_init);
+
+#ifdef CONFIG_VE
+static noinline void show_lost_task(struct task_struct *p)
+{
+	char buf[512] = "N/A";
+#ifdef CONFIG_PROC_FS
+	extern char * task_sig(struct task_struct *p, char *buffer);
+
+	task_sig(p, buf);
+#endif
+	printk("Lost task: %d/%s/%p\nSignals:%s\n", p->pid, p->comm, p, buf);
+}
+
+static void zap_ve_processes(struct ve_struct *env)
+{
+	/*
+	 * Here the VE changes its state into "not running".
+	 * op_sem taken for write is a barrier to all VE manipulations from
+	 * ioctl: it waits for operations currently in progress and blocks all
+	 * subsequent operations until is_running is set to 0 and op_sem is
+	 * released.
+	 */
+	down_write(&env->op_sem);
+	env->is_running = 0;
+	up_write(&env->op_sem);
+
+	/* wait for all of init's children to exit */
+	while (atomic_read(&env->pcounter) > 1) {
+		struct task_struct *g, *p;
+		long delay = 1;
+
+		if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0)
+			continue;
+		/* it was -ECHILD or no more children somehow */
+		if (atomic_read(&env->pcounter) == 1)
+			break;
+
+		/* clear all signals to avoid wakeups */
+		if (signal_pending(current))
+			flush_signals(current);
+		/* we have a child that has not been signalled yet */
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(delay);
+		delay = (delay < HZ) ? (delay << 1) : HZ;
+		read_lock(&tasklist_lock);
+		do_each_thread_ve(g, p) {
+			if (p != current) {
+				/*
+				 * By that time no processes other than those
+				 * that entered the VE may exist in it. If some
+				 * were missed by zap_pid_ns_processes(), that
+				 * is a BUG.
+				 */
+				if (!p->did_ve_enter)
+					show_lost_task(p);
+
+				force_sig_specific(SIGKILL, p);
+			}
+		} while_each_thread_ve(g, p);
+		read_unlock(&tasklist_lock);
+	}
+}
+#endif
+
 void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
 	int nr;
@@ -180,6 +336,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	} while (rc != -ECHILD);
 
 	acct_exit_ns(pid_ns);
+
+#ifdef CONFIG_VE
+	zap_ve_processes(get_exec_env());
+#endif
 	return;
 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5131e54..73fbe29 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,6 +31,8 @@
 * POSIX clocks & timers
 */
 #include
+#include
+#include
 #include
 #include
 #include
@@ -46,6 +48,9 @@
 #include
 #include
 #include
+#include
+
+#include
 /*
 * Management arrays for POSIX timers.
Timers are kept in slab memory @@ -240,8 +245,8 @@ static __init int init_posix_timers(void) register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof (struct k_itimer), 0, + SLAB_PANIC|SLAB_UBC, NULL); idr_init(&posix_timers_id); return 0; } @@ -298,6 +303,13 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int ret; + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(timr->it_process->ve_task_info.owner_env); + ub = set_exec_ub(timr->it_process->task_bc.task_ub); + /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -318,10 +330,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) if (timr->it_sigev_notify & SIGEV_THREAD_ID) { struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); + ret = send_sigqueue(timr->sigq, timr->it_process, 0); if (likely(ret >= 0)) - return ret; + goto out; timr->it_sigev_notify = SIGEV_SIGNAL; leader = timr->it_process->group_leader; @@ -329,7 +341,11 @@ int posix_timer_event(struct k_itimer *timr, int si_private) timr->it_process = leader; } - return send_sigqueue(timr->sigq, timr->it_process, 1); + ret = send_sigqueue(timr->sigq, timr->it_process, 1); +out: + (void)set_exec_ub(ub); + (void)set_exec_env(ve); + return ret; } EXPORT_SYMBOL_GPL(posix_timer_event); diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946a..e60acd5 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -14,6 +14,8 @@ #include #include +static atomic_t global_suspend = ATOMIC_INIT(0); + /* * Timeout for stopping processes */ @@ -23,7 +25,9 @@ static inline int freezeable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) + (p->exit_state != 0) || + (p->state == TASK_STOPPED) || + (p->state == TASK_TRACED)) return 0; return 1; } @@ -47,6 +51,28 @@ void refrigerator(void) processes around? */ long save; +#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) + save = current->state; + current->state = TASK_UNINTERRUPTIBLE; + + spin_lock_irq(¤t->sighand->siglock); + if (test_and_clear_thread_flag(TIF_FREEZE)) { + recalc_sigpending(); /* We sent fake signal, clean it up */ + if (atomic_read(&global_suspend) || + atomic_read(&get_exec_env()->suspend)) + current->flags |= PF_FROZEN; + else + current->state = save; + } else { + /* Freeze request could be canceled before we entered + * refrigerator(). In this case we do nothing. 
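+		 * Only the saved task state is restored below.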
*/ + current->state = save; + } + spin_unlock_irq(¤t->sighand->siglock); + + while (current->flags & PF_FROZEN) + schedule(); +#else task_lock(current); if (freezing(current)) { frozen_process(); @@ -68,6 +94,7 @@ void refrigerator(void) break; schedule(); } +#endif pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } @@ -158,7 +185,7 @@ static int try_to_freeze_tasks(bool sig_only) do { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (frozen(p) || !freezeable(p)) continue; @@ -174,7 +201,7 @@ static int try_to_freeze_tasks(bool sig_only) if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ if (time_after(jiffies, end_time)) @@ -198,13 +225,13 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs / 100, elapsed_csecs % 100, todo); show_state(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, @@ -221,6 +248,7 @@ int freeze_processes(void) { int error; + atomic_inc(&global_suspend); printk("Freezing user space processes ... "); error = try_to_freeze_tasks(true); if (error) @@ -235,6 +263,7 @@ int freeze_processes(void) Exit: BUG_ON(in_atomic()); printk("\n"); + atomic_dec(&global_suspend); return error; } @@ -243,15 +272,17 @@ static void thaw_tasks(bool nosig_only) struct task_struct *g, *p; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!freezeable(p)) continue; if (nosig_only && should_send_signal(p)) continue; - thaw_process(p); - } while_each_thread(g, p); + if (!thaw_process(p)) + printk(KERN_WARNING " Strange, %s not stopped\n", + p->comm ); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff --git a/kernel/printk.c b/kernel/printk.c index b51b156..d95b686 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,7 +31,9 @@ #include #include #include +#include #include +#include #include @@ -92,7 +94,7 @@ static int console_locked, console_suspended; * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +DEFINE_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -128,6 +130,7 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +int console_silence_loglevel; #ifdef CONFIG_PRINTK @@ -136,6 +139,19 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +static int __init setup_console_silencelevel(char *str) +{ + int level; + + if (get_option(&str, &level) != 1) + return 0; + + console_silence_loglevel = level; + return 1; +} + +__setup("silencelevel=", setup_console_silencelevel); + static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); @@ -293,6 +309,9 @@ int do_syslog(int type, char __user *buf, int len) char c; int error = 0; + if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7)) + goto out; + error = security_syslog(type); if (error) return error; @@ -313,15 +332,15 @@ int do_syslog(int type, char __user *buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); + error = wait_event_interruptible(ve_log_wait, + (ve_log_start - ve_log_end)); if (error) goto out; i = 0; spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; + while (!error && (ve_log_start != ve_log_end) && i < len) { + c = VE_LOG_BUF(ve_log_start); + ve_log_start++; spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; @@ -347,15 +366,17 @@ int do_syslog(int type, char __user *buf, int len) error = -EFAULT; goto out; } + if (ve_log_buf == NULL) + goto out; count = len; - if (count > log_buf_len) - count = log_buf_len; + if (count > ve_log_buf_len) + count = ve_log_buf_len; spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; + if (count > ve_logged_chars) + count = ve_logged_chars; if (do_clear) - logged_chars = 0; - limit = log_end; + ve_logged_chars = 0; + limit = ve_log_end; /* * __put_user() could sleep, and while we sleep * printk() could overwrite the messages @@ -364,9 +385,9 @@ int do_syslog(int type, char __user *buf, int len) */ for (i = 0; i < count && !error; i++) { j = limit-1-i; - if (j + log_buf_len < log_end) + if (j + ve_log_buf_len < ve_log_end) break; - c = LOG_BUF(j); + c = VE_LOG_BUF(j); spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); @@ -390,7 +411,7 @@ int do_syslog(int type, char __user *buf, int len) } break; case 5: /* Clear ring buffer */ - logged_chars = 0; + ve_logged_chars = 0; break; case 6: /* Disable logging to console */ console_loglevel = minimum_console_loglevel; @@ -402,16 +423,19 @@ int do_syslog(int type, char __user *buf, int len) error = -EINVAL; if (len < 1 || len > 8) goto out; + error = 0; + /* VE has no console, so return success */ + if (!ve_is_super(get_exec_env())) + goto out; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; - error = 0; break; case 9: /* Number of chars in the log buffer */ - error = log_end - log_start; + error = ve_log_end - ve_log_start; break; case 10: /* Size of the log buffer */ - error = log_buf_len; + error = ve_log_buf_len; break; default: error = -EINVAL; @@ -522,14 +546,14 @@ static void call_console_drivers(unsigned start, unsigned end) static void emit_log_char(char c) { - 
LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + VE_LOG_BUF(ve_log_end) = c; + ve_log_end++; + if (ve_log_end - ve_log_start > ve_log_buf_len) + ve_log_start = ve_log_end - ve_log_buf_len; + if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) + con_start = ve_log_end - ve_log_buf_len; + if (ve_logged_chars < ve_log_buf_len) + ve_logged_chars++; } /* @@ -595,6 +619,30 @@ static int have_callable_console(void) * printf(3) */ +static inline int ve_log_init(void) +{ +#ifdef CONFIG_VE + if (ve_log_buf != NULL) + return 0; + + if (ve_is_super(get_exec_env())) { + ve0._log_wait = &log_wait; + ve0._log_start = &log_start; + ve0._log_end = &log_end; + ve0._logged_chars = &logged_chars; + ve0.log_buf = log_buf; + return 0; + } + + ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); + if (!ve_log_buf) + return -ENOMEM; + + memset(ve_log_buf, 0, ve_log_buf_len); +#endif + return 0; +} + asmlinkage int printk(const char *fmt, ...) { va_list args; @@ -662,13 +710,14 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int __vprintk(const char *fmt, va_list args) { int printed_len = 0; int current_log_level = default_message_loglevel; unsigned long flags; int this_cpu; char *p; + int err, need_wake; boot_delay_msec(); @@ -699,6 +748,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) spin_lock(&logbuf_lock); printk_cpu = this_cpu; + err = ve_log_init(); + if (err) { + spin_unlock_irqrestore(&logbuf_lock, flags); + return err; + } + if (recursion_bug) { recursion_bug = 0; strcpy(printk_buf, recursion_bug_msg); @@ -767,7 +822,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. */ - if (acquire_console_semaphore_for_printk(this_cpu)) + if (!ve_is_super(get_exec_env())) { + need_wake = (ve_log_start != ve_log_end); + spin_unlock_irqrestore(&logbuf_lock, flags); + if (!oops_in_progress && need_wake) + wake_up_interruptible(&ve_log_wait); + } else if (acquire_console_semaphore_for_printk(this_cpu)) release_console_sem(); lockdep_on(); @@ -780,6 +840,41 @@ out_restore_irqs: EXPORT_SYMBOL(printk); EXPORT_SYMBOL(vprintk); +asmlinkage int vprintk(const char *fmt, va_list args) +{ + int i; + struct ve_struct *env; + + env = set_exec_env(get_ve0()); + i = __vprintk(fmt, args); + (void)set_exec_env(env); + return i; +} + +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) +{ + int printed_len; + + printed_len = 0; + if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) + printed_len = vprintk(fmt, args); + if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) + printed_len = __vprintk(fmt, args); + return printed_len; +} + +asmlinkage int ve_printk(int dst, const char *fmt, ...) +{ + va_list args; + int printed_len; + + va_start(args, fmt); + printed_len = ve_vprintk(dst, fmt, args); + va_end(args); + return printed_len; +} +EXPORT_SYMBOL(ve_printk); + #else asmlinkage long sys_syslog(int type, char __user *buf, int len) @@ -1323,6 +1418,36 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +/* + * Rate limiting stuff. 
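+ *
+ * vz_ratelimit() below implements a simple token bucket: calls that arrive
+ * within ->interval jiffies of the previous one each consume a token and
+ * are suppressed once ->burst tokens are outstanding; a longer gap gives
+ * tokens back at a rate of one per ->interval elapsed.  It returns 1 when
+ * the caller may proceed and 0 when it should stay quiet.  A minimal usage
+ * sketch (hypothetical caller, not part of this patch):
+ *
+ *	static struct vz_rate_info ri = { .interval = 5 * HZ, .burst = 10 };
+ *
+ *	if (vz_ratelimit(&ri))
+ *		printk(KERN_WARNING "something worth reporting\n");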
+ */ +int vz_ratelimit(struct vz_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a..05cbe69 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -124,6 +124,8 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * or halting the specified task is impossible. */ int dumpable = 0; + int vps_dumpable = 0; + /* Don't let security modules deny introspection */ if (task == current) return 0; @@ -135,11 +137,17 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (task->mm) + if (task->mm) { dumpable = get_dumpable(task->mm); + vps_dumpable = (task->mm->vps_dumpable == 1); + } + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - + if (!vps_dumpable && !ve_is_super(get_exec_env())) + return -EPERM; + if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) + return -EPERM; return security_ptrace_may_access(task, mode); } @@ -190,6 +198,8 @@ repeat: retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; + if (task->mm->vps_dumpable == 2) + goto bad; /* Go */ task->ptrace |= PT_PTRACED; @@ -283,6 +293,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds } return copied; } +EXPORT_SYMBOL_GPL(access_process_vm); static int ptrace_setoptions(struct task_struct *child, long data) { diff --git a/kernel/sched.c b/kernel/sched.c index ad1962d..6f173a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -343,6 +344,8 @@ static inline struct task_group *task_group(struct task_struct *p) #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); +#elif defined(CONFIG_VZ_FAIRSCHED) + tg = p->fsched_node->tg; #else tg = &init_task_group; #endif @@ -545,6 +548,9 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long nr_sleeping; + unsigned long nr_stopped; + struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -618,6 +624,11 @@ static inline int cpu_of(struct rq *rq) #endif } +struct kernel_stat_glob kstat_glob; +DEFINE_SPINLOCK(kstat_glb_lock); +EXPORT_SYMBOL(kstat_glob); +EXPORT_SYMBOL(kstat_glb_lock); + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
@@ -969,6 +980,217 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_VE +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait++; +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait--; +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint++; +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint--; +} + +#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) + +cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->idle_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_uninterruptible_ve(ve) == 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_idle_time); + +cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->iowait_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_iowait_ve(ve) > 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_iowait_time); + +static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + if (ve_stat->strt_idle_time) { + if (cycles_after(cycles, ve_stat->strt_idle_time)) { + if (nr_iowait_ve(ve) == 0) + ve_stat->idle_time += + cycles - ve_stat->strt_idle_time; + else + ve_stat->iowait_time += + cycles - ve_stat->strt_idle_time; + } + ve_stat->strt_idle_time = 0; + } + write_seqcount_end(&ve_stat->stat_lock); +} + +static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + ve_stat->strt_idle_time = cycles; + write_seqcount_end(&ve_stat->stat_lock); +} + +static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (++VE_CPU_STATS(ve, cpu)->nr_running == 1) + ve_stop_idle(ve, cpu, cycles); +} + +static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (--VE_CPU_STATS(ve, cpu)->nr_running == 0) + ve_strt_idle(ve, cpu, cycles); +} + +void ve_sched_attach(struct ve_struct *target_ve) +{ + struct task_struct *tsk; + unsigned int cpu; + cycles_t cycles; + + tsk = current; + preempt_disable(); + cycles = get_cycles(); + cpu = task_cpu(tsk); + ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); + ve_nr_running_inc(target_ve, cpu, cycles); + preempt_enable(); +} +EXPORT_SYMBOL(ve_sched_attach); + +static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) +{ + struct ve_task_info *ti; + + ti = VE_TASK_INFO(p); + write_seqcount_begin(&ti->wakeup_lock); + ti->wakeup_stamp = cyc; + write_seqcount_end(&ti->wakeup_lock); 
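+	/*
+	 * The seqcount write section above lets lock-free readers of
+	 * wakeup_stamp (read_seqcount_begin()/read_seqcount_retry()) detect
+	 * and retry a racing update, mirroring how the idle/iowait times are
+	 * read via stat_lock further up.
+	 */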
+} + +static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) +{ + int cpu; + cycles_t ve_wstamp; + + /* safe due to runqueue lock */ + cpu = smp_processor_id(); + ve_wstamp = t->ve_task_info.wakeup_stamp; + + if (ve_wstamp && cycles > ve_wstamp) { + KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, + cpu, cycles - ve_wstamp); + KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, + cpu, cycles - ve_wstamp); + } +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +#ifdef CONFIG_FAIRSCHED + if (prev != this_pcpu()->idle) { +#else + if (prev != this_rq()->idle) { +#endif + VE_CPU_STATS(prev->ve_task_info.owner_env, + smp_processor_id())->used_time += + cycles - prev->ve_task_info.sched_time; + + prev->ve_task_info.sched_time = cycles; + } +} +#else +static inline void ve_nr_running_inc(struct ve_struct, int cpu, cycles_t cycles) +{ +} + +static inline void ve_nr_running_dec(struct ve_struct, int cpu, cycles_t cycles) +{ +} + +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +} +#endif + +struct task_nrs_struct { + long nr_running; + long nr_unint; + long nr_stopped; + long nr_sleeping; + long nr_iowait; + long long nr_switches; +} ____cacheline_aligned_in_smp; + +unsigned long nr_zombie = 0; /* protected by tasklist_lock */ +EXPORT_SYMBOL(nr_zombie); + +atomic_t nr_dead = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_dead); + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -1709,11 +1931,21 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - if (task_contributes_to_load(p)) + cycles_t cycles; + +#ifdef CONFIG_VE + cycles = get_cycles(); + write_wakeup_stamp(p, cycles); + p->ve_task_info.sleep_time += cycles; +#endif + if (task_contributes_to_load(p)) { rq->nr_uninterruptible--; + ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p)); + } enqueue_task(rq, p, wakeup); inc_nr_running(rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); } /* @@ -1721,6 +1953,30 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { + cycles_t cycles; +#ifdef CONFIG_VE + unsigned int cpu, pcpu; + struct ve_struct *ve; + + cycles = get_cycles(); + cpu = task_cpu(p); + pcpu = smp_processor_id(); + ve = p->ve_task_info.owner_env; + + p->ve_task_info.sleep_time -= cycles; +#endif + if (p->state == TASK_UNINTERRUPTIBLE) { + ve_nr_unint_inc(ve, cpu); + } + if (p->state == TASK_INTERRUPTIBLE) { + rq->nr_sleeping++; + } + if (p->state == TASK_STOPPED) { + rq->nr_stopped++; + } + + ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles); + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -1969,6 +2225,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) return ncsw; } +EXPORT_SYMBOL_GPL(wait_task_inactive); /*** * kick_process - kick a running thread to enter/exit the kernel @@ -2386,6 +2643,10 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_VE + /* cosmetic: sleep till wakeup below */ + p->ve_task_info.sleep_time -= get_cycles(); +#endif put_cpu(); } @@ -2416,6 +2677,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) */ p->sched_class->task_new(rq, p); inc_nr_running(rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), + get_cycles()); } trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", @@ -2580,6 +2843,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } +EXPORT_SYMBOL_GPL(schedule_tail); /* * context_switch - switch to the new MM and the new @@ -2655,6 +2919,7 @@ unsigned long nr_running(void) return sum; } +EXPORT_SYMBOL_GPL(nr_running); unsigned long nr_uninterruptible(void) { @@ -2672,6 +2937,7 @@ unsigned long nr_uninterruptible(void) return sum; } +EXPORT_SYMBOL_GPL(nr_uninterruptible); unsigned long long nr_context_switches(void) { @@ -2709,6 +2975,72 @@ unsigned long nr_active(void) return running + uninterruptible; } +unsigned long nr_stopped(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_stopped; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_stopped); + +unsigned long nr_sleeping(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_sleeping; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_sleeping); + +#ifdef CONFIG_VE +unsigned long nr_running_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_running; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_running_ve); + +unsigned long nr_uninterruptible_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + sum = 0; + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_unint; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_uninterruptible_ve); + +unsigned long nr_iowait_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_iowait; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_iowait_ve); +#endif + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). @@ -2739,6 +3071,16 @@ static void update_cpu_load(struct rq *this_rq) } } +#ifdef CONFIG_VE +#define update_ve_cpu_time(p, time, tick) \ + do { \ + VE_CPU_STATS((p)->ve_task_info.owner_env, \ + task_cpu(p))->time += tick; \ + } while (0) +#else +#define update_ve_cpu_time(p, time, tick) do { } while (0) +#endif + #ifdef CONFIG_SMP /* @@ -2873,8 +3215,15 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { + struct ve_struct *ve; + cycles_t cycles = get_cycles(); + + ve = VE_TASK_INFO(p)->owner_env; + deactivate_task(src_rq, p, 0); + ve_nr_running_dec(ve, task_cpu(p), cycles); set_task_cpu(p, this_cpu); + ve_nr_running_inc(ve, task_cpu(p), cycles); activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -4073,10 +4422,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime) /* Add user time to cpustat. 
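 * The owning VE's per-cpu statistics receive the same tick via
 * update_ve_cpu_time() below.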
*/ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0) { cpustat->nice = cputime64_add(cpustat->nice, tmp); - else + update_ve_cpu_time(p, nice, tmp); + } else { cpustat->user = cputime64_add(cpustat->user, tmp); + update_ve_cpu_time(p, user, tmp); + } /* Account for user time used */ acct_update_integrals(p); } @@ -4132,6 +4484,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); + update_ve_cpu_time(p, system, tmp); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) @@ -4454,12 +4807,30 @@ need_resched_nonpreemptible: next = pick_next_task(rq, prev); if (likely(prev != next)) { + cycles_t cycles = get_cycles(); + sched_info_switch(prev, next); rq->nr_switches++; rq->curr = next; ++*switch_count; +#ifdef CONFIG_VE + prev->ve_task_info.sleep_stamp = cycles; + if (prev->state == TASK_RUNNING && prev != this_rq()->idle) + write_wakeup_stamp(prev, cycles); + update_sched_lat(next, cycles); + + /* because next & prev are protected with + * runqueue lock we may not worry about + * wakeup_stamp and sched_time protection + * (same thing in 'else' branch below) + */ + update_ve_task_info(prev, cycles); + next->ve_task_info.sched_time = cycles; + write_wakeup_stamp(next, 0); +#endif + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under @@ -4467,8 +4838,10 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + update_ve_task_info(prev, get_cycles()); spin_unlock_irq(&rq->lock); + } if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -5084,7 +5457,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (user && !capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_ADMIN)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -5572,10 +5945,15 @@ EXPORT_SYMBOL(yield); void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + ve_nr_iowait_inc(ve, task_cpu(current)); schedule(); + ve_nr_iowait_dec(ve, task_cpu(current)); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -5585,10 +5963,15 @@ long __sched io_schedule_timeout(long timeout) { struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + ve_nr_iowait_inc(ve, task_cpu(current)); ret = schedule_timeout(timeout); + ve_nr_iowait_dec(ve, task_cpu(current)); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -5709,17 +6092,7 @@ void sched_show_task(struct task_struct *p) state = p->state ? __ffs(p->state) + 1 : 0; printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif + printk(KERN_CONT " %p ", p); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -5740,13 +6113,13 @@ void show_state_filter(unsigned long state_filter) #if BITS_PER_LONG == 32 printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #else printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #endif read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: @@ -5754,7 +6127,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); touch_all_softlockup_watchdogs(); @@ -6118,13 +6491,13 @@ static void migrate_live_tasks(int src_cpu) read_lock(&tasklist_lock); - do_each_thread(t, p) { + do_each_thread_all(t, p) { if (p == current) continue; if (task_cpu(p) == src_cpu) move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); + } while_each_thread_all(t, p); read_unlock(&tasklist_lock); } @@ -8126,7 +8499,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED +#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) /* * How much cpu bandwidth does init_task_group get? 
* @@ -8172,7 +8545,7 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED +#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); @@ -8232,6 +8605,7 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + fairsched_init_early(); scheduler_running = 1; } @@ -8284,7 +8658,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * Only normalize user tasks: */ @@ -8315,7 +8689,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); spin_unlock(&p->pi_lock); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irqrestore(&tasklist_lock, flags); } @@ -8758,7 +9132,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << 16, period); } -#ifdef CONFIG_CGROUP_SCHED +#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi, *parent = tg->parent; @@ -8815,10 +9189,10 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) return 1; - } while_each_thread(g, p); + } while_each_thread_ve(g, p); return 0; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bbe6b31..5997d36 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -101,12 +101,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!p->se.on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irqrestore(&tasklist_lock, flags); } diff --git a/kernel/signal.c b/kernel/signal.c index e661b01..c1f2e30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -32,13 +32,32 @@ #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. 
*/ -static struct kmem_cache *sigqueue_cachep; +struct kmem_cache *sigqueue_cachep; +EXPORT_SYMBOL(sigqueue_cachep); + +static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t) +{ + struct ve_struct *ve; + + /* always allow signals from the kernel */ + if (info == SEND_SIG_FORCED || + (!is_si_special(info) && SI_FROMKERNEL(info))) + return 0; + + ve = current->ve_task_info.owner_env; + if (ve->ve_ns->pid_ns->child_reaper != t) + return 0; + if (ve_is_super(get_exec_env())) + return 0; + return !sig_user_defined(t, sig) || sig_kernel_only(sig); +} static void __user *sig_handler(struct task_struct *t, int sig) { @@ -106,7 +125,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +int recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || PENDING(&t->pending, &t->blocked) || @@ -131,6 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) if (recalc_sigpending_tsk(t)) signal_wake_up(t, 0); } +EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); void recalc_sigpending(void) { @@ -191,8 +211,13 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + if (q && ub_siginfo_charge(q, get_task_ub(t))) { + kmem_cache_free(sigqueue_cachep, q); + q = NULL; + } + } if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); } else { @@ -209,6 +234,7 @@ static void __sigqueue_free(struct sigqueue *q) return; atomic_dec(&q->user->sigpending); free_uid(q->user); + ub_siginfo_uncharge(q); kmem_cache_free(sigqueue_cachep, q); } @@ -384,7 +410,18 @@ still_pending: static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = next_signal(pending, mask); + int sig = 0; + + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { @@ -507,6 +544,7 @@ void signal_wake_up(struct task_struct *t, int resume) if (!wake_up_state(t, mask)) kick_process(t); } +EXPORT_SYMBOL_GPL(signal_wake_up); /* * Remove signals in mask from the pending set and queue. @@ -625,7 +663,7 @@ static int prepare_signal(int sig, struct task_struct *p) t = p; do { rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + } while_each_thread_all(p, t); } else if (sig == SIGCONT) { unsigned int why; /* @@ -657,7 +695,7 @@ static int prepare_signal(int sig, struct task_struct *p) state |= TASK_INTERRUPTIBLE; } wake_up_state(t, state); - } while_each_thread(p, t); + } while_each_thread_all(p, t); /* * Notify the parent with CLD_CONTINUED if we were stopped. 
@@ -779,7 +817,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) do { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - } while_each_thread(p, t); + } while_each_thread_all(p, t); return; } } @@ -1015,7 +1053,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!ret && sig) { ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); + ret = sig_ve_ignored(sig, info, p) ? 0 : + __group_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -1140,7 +1179,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) int retval = 0, count = 0; struct task_struct * p; - for_each_process(p) { + for_each_process_ve(p) { if (p->pid > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); ++count; @@ -1348,6 +1387,14 @@ int do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); +#ifdef CONFIG_VE + /* Allow to send only SIGCHLD from VE */ + if (sig != SIGCHLD && + tsk->ve_task_info.owner_env != + tsk->parent->ve_task_info.owner_env) + sig = SIGCHLD; +#endif + info.si_signo = sig; info.si_errno = 0; /* @@ -1617,7 +1664,9 @@ finish_stop(int stop_count) } do { + set_stop_state(current); schedule(); + clear_stop_state(current); } while (try_to_freeze()); /* * Now we don't run again until continued. @@ -1669,6 +1718,7 @@ static int do_signal_stop(int signr) sig->group_stop_count = stop_count; } + clear_pn_state(current); if (stop_count == 0) sig->flags = SIGNAL_STOP_STOPPED; current->exit_code = sig->group_exit_code; @@ -1732,8 +1782,6 @@ relock: * Now that we woke up, it's crucial if we're supposed to be * frozen that we freeze now before running anything substantial. */ - try_to_freeze(); - spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if @@ -2239,7 +2287,8 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * signal is private anyway. */ if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); + if (!sig_ve_ignored(sig, &info, p)) + error = specific_send_sig_info(sig, &info, p); unlock_task_sighand(p, &flags); } } @@ -2595,5 +2644,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC); } diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f26..03cb8a5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -22,6 +22,8 @@ #include #include +#include + #include /* - No shared variables, all the data are CPU local. 
@@ -183,10 +185,14 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { + struct user_beancounter *ub; struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + struct ve_struct *envid; + + envid = set_exec_env(get_ve0()); pending = local_softirq_pending(); account_system_vtime(current); @@ -203,6 +209,7 @@ restart: h = softirq_vec; + ub = set_exec_ub(get_ub0()); do { if (pending & 1) { h->action(h); @@ -211,6 +218,7 @@ restart: h++; pending >>= 1; } while (pending); + (void)set_exec_ub(ub); local_irq_disable(); @@ -224,6 +232,7 @@ restart: trace_softirq_exit(); account_system_vtime(current); + (void)set_exec_env(envid); _local_bh_enable(); } @@ -279,6 +288,7 @@ void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); + restore_context(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee..51a33fd 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -230,13 +230,13 @@ static void check_hung_uninterruptible_tasks(int this_cpu) return; read_lock(&tasklist_lock); - do_each_thread(g, t) { + do_each_thread_all(g, t) { if (!--max_count) goto unlock; /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); - } while_each_thread(g, t); + } while_each_thread_all(g, t); unlock: read_unlock(&tasklist_lock); } diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc..bd617ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include @@ -112,6 +114,102 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +DECLARE_MUTEX(virtinfo_sem); +EXPORT_SYMBOL(virtinfo_sem); +static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; + +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + + for (p = &virtinfo_chain[type]; + *p != NULL && nb->priority < (*p)->priority; + p = &(*p)->next); + nb->next = *p; + smp_wmb(); + *p = nb; +} + +EXPORT_SYMBOL(__virtinfo_notifier_register); + +void virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + down(&virtinfo_sem); + __virtinfo_notifier_register(type, nb); + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_register); + +struct virtinfo_cnt_struct { + volatile unsigned long exit[NR_CPUS]; + volatile unsigned long entry; +}; +static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); + +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + int entry_cpu, exit_cpu; + unsigned long cnt, ent; + + down(&virtinfo_sem); + for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); + *p = nb->next; + smp_mb(); + + for_each_cpu_mask(entry_cpu, cpu_possible_map) { + while (1) { + cnt = 0; + for_each_cpu_mask(exit_cpu, cpu_possible_map) + cnt += + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; + smp_rmb(); + ent = per_cpu(virtcnt, entry_cpu).entry; + if (cnt == ent) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 100); + } + } + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_unregister); + +int virtinfo_notifier_call(int type, unsigned long n, void *data) +{ + int ret; + int entry_cpu, exit_cpu; + struct vnotifier_block *nb; + + entry_cpu = get_cpu(); + per_cpu(virtcnt, entry_cpu).entry++; + smp_wmb(); + 
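+	/*
+	 * The entry counter is bumped (and made visible with the barrier
+	 * above) before the chain is walked, so the wait loop in
+	 * virtinfo_notifier_unregister() cannot see matching entry/exit
+	 * counts while this call is still in flight.
+	 */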
put_cpu(); + + nb = virtinfo_chain[type]; + ret = NOTIFY_DONE; + while (nb) + { + ret = nb->notifier_call(nb, n, data, ret); + if(ret & NOTIFY_STOP_MASK) { + ret &= ~NOTIFY_STOP_MASK; + break; + } + nb = nb->next; + } + + exit_cpu = get_cpu(); + smp_wmb(); + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; + put_cpu(); + + return ret; +} + +EXPORT_SYMBOL(virtinfo_notifier_call); + static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; @@ -181,10 +279,10 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) { if (p->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* For find_user() */ break; @@ -243,13 +341,13 @@ asmlinkage long sys_getpriority(int which, int who) if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) if (p->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* for find_user() */ break; @@ -363,6 +461,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_HALT: + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_RESTART2: + force_sig(SIGKILL, + get_exec_env()->ve_ns->pid_ns->child_reaper); + + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + return 0; + + default: + return -EINVAL; + } +#endif + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ @@ -549,7 +666,7 @@ asmlinkage long sys_setgid(gid_t gid) return 0; } -static int set_user(uid_t new_ruid, int dumpclear) +int set_user(uid_t new_ruid, int dumpclear) { struct user_struct *new_user; @@ -853,8 +970,27 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +#ifdef CONFIG_VE +unsigned long long ve_relative_clock(struct timespec * ts) +{ + unsigned long long offset = 0; + + if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec || + (ts->tv_sec == get_exec_env()->start_timespec.tv_sec && + ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec)) + offset = (unsigned long long)(ts->tv_sec - + get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC + + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec; + return nsec_to_clock_t(offset); +} +#endif + asmlinkage long sys_times(struct tms __user * tbuf) { +#ifdef CONFIG_VE + struct timespec now; +#endif + /* * In the SMP world we might just be unlucky and have one of * the times increment as we use it. 
Since the value is an @@ -888,7 +1024,13 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } +#ifndef CONFIG_VE return (long) jiffies_64_to_clock_t(get_jiffies_64()); +#else + /* Compare to calculation in fs/proc/array.c */ + do_posix_clock_monotonic_gettime(&now); + return ve_relative_clock(&now); +#endif } /* @@ -1062,6 +1204,7 @@ asmlinkage long sys_setsid(void) spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; spin_unlock(&group_leader->sighand->siglock); err = session; @@ -1344,7 +1487,7 @@ asmlinkage long sys_sethostname(char __user *name, int len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1389,7 +1532,7 @@ asmlinkage long sys_setdomainname(char __user *name, int len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1b..ca1bb3e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -168,3 +168,15 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_getluid); +cond_syscall(sys_setluid); +cond_syscall(sys_setublimit); +cond_syscall(sys_ubstat); + +/* fairsched compat */ +cond_syscall(sys_fairsched_mknod); +cond_syscall(sys_fairsched_rmnod); +cond_syscall(sys_fairsched_mvpr); +cond_syscall(sys_fairsched_vcpus); +cond_syscall(sys_fairsched_chwt); +cond_syscall(sys_fairsched_rate); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec088..60b39bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,7 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; +extern int ve_area_access_check; /* fs/namei.c */ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -111,6 +112,13 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +int ve_allow_kthreads = 1; +EXPORT_SYMBOL(ve_allow_kthreads); + +#ifdef CONFIG_MAGIC_SYSRQ +extern int sysrq_key_scancode; +#endif + #ifdef CONFIG_MODULES extern char modprobe_path[]; #endif @@ -124,6 +132,8 @@ extern int stop_a_enabled; extern int scons_pwroff; #endif +extern int alloc_fail_warn; + #ifdef __hppa__ extern int pwrsw_enabled; extern int unaligned_enabled; @@ -136,6 +146,7 @@ extern int sysctl_ieee_emulation_warnings; extern int sysctl_userprocess_debug; extern int spin_retry; #endif +int decode_call_traces = 1; #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; @@ -144,6 +155,10 @@ extern int acct_parm[]; #ifdef CONFIG_IA64 extern int no_unaligned_warning; #endif +#ifdef CONFIG_VE +int glob_ve_meminfo = 0; +EXPORT_SYMBOL(glob_ve_meminfo); +#endif #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; @@ -165,9 +180,31 @@ static struct ctl_table_header root_table_header = { .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; -static struct ctl_table_root sysctl_table_root = { + +#ifdef CONFIG_VE +static int sysctl_root_perms(struct ctl_table_root *root, + struct nsproxy *namespaces, struct ctl_table *table) +{ + if (ve_is_super(get_exec_env())) + return table->mode; + else + return table->mode & ~0222; +} + +static struct 
ctl_table_root sysctl_table_groot = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(sysctl_table_groot.default_set.list), + .default_set.parent = &sysctl_table_root.default_set, +}; +#else +#define sysctl_root_perms NULL +#define sysctl_table_groot sysctl_table_root +#endif + +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_groot.root_list), .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .permissions = sysctl_root_perms, }; static struct ctl_table kern_table[]; @@ -442,6 +479,20 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .procname = "silence-level", + .data = &console_silence_loglevel, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "alloc_fail_warn", + .data = &alloc_fail_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, @@ -606,6 +657,24 @@ static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_VE + { + .procname = "ve_meminfo", + .data = &glob_ve_meminfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq-key", + .data = &sysrq_key_scancode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", @@ -1176,6 +1245,21 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "vsyscall", + .data = &sysctl_at_vsyscall, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "odirect_enable", + .data = &odirect_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1338,6 +1422,13 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { + { + .procname = "decode_call_traces", + .data = &decode_call_traces, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, @@ -1890,10 +1981,27 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table) { + if (!ve_is_super(get_exec_env())) { + WARN_ON(1); + return NULL; + } + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, path, table); } +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, + struct ctl_table *table, int virtual_handler) +{ + if (!ve_is_super(get_exec_env())) { + WARN_ON(1); + return NULL; + } + + return __register_sysctl_paths(&sysctl_table_groot, current->nsproxy, + path, table); +} + /** * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure @@ -1910,6 +2018,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) return register_sysctl_paths(null_path, table); } +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, + int virtual_handler) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_glob_paths(null_path, table, virtual_handler); +} + /** * unregister_sysctl_table - unregister a 
sysctl table hierarchy * @header: the header returned from register_sysctl_table @@ -1971,6 +2087,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, return NULL; } +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, + int vh) +{ + return NULL; +} + +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, + struct ctl_table *table, int vh) +{ + return NULL; +} + void unregister_sysctl_table(struct ctl_table_header * table) { } @@ -3000,6 +3128,57 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args) return 0; } +#ifdef CONFIG_PID_NS +#include + +static int proc_pid_ns_hide_child(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmp, res; + + tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 1 : 0; + + res = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (res || !write) + return res; + + if (tmp) + current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD; + else + current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD; + return 0; +} + +static struct ctl_table pid_ns_kern_table[] = { + { + .procname = "pid_ns_hide_child", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_pid_ns_hide_child, + }, + {} +}; + +static struct ctl_table pid_ns_root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = pid_ns_kern_table, + }, + {} +}; + +static __init int pid_ns_sysctl_init(void) +{ + register_sysctl_table(pid_ns_root_table); + return 0; +} +postcore_initcall(pid_ns_sysctl_init); +#endif /* CONFIG_PID_NS */ + /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3013,7 +3192,9 @@ EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(register_sysctl_glob_table); EXPORT_SYMBOL(register_sysctl_paths); +EXPORT_SYMBOL(register_sysctl_glob_paths); EXPORT_SYMBOL(sysctl_intvec); EXPORT_SYMBOL(sysctl_jiffies); EXPORT_SYMBOL(sysctl_ms_jiffies); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76..e115826 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); + } while_each_thread_all(first, tsk); unlock_task_sighand(first, &flags); rc = 0; diff --git a/kernel/time.c b/kernel/time.c index 6a08660..c986346 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -601,10 +601,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t); unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 + WARN_ON((long)x < 0); if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else + WARN_ON((long)x < 0); /* Don't worry about loss of precision here .. 
*/ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; @@ -617,6 +619,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); @@ -639,6 +642,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f..3db0c59 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -43,6 +43,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * used instead. */ struct timespec xtime __attribute__ ((aligned (16))); +EXPORT_SYMBOL_GPL(xtime); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1..05b6a6d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -795,7 +797,11 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + struct ve_struct *ve; + + ve = set_exec_env(get_ve0()); fn(data); + (void)set_exec_env(ve); if (preempt_count != preempt_count()) { printk(KERN_ERR "huh, entered %p " "with preempt_count %08x, exited" @@ -1006,6 +1012,37 @@ EXPORT_SYMBOL(avenrun); * calc_load - given tick count, update the avenrun load estimates. * This is called while holding a write_lock on xtime_lock. */ + + +#ifdef CONFIG_VE +static void calc_load_ve(void) +{ + unsigned long flags, nr_unint, nr_active; + struct ve_struct *ve; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); + nr_active *= FIXED_1; + + CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); + CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); + CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); + } + read_unlock(&ve_list_lock); + + nr_unint = nr_uninterruptible() * FIXED_1; + spin_lock_irqsave(&kstat_glb_lock, flags); + CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); + spin_unlock_irqrestore(&kstat_glb_lock, flags); + +} +#else +#define calc_load_ve() do { } while (0) +#endif + static inline void calc_load(unsigned long ticks) { unsigned long active_tasks; /* fixed-point */ @@ -1018,6 +1055,7 @@ static inline void calc_load(unsigned long ticks) CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + calc_load_ve(); count += LOAD_FREQ; } while (count < 0); } @@ -1267,11 +1305,12 @@ int do_sysinfo(struct sysinfo *info) unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; unsigned long seq; + unsigned long *__avenrun; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); do { - struct timespec tp; seq = read_seqbegin(&xtime_lock); /* @@ -1289,18 +1328,34 @@ int do_sysinfo(struct sysinfo *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + } while (read_seqretry(&xtime_lock, seq)); + if (ve_is_super(get_exec_env())) { + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + __avenrun = &avenrun[0]; info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + } +#ifdef CONFIG_VE + else { + struct ve_struct *ve; + ve = get_exec_env(); + __avenrun = &ve->avenrun[0]; + info->procs = atomic_read(&ve->pcounter); + info->uptime = tp.tv_sec - ve->start_timespec.tv_sec; + } +#endif + info->loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); si_meminfo(info); si_swapinfo(info); +#ifdef CONFIG_BEANCOUNTERS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then diff --git a/kernel/user.c b/kernel/user.c index 865ecf5..b1139b3 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -314,6 +314,7 @@ static void remove_user_sysfs_dir(struct work_struct *w) done: uids_mutex_unlock(); } +EXPORT_SYMBOL_GPL(free_uid); /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released @@ -383,6 +384,7 @@ void free_uid(struct user_struct *up) else local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) { @@ -447,6 +449,7 @@ out_unlock: uids_mutex_unlock(); return NULL; } +EXPORT_SYMBOL_GPL(alloc_uid); void switch_uid(struct user_struct *new_user) { @@ -477,6 +480,7 @@ void switch_uid(struct user_struct *new_user) free_uid(old_user); suid_keys(current); } +EXPORT_SYMBOL_GPL(switch_uid); #ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) @@ -510,7 +514,7 @@ static int __init uid_cache_init(void) int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659..5e4b983 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -26,6 +26,10 @@ static void *get_uts(ctl_table *table, int write) down_read(&uts_sem); else down_write(&uts_sem); + + if (strcmp(table->procname, "virt_osrelease") == 0) + return virt_utsname.release; + return which; } @@ -127,19 +131,27 @@ static struct ctl_table uts_kern_table[] = { {} }; -static struct ctl_table uts_root_table[] = { +static struct ctl_table uts_virt_osrelease_table[] = { { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = uts_kern_table, + .procname = "virt_osrelease", + .data = virt_utsname.release, + .maxlen = sizeof(virt_utsname.release), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = sysctl_uts_string, }, {} }; +static struct ctl_path uts_path[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", }, + { } +}; + static int __init utsname_sysctl_init(void) { - register_sysctl_table(uts_root_table); + register_sysctl_glob_paths(uts_path, uts_kern_table, 1); + register_sysctl_paths(uts_path, uts_virt_osrelease_table); return 0; } diff --git 
a/kernel/ve/Makefile b/kernel/ve/Makefile new file mode 100644 index 0000000..9d60161 --- /dev/null +++ b/kernel/ve/Makefile @@ -0,0 +1,16 @@ +# +# +# kernel/ve/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VE) = ve.o veowner.o hooks.o +obj-$(CONFIG_VZ_WDOG) += vzwdog.o +obj-$(CONFIG_VE_CALLS) += vzmon.o + +vzmon-objs = vecalls.o + +obj-$(CONFIG_VZ_DEV) += vzdev.o diff --git a/kernel/ve/hooks.c b/kernel/ve/hooks.c new file mode 100644 index 0000000..1b82c35 --- /dev/null +++ b/kernel/ve/hooks.c @@ -0,0 +1,114 @@ +/* + * linux/kernel/ve/hooks.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include + +static struct list_head ve_hooks[VE_MAX_CHAINS]; +static DECLARE_RWSEM(ve_hook_sem); + +void ve_hook_register(int chain, struct ve_hook *vh) +{ + struct list_head *lh; + struct ve_hook *tmp; + + BUG_ON(chain > VE_MAX_CHAINS); + + down_write(&ve_hook_sem); + list_for_each(lh, &ve_hooks[chain]) { + tmp = list_entry(lh, struct ve_hook, list); + if (vh->priority < tmp->priority) + break; + } + + list_add_tail(&vh->list, lh); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_register); + +void ve_hook_unregister(struct ve_hook *vh) +{ + down_write(&ve_hook_sem); + list_del(&vh->list); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_unregister); + +static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve) +{ + int err; + + err = 0; + if (try_module_get(vh->owner)) { + err = vh->init(ve); + module_put(vh->owner); + } + return err; +} + +static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve) +{ + if (vh->fini != NULL && try_module_get(vh->owner)) { + vh->fini(ve); + module_put(vh->owner); + } +} + +int ve_hook_iterate_init(int chain, void *ve) +{ + struct ve_hook *vh; + int err; + + err = 0; + + down_read(&ve_hook_sem); + list_for_each_entry(vh, &ve_hooks[chain], list) + if ((err = ve_hook_init(vh, ve)) < 0) + break; + + if (err) + list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + + up_read(&ve_hook_sem); + return err; +} + +EXPORT_SYMBOL(ve_hook_iterate_init); + +void ve_hook_iterate_fini(int chain, void *ve) +{ + struct ve_hook *vh; + + down_read(&ve_hook_sem); + list_for_each_entry_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + up_read(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_iterate_fini); + +static int __init ve_hooks_init(void) +{ + int i; + + for (i = 0; i < VE_MAX_CHAINS; i++) + INIT_LIST_HEAD(&ve_hooks[i]); + return 0; +} + +core_initcall(ve_hooks_init); + diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c new file mode 100644 index 0000000..0248f38 --- /dev/null +++ b/kernel/ve/ve.c @@ -0,0 +1,147 @@ +/* + * linux/kernel/ve/ve.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +/* + * 've.c' helper file performing VE sub-system initialization + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +unsigned long vz_rstamp = 0x37e0f59d; + +#ifdef CONFIG_MODULES +struct module no_module = { .state = MODULE_STATE_GOING }; +EXPORT_SYMBOL(no_module); +#endif + +INIT_KSYM_MODULE(ip_tables); +INIT_KSYM_MODULE(ip6_tables); +INIT_KSYM_MODULE(iptable_filter); +INIT_KSYM_MODULE(ip6table_filter); +INIT_KSYM_MODULE(iptable_mangle); +INIT_KSYM_MODULE(ip6table_mangle); +INIT_KSYM_MODULE(ip_conntrack); +INIT_KSYM_MODULE(nf_conntrack); +INIT_KSYM_MODULE(nf_conntrack_ipv4); +INIT_KSYM_MODULE(nf_conntrack_ipv6); +INIT_KSYM_MODULE(ip_nat); +INIT_KSYM_MODULE(nf_nat); +INIT_KSYM_MODULE(iptable_nat); + +INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); +INIT_KSYM_CALL(int, nf_conntrack_init_ve, (void)); +INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void)); +INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void)); +INIT_KSYM_CALL(int, nf_nat_init, (void)); +INIT_KSYM_CALL(int, init_iptable_nat, (void)); +INIT_KSYM_CALL(void, fini_iptable_nat, (void)); +INIT_KSYM_CALL(int, init_nftable_nat, (void)); +INIT_KSYM_CALL(void, fini_nftable_nat, (void)); +INIT_KSYM_CALL(void, nf_nat_cleanup, (void)); +INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); +INIT_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); +INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); +INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); + +#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) +INIT_KSYM_MODULE(vzmon); +INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); + +void do_env_free(struct ve_struct *env) +{ + KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); +} +EXPORT_SYMBOL(do_env_free); +#endif + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +INIT_KSYM_MODULE(vzethdev); +INIT_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +struct ve_struct ve0 = { + .counter = ATOMIC_INIT(1), + .pcounter = ATOMIC_INIT(1), + .ve_list = LIST_HEAD_INIT(ve0.ve_list), + .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), + .start_jiffies = INITIAL_JIFFIES, +#ifdef CONFIG_UNIX98_PTYS + .devpts_config = &devpts_config, +#endif + .ve_ns = &init_nsproxy, + .ve_netns = &init_net, + .is_running = 1, + .op_sem = __RWSEM_INITIALIZER(ve0.op_sem), +#ifdef CONFIG_VE_IPTABLES + .ipt_mask = ~0ULL, +#endif + .features = VE_FEATURE_SIT | VE_FEATURE_IPIP, +}; + +EXPORT_SYMBOL(ve0); + +DEFINE_PER_CPU_STATIC(struct ve_cpu_stats, ve0_cpu_stats); + +LIST_HEAD(ve_list_head); +rwlock_t ve_list_lock = RW_LOCK_UNLOCKED; + +LIST_HEAD(ve_cleanup_list); +DEFINE_SPINLOCK(ve_cleanup_lock); +struct task_struct *ve_cleanup_thread; + +EXPORT_SYMBOL(ve_list_lock); +EXPORT_SYMBOL(ve_list_head); +EXPORT_SYMBOL(ve_cleanup_lock); +EXPORT_SYMBOL(ve_cleanup_list); +EXPORT_SYMBOL(ve_cleanup_thread); + +void init_ve0(void) +{ + struct ve_struct *ve; + + ve = get_ve0(); + ve->cpu_stats = percpu_static_init(ve0_cpu_stats); + list_add(&ve->ve_list, &ve_list_head); +} + +void ve_cleanup_schedule(struct ve_struct *ve) +{ + BUG_ON(ve_cleanup_thread == NULL); + + spin_lock(&ve_cleanup_lock); + list_add_tail(&ve->cleanup_list, &ve_cleanup_list); + spin_unlock(&ve_cleanup_lock); + + wake_up_process(ve_cleanup_thread); +} diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c new file mode 100644 index 0000000..86dd8ef --- 
/dev/null +++ b/kernel/ve/vecalls.c @@ -0,0 +1,2417 @@ +/* + * linux/kernel/ve/vecalls.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + */ + +/* + * 'vecalls.c' is file with basic VE support. It provides basic primities + * along with initialization script + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_VZ_FAIRSCHED +#include +#endif + +#include +#include +#include +#include + +int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ +EXPORT_SYMBOL(nr_ve); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags); +static int alloc_ve_tty_drivers(struct ve_struct* ve); +static void free_ve_tty_drivers(struct ve_struct* ve); +static int register_ve_tty_drivers(struct ve_struct* ve); +static void unregister_ve_tty_drivers(struct ve_struct* ve); +static int init_ve_tty_drivers(struct ve_struct *); +static void fini_ve_tty_drivers(struct ve_struct *); +static void clear_termios(struct tty_driver* driver ); + +static void vecalls_exit(void); + +struct ve_struct *__find_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + + for_each_ve(ve) { + if (ve->veid == veid) + return ve; + } + return NULL; +} +EXPORT_SYMBOL(__find_ve_by_id); + +struct ve_struct *get_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + get_ve(ve); + read_unlock(&ve_list_lock); + return ve; +} +EXPORT_SYMBOL(get_ve_by_id); + +/* + * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
+ */ +void real_do_env_free(struct ve_struct *ve); +static inline void real_put_ve(struct ve_struct *ve) +{ + if (ve && atomic_dec_and_test(&ve->counter)) { + BUG_ON(atomic_read(&ve->pcounter) > 0); + BUG_ON(ve->is_running); + real_do_env_free(ve); + } +} + +static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) +{ + struct ve_struct *ve; + struct vz_cpu_stat *vstat; + int retval; + int i, cpu; + unsigned long tmp; + + if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) + return -EPERM; + if (veid == 0) + return -ESRCH; + + vstat = kzalloc(sizeof(*vstat), GFP_KERNEL); + if (!vstat) + return -ENOMEM; + + retval = -ESRCH; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + if (ve == NULL) + goto out_unlock; + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user); + vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice); + vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system); + vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); + } + vstat->uptime_clk = get_cycles() - ve->start_cycles; + vstat->uptime_jif = (unsigned long)cputime64_to_clock_t( + get_jiffies_64() - ve->start_jiffies); + for (i = 0; i < 3; i++) { + tmp = ve->avenrun[i] + (FIXED_1/200); + vstat->avenrun[i].val_int = LOAD_INT(tmp); + vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); + } + read_unlock(&ve_list_lock); + + retval = 0; + if (copy_to_user(buf, vstat, sizeof(*vstat))) + retval = -EFAULT; +out_free: + kfree(vstat); + return retval; + +out_unlock: + read_unlock(&ve_list_lock); + goto out_free; +} + +static int real_setdevperms(envid_t veid, unsigned type, + dev_t dev, unsigned mask) +{ + struct ve_struct *ve; + int err; + + if (!capable(CAP_SETVEID) || veid == 0) + return -EPERM; + + if ((ve = get_ve_by_id(veid)) == NULL) + return -ESRCH; + + down_read(&ve->op_sem); + err = -ESRCH; + if (ve->is_running) + err = set_device_perms_ve(ve, type, dev, mask); + up_read(&ve->op_sem); + real_put_ve(ve); + return err; +} + +/********************************************************************** + ********************************************************************** + * + * VE start: subsystems + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_INET +#include +#include +#include +#include + +static int init_fini_ve_mibs(struct ve_struct *ve, int fini) +{ + if (fini) + goto fini; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (init_ipv6_mibs()) + goto err_ipv6; +#endif + return 0; + +fini: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + cleanup_ipv6_mibs(); +err_ipv6: +#endif + return -ENOMEM; +} + +static inline int init_ve_mibs(struct ve_struct *ve) +{ + return init_fini_ve_mibs(ve, 0); +} + +static inline void fini_ve_mibs(struct ve_struct *ve) +{ + (void)init_fini_ve_mibs(ve, 1); +} +#else +#define init_ve_mibs(ve) (0) +#define fini_ve_mibs(ve) do { } while (0) +#endif + +static int prepare_proc_root(struct ve_struct *ve) +{ + struct proc_dir_entry *de; + + de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); + if (de == NULL) + return -ENOMEM; + + memcpy(de + 1, "/proc", 6); + de->name = (char *)(de + 1); + de->namelen = 5; + de->mode = S_IFDIR | S_IRUGO | S_IXUGO; + de->nlink = 2; + atomic_set(&de->count, 1); + + ve->proc_root = de; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int init_ve_proc(struct ve_struct *ve) 
+{ + int err; + + err = prepare_proc_root(ve); + if (err) + goto out_root; + + err = register_ve_fs_type(ve, &proc_fs_type, + &ve->proc_fstype, &ve->proc_mnt); + if (err) + goto out_reg; + +#ifdef CONFIG_PRINTK + proc_create("kmsg", S_IRUSR, ve->proc_root, &proc_kmsg_operations); +#endif + proc_mkdir("vz", ve->proc_root); + + ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt); + return 0; + +out_reg: + /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ + ; +out_root: + return err; +} + +static void fini_ve_proc(struct ve_struct *ve) +{ + remove_proc_entry("vz", ve->proc_root); + remove_proc_entry("kmsg", ve->proc_root); + unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); + ve->proc_mnt = NULL; +} + +static void free_ve_proc(struct ve_struct *ve) +{ + /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, + so we check that everything was removed and not lost */ + if (ve->proc_root && ve->proc_root->subdir) { + struct proc_dir_entry *p = ve->proc_root; + printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid); + while ((p = p->subdir) != NULL) + printk("/%s", p->name); + printk(" is not removed!\n"); + } + + kfree(ve->proc_root); + kfree(ve->proc_fstype); + + ve->proc_fstype = NULL; + ve->proc_root = NULL; +} +#else +#define init_ve_proc(ve) (0) +#define fini_ve_proc(ve) do { } while (0) +#define free_ve_proc(ve) do { } while (0) +#endif + +#ifdef CONFIG_UNIX98_PTYS +#include + +/* + * DEVPTS needs a virtualization: each environment should see each own list of + * pseudo-terminals. + * To implement it we need to have separate devpts superblocks for each + * VE, and each VE should mount its own one. + * Thus, separate vfsmount structures are required. + * To minimize intrusion into vfsmount lookup code, separate file_system_type + * structures are created. + * + * In addition to this, patch fo character device itself is required, as file + * system itself is used only for MINOR/MAJOR lookup. 
+ */ + +static int init_ve_devpts(struct ve_struct *ve) +{ + int err; + + err = -ENOMEM; + ve->devpts_config = kzalloc(sizeof(struct devpts_config), GFP_KERNEL); + if (ve->devpts_config == NULL) + goto out; + + ve->devpts_config->mode = 0600; + err = register_ve_fs_type(ve, &devpts_fs_type, + &ve->devpts_fstype, &ve->devpts_mnt); + if (err) { + kfree(ve->devpts_config); + ve->devpts_config = NULL; + } +out: + return err; +} + +static void fini_ve_devpts(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); + /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->devpts_mnt = NULL; + kfree(ve->devpts_config); + ve->devpts_config = NULL; +} +#else +#define init_ve_devpts(ve) (0) +#define fini_ve_devpts(ve) do { } while (0) +#endif + +static int init_ve_shmem(struct ve_struct *ve) +{ + return register_ve_fs_type(ve, + &tmpfs_fs_type, + &ve->shmem_fstype, + &ve->shmem_mnt); +} + +static void fini_ve_shmem(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); + /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->shmem_mnt = NULL; +} + +#ifdef CONFIG_SYSFS +static int init_ve_sysfs_root(struct ve_struct *ve) +{ + struct sysfs_dirent *sysfs_root; + + sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); + if (sysfs_root == NULL) + return -ENOMEM; + sysfs_root->s_name = ""; + atomic_set(&sysfs_root->s_count, 1); + sysfs_root->s_flags = SYSFS_DIR; + sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + sysfs_root->s_ino = 1; + + ve->_sysfs_root = sysfs_root; + return 0; +} +#endif + +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) +extern struct device_attribute ve_net_class_attributes[]; +static inline int init_ve_netclass(void) +{ + struct class *nc; + int err; + + nc = kzalloc(sizeof(*nc), GFP_KERNEL); + if (!nc) + return -ENOMEM; + + nc->name = net_class.name; + nc->dev_release = net_class.dev_release; + nc->dev_uevent = net_class.dev_uevent; + nc->dev_attrs = ve_net_class_attributes; + + err = class_register(nc); + if (!err) { + get_exec_env()->net_class = nc; + return 0; + } + kfree(nc); + return err; +} + +static inline void fini_ve_netclass(void) +{ + struct ve_struct *ve = get_exec_env(); + + class_unregister(ve->net_class); + kfree(ve->net_class); + ve->net_class = NULL; +} +#else +static inline int init_ve_netclass(void) { return 0; } +static inline void fini_ve_netclass(void) { ; } +#endif + +extern struct kset devices_subsys; + +static const struct { + unsigned minor; + char *name; +} mem_class_devices [] = { + {3, "null"}, + {5, "zero"}, + {7, "full"}, + {8, "random"}, + {9, "urandom"}, + {0, NULL}, +}; + +static int init_ve_mem_class(void) +{ + int i; + struct class *ve_mem_class; + + ve_mem_class = class_create(THIS_MODULE, "mem"); + if (IS_ERR(ve_mem_class)) + return -ENOMEM; + + for (i = 0; mem_class_devices[i].name; i++) + device_create(ve_mem_class, NULL, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor), + NULL, mem_class_devices[i].name); + + get_exec_env()->mem_class = ve_mem_class; + return 0; +} + + +void fini_ve_mem_class(void) +{ + int i; + struct class *ve_mem_class = get_exec_env()->mem_class; + + for (i = 0; mem_class_devices[i].name; i++) + device_destroy(ve_mem_class, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor)); + class_destroy(ve_mem_class); +} + +static int init_ve_sysfs(struct ve_struct *ve) +{ + int err; + +#ifdef CONFIG_SYSFS + err = 0; + if (ve->features & VE_FEATURE_SYSFS) { + err = init_ve_sysfs_root(ve); + if (err != 0) + goto out; + err = 
register_ve_fs_type(ve, + &sysfs_fs_type, + &ve->sysfs_fstype, + &ve->sysfs_mnt); + if (err != 0) + goto out_fs_type; + } +#endif + + err = classes_init(); + if (err != 0) + goto err_classes; + + err = devices_init(); + if (err != 0) + goto err_devices; + + err = init_ve_netclass(); + if (err != 0) + goto err_net; + + err = init_ve_tty_class(); + if (err != 0) + goto err_tty; + + err = init_ve_mem_class(); + if (err != 0) + goto err_mem; + + return 0; + +err_mem: + fini_ve_tty_class(); +err_tty: + fini_ve_netclass(); +err_net: + devices_fini(); +err_devices: + classes_fini(); +err_classes: +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +out_fs_type: + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; +out: +#endif + return err; +} + +static void fini_ve_sysfs(struct ve_struct *ve) +{ + fini_ve_mem_class(); + fini_ve_tty_class(); + fini_ve_netclass(); + devices_fini(); + classes_fini(); +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + ve->sysfs_mnt = NULL; + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +#endif +} + +static void free_ve_filesystems(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSFS + kfree(ve->sysfs_fstype); + ve->sysfs_fstype = NULL; +#endif + kfree(ve->shmem_fstype); + ve->shmem_fstype = NULL; + + kfree(ve->devpts_fstype); + ve->devpts_fstype = NULL; + + free_ve_proc(ve); +} + +static int init_printk(struct ve_struct *ve) +{ + struct ve_prep_printk { + wait_queue_head_t log_wait; + unsigned log_start; + unsigned log_end; + unsigned logged_chars; + } *tmp; + + tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + init_waitqueue_head(&tmp->log_wait); + ve->_log_wait = &tmp->log_wait; + ve->_log_start = &tmp->log_start; + ve->_log_end = &tmp->log_end; + ve->_logged_chars = &tmp->logged_chars; + /* ve->log_buf will be initialized later by ve_log_init() */ + return 0; +} + +static void fini_printk(struct ve_struct *ve) +{ + /* + * there is no spinlock protection here because nobody can use + * log_buf at the moments when this code is called. + */ + kfree(ve->log_buf); + kfree(ve->_log_wait); +} + +static void fini_venet(struct ve_struct *ve) +{ +#ifdef CONFIG_INET + tcp_v4_kill_ve_sockets(ve); + synchronize_net(); +#endif +} + +static int init_ve_sched(struct ve_struct *ve) +{ +#ifdef CONFIG_VZ_FAIRSCHED + int err; + + /* + * We refuse to switch to an already existing node since nodes + * keep a pointer to their ve_struct... 
+ */ + err = sys_fairsched_mknod(0, 1, ve->veid); + if (err < 0) { + printk(KERN_WARNING "Can't create fairsched node %d\n", + ve->veid); + return err; + } + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) { + printk(KERN_WARNING "Can't switch to fairsched node %d\n", + ve->veid); + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't clean fairsched node %d\n", + ve->veid); + return err; + } +#endif + ve_sched_attach(ve); + return 0; +} + +static void fini_ve_sched(struct ve_struct *ve) +{ +#ifdef CONFIG_VZ_FAIRSCHED + if (task_fairsched_node_id(current) == ve->veid) + if (sys_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) + printk(KERN_WARNING "Can't leave fairsched node %d\n", + ve->veid); + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't remove fairsched node %d\n", + ve->veid); +#endif +} + +/* + * Namespaces + */ + +static inline int init_ve_namespaces(struct ve_struct *ve, + struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NAMESPACES_MASK & ~CLONE_NEWNET, tsk); + if (err < 0) + return err; + + ve->ve_ns = get_nsproxy(tsk->nsproxy); + memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release, + sizeof(virt_utsname.release)); + + if (cur->pid_ns->flags & PID_NS_HIDE_CHILD) + ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN; + + *old = cur; + return 0; +} + +static inline void fini_ve_namespaces(struct ve_struct *ve, + struct nsproxy *old) +{ + struct task_struct *tsk = current; + struct nsproxy *tmp; + + if (old) { + tmp = tsk->nsproxy; + tsk->nsproxy = get_nsproxy(old); + put_nsproxy(tmp); + tmp = ve->ve_ns; + ve->ve_ns = get_nsproxy(old); + put_nsproxy(tmp); + } else { + put_nsproxy(ve->ve_ns); + ve->ve_ns = NULL; + } +} + +static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NEWNET, tsk); + if (err < 0) + return err; + + put_nsproxy(ve->ve_ns); + ve->ve_ns = get_nsproxy(tsk->nsproxy); + ve->ve_netns = get_net(ve->ve_ns->net_ns); + *old = cur; + return 0; +} + +static inline void switch_ve_namespaces(struct ve_struct *ve, + struct task_struct *tsk) +{ + struct nsproxy *old_ns; + struct nsproxy *new_ns; + + BUG_ON(tsk != current); + old_ns = tsk->nsproxy; + new_ns = ve->ve_ns; + + if (old_ns != new_ns) { + tsk->nsproxy = get_nsproxy(new_ns); + put_nsproxy(old_ns); + } +} + +static __u64 get_ve_features(env_create_param_t *data, int datalen) +{ + __u64 known_features; + + if (datalen < sizeof(struct env_create_param3)) + /* this version of vzctl is aware of VE_FEATURES_OLD only */ + known_features = VE_FEATURES_OLD; + else + known_features = data->known_features; + + /* + * known features are set as required + * yet unknown features are set as in VE_FEATURES_DEF + */ + return (data->feature_mask & known_features) | + (VE_FEATURES_DEF & ~known_features); +} + +static int init_ve_struct(struct ve_struct *ve, envid_t veid, + u32 class_id, env_create_param_t *data, int datalen) +{ + (void)get_ve(ve); + ve->veid = veid; + ve->class_id = class_id; + ve->features = get_ve_features(data, datalen); + INIT_LIST_HEAD(&ve->vetask_lh); + init_rwsem(&ve->op_sem); + + ve->start_timespec = current->start_time; + /* The value is wrong, but it is never compared to process + * start times */ + ve->start_jiffies = get_jiffies_64(); + ve->start_cycles = get_cycles(); + + return 0; +} + 
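For illustration only, the following is a minimal, self-contained user-space sketch (not part of the patch) of the feature-mask merge performed by get_ve_features() above: bits the calling tool declares it knows about are taken verbatim from its request, while bits it does not know about fall back to the kernel defaults. The EX_* names and bit values are hypothetical stand-ins for the patch's VE_FEATURE_* flags and VE_FEATURES_DEF.

/* sketch: how requested/known feature bits combine with defaults */
#include <stdio.h>
#include <stdint.h>

#define EX_FEATURE_SYSFS (1ULL << 0)  /* hypothetical feature bit */
#define EX_FEATURE_NFS   (1ULL << 1)  /* hypothetical feature bit */
/* stand-in for VE_FEATURES_DEF: what the kernel enables by default */
#define EX_FEATURES_DEF  (EX_FEATURE_SYSFS | EX_FEATURE_NFS)

static uint64_t merge_features(uint64_t requested, uint64_t known)
{
	/* known bits: exactly as requested; unknown bits: as in the defaults */
	return (requested & known) | (EX_FEATURES_DEF & ~known);
}

int main(void)
{
	/* old tool: knows only SYSFS and asks to clear it;
	 * SYSFS is honored (off), NFS stays at its default (on) */
	uint64_t m1 = merge_features(0, EX_FEATURE_SYSFS);
	printf("old tool -> %#llx\n", (unsigned long long)m1);

	/* new tool: knows both bits and asks for NFS only;
	 * both bits are taken verbatim from the request */
	uint64_t m2 = merge_features(EX_FEATURE_NFS,
				     EX_FEATURE_SYSFS | EX_FEATURE_NFS);
	printf("new tool -> %#llx\n", (unsigned long long)m2);
	return 0;
}

The point of the design choice is backward compatibility: an older vzctl that predates a feature cannot accidentally disable it, because any bit outside its declared known_features is filled in from the defaults rather than from the (zero) request.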
+/********************************************************************** + ********************************************************************** + * + * /proc/meminfo virtualization + * + ********************************************************************** + **********************************************************************/ +static int ve_set_meminfo(envid_t veid, unsigned long val) +{ +#ifdef CONFIG_BEANCOUNTERS + struct ve_struct *ve; + + ve = get_ve_by_id(veid); + if (!ve) + return -EINVAL; + + ve->meminfo_val = val; + real_put_ve(ve); + return 0; +#else + return -ENOTTY; +#endif +} + +static int init_ve_meminfo(struct ve_struct *ve) +{ + ve->meminfo_val = 0; + return 0; +} + +static inline void fini_ve_meminfo(struct ve_struct *ve) +{ +} + +static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) +{ + read_lock(&tsk->fs->lock); + ve->root_path = tsk->fs->root; + read_unlock(&tsk->fs->lock); + mark_tree_virtual(&ve->root_path); +} + +static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) +{ + /* required for real_setdevperms from register_ve_ above */ + memcpy(&ve->ve_cap_bset, &tsk->cap_effective, sizeof(kernel_cap_t)); + cap_lower(ve->ve_cap_bset, CAP_SETVEID); +} + +static int ve_list_add(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + if (__find_ve_by_id(ve->veid) != NULL) + goto err_exists; + + list_add(&ve->ve_list, &ve_list_head); + nr_ve++; + write_unlock_irq(&ve_list_lock); + return 0; + +err_exists: + write_unlock_irq(&ve_list_lock); + return -EEXIST; +} + +static void ve_list_del(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + list_del(&ve->ve_list); + nr_ve--; + write_unlock_irq(&ve_list_lock); +} + +static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) +{ + kernel_cap_t bset; + + spin_lock(&task_capability_lock); + bset = ve->ve_cap_bset; + tsk->cap_effective = cap_intersect(tsk->cap_effective, bset); + tsk->cap_inheritable = cap_intersect(tsk->cap_inheritable, bset); + tsk->cap_permitted = cap_intersect(tsk->cap_permitted, bset); + spin_unlock(&task_capability_lock); +} + +void ve_move_task(struct task_struct *tsk, struct ve_struct *new) +{ + struct ve_struct *old; + + might_sleep(); + BUG_ON(tsk != current); + BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk))); + + /* this probihibts ptracing of task entered to VE from host system */ + tsk->mm->vps_dumpable = 0; + /* setup capabilities before enter */ + set_task_ve_caps(tsk, new); + + old = tsk->ve_task_info.owner_env; + tsk->ve_task_info.owner_env = new; + tsk->ve_task_info.exec_env = new; + + write_lock_irq(&tasklist_lock); + list_del_rcu(&tsk->ve_task_info.vetask_list); + write_unlock_irq(&tasklist_lock); + + synchronize_rcu(); + + write_lock_irq(&tasklist_lock); + list_add_tail_rcu(&tsk->ve_task_info.vetask_list, + &new->vetask_lh); + write_unlock_irq(&tasklist_lock); + + atomic_dec(&old->pcounter); + real_put_ve(old); + + atomic_inc(&new->pcounter); + get_ve(new); + + tsk->cgroups = new->ve_css_set; +} + +EXPORT_SYMBOL(ve_move_task); + +#ifdef CONFIG_VE_IPTABLES + +#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ +({ \ + int ret = 0; \ + if (VE_IPT_CMP(mask, full_mask) && \ + VE_IPT_CMP((ve)->_iptables_modules, \ + full_mask & ~(full_mask##_MOD))) { \ + ret = KSYMERRCALL(1, mod, name, args); \ + if (ret == 0) \ + (ve)->_iptables_modules |= \ + full_mask##_MOD; \ + if (ret == 1) \ + ret = 0; \ + } \ + ret; \ +}) + +#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ +({ \ + if (VE_IPT_CMP(mask, 
full_mask##_MOD)) \ + KSYMSAFECALL_VOID(mod, name, args); \ +}) + + +static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, + int init_or_cleanup) +{ + int err; + + /* Remove when userspace will start supplying IPv6-related bits. */ + init_mask &= ~VE_IP_IPTABLES6; + init_mask &= ~VE_IP_FILTER6; + init_mask &= ~VE_IP_MANGLE6; + init_mask &= ~VE_IP_IPTABLE_NAT_MOD; + init_mask &= ~VE_NF_CONNTRACK_MOD; + if ((init_mask & VE_IP_IPTABLES) == VE_IP_IPTABLES) + init_mask |= VE_IP_IPTABLES6; + if ((init_mask & VE_IP_FILTER) == VE_IP_FILTER) + init_mask |= VE_IP_FILTER6; + if ((init_mask & VE_IP_MANGLE) == VE_IP_MANGLE) + init_mask |= VE_IP_MANGLE6; + if ((init_mask & VE_IP_NAT) == VE_IP_NAT) + init_mask |= VE_IP_IPTABLE_NAT; + + if ((init_mask & VE_IP_CONNTRACK) == VE_IP_CONNTRACK) + init_mask |= VE_NF_CONNTRACK; + + err = 0; + if (!init_or_cleanup) + goto cleanup; + + /* init part */ +#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ + defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_NF_CONNTRACK, + nf_conntrack, nf_conntrack_init_ve, ()); + if (err < 0) + goto err_nf_conntrack; + + err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, + nf_conntrack_ipv4, init_nf_ct_l3proto_ipv4, ()); + if (err < 0) + goto err_nf_conntrack_ipv4; +#endif +#if defined(CONFIG_NF_NAT) || \ + defined(CONFIG_NF_NAT_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, + nf_nat, nf_nat_init, ()); + if (err < 0) + goto err_nftable_nat; + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLE_NAT, + iptable_nat, init_nftable_nat, ()); + if (err < 0) + goto err_nftable_nat2; +#endif + return 0; + +/* ------------------------------------------------------------------------- */ + +cleanup: +#if defined(CONFIG_NF_NAT) || \ + defined(CONFIG_NF_NAT_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLE_NAT, + iptable_nat, fini_nftable_nat, ()); +err_nftable_nat2: + KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, + nf_nat, nf_nat_cleanup, ()); +err_nftable_nat: +#endif +#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ + defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, + nf_conntrack_ipv4, fini_nf_ct_l3proto_ipv4, ()); +err_nf_conntrack_ipv4: + KSYMIPTFINI(ve->_iptables_modules, VE_NF_CONNTRACK, + nf_conntrack, nf_conntrack_cleanup_ve, ()); +err_nf_conntrack: +#endif + /* Do not reset _iptables_modules as + * net hooks used one + */ + return err; +} + +static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + return do_ve_iptables(ve, init_mask, 1); +} + +static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + (void)do_ve_iptables(ve, init_mask, 0); +} + +#else +#define init_ve_iptables(x, y) (0) +#define fini_ve_iptables(x, y) do { } while (0) +#endif + +static inline int init_ve_cpustats(struct ve_struct *ve) +{ + ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); + return ve->cpu_stats == NULL ? 
-ENOMEM : 0; +} + +static inline void free_ve_cpustats(struct ve_struct *ve) +{ + free_percpu(ve->cpu_stats); + ve->cpu_stats = NULL; +} + +static int alone_in_pgrp(struct task_struct *tsk) +{ + struct task_struct *p; + int alone = 0; + + read_lock(&tasklist_lock); + do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p); + do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p); + alone = 1; +out: + read_unlock(&tasklist_lock); + return alone; +} + +static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + struct task_struct *tsk; + struct ve_struct *old; + struct ve_struct *old_exec; + struct ve_struct *ve; + __u64 init_mask; + int err; + struct nsproxy *old_ns, *old_ns_net; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + tsk = current; + old = VE_TASK_INFO(tsk)->owner_env; + + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + return -EINVAL; + + if (tsk->signal->tty) { + printk("ERR: CT init has controlling terminal\n"); + return -EINVAL; + } + if (task_pgrp(tsk) != task_pid(tsk) || + task_session(tsk) != task_pid(tsk)) { + int may_setsid; + + read_lock(&tasklist_lock); + may_setsid = !tsk->signal->leader && + !find_task_by_pid_type_ns(PIDTYPE_PGID, task_pid_nr(tsk), &init_pid_ns); + read_unlock(&tasklist_lock); + + if (!may_setsid) { + printk("ERR: CT init is process group leader\n"); + return -EINVAL; + } + } + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID and must fail. */ + if (!alone_in_pgrp(tsk)) { + printk("ERR: CT init is not alone in process group\n"); + return -EINVAL; + } + + + VZTRACE("%s: veid=%d classid=%d pid=%d\n", + __FUNCTION__, veid, class_id, current->pid); + + err = -ENOMEM; + ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL); + if (ve == NULL) + goto err_struct; + + init_ve_struct(ve, veid, class_id, data, datalen); + __module_get(THIS_MODULE); + down_write(&ve->op_sem); + if (flags & VE_LOCK) + ve->is_locked = 1; + + /* + * this should be done before adding to list + * because if calc_load_ve finds this ve in + * list it will be very surprised + */ + if ((err = init_ve_cpustats(ve)) < 0) + goto err_cpu_stats; + + if ((err = ve_list_add(ve)) < 0) + goto err_exist; + + /* this should be done before context switching */ + if ((err = init_printk(ve)) < 0) + goto err_log_wait; + + old_exec = set_exec_env(ve); + + if ((err = init_ve_sched(ve)) < 0) + goto err_sched; + + set_ve_root(ve, tsk); + + if ((err = init_ve_sysfs(ve))) + goto err_sysfs; + + if ((err = init_ve_mibs(ve))) + goto err_mibs; + + if ((err = init_ve_namespaces(ve, &old_ns))) + goto err_ns; + + if ((err = init_ve_proc(ve))) + goto err_proc; + + + init_mask = data ? 
data->iptables_mask : VE_IP_DEFAULT; + +#ifdef CONFIG_VE_IPTABLES + /* Set up ipt_mask as it will be used during + * net namespace initialization + */ + ve->ipt_mask = init_mask; +#endif + + if ((err = init_ve_netns(ve, &old_ns_net))) + goto err_netns; + + if ((err = init_ve_cgroups(ve))) + goto err_cgroup; + + if ((err = init_ve_tty_drivers(ve)) < 0) + goto err_tty; + + if ((err = init_ve_shmem(ve))) + goto err_shmem; + + if ((err = init_ve_devpts(ve))) + goto err_devpts; + + if((err = init_ve_meminfo(ve))) + goto err_meminf; + + set_ve_caps(ve, tsk); + + /* It is safe to initialize netfilter here as routing initialization and + interface setup will be done below. This means that NO skb can be + passed inside. Den */ + /* iptables ve initialization for non ve0; + ve0 init is in module_init */ + + if ((err = init_ve_iptables(ve, init_mask)) < 0) + goto err_iptables; + + if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0) + goto err_vpid; + + if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0) + goto err_ve_hook; + + put_nsproxy(old_ns); + put_nsproxy(old_ns_net); + + /* finally: set vpids and move inside */ + ve_move_task(tsk, ve); + + ve->is_running = 1; + up_write(&ve->op_sem); + + printk(KERN_INFO "CT: %d: started\n", veid); + return veid; + +err_ve_hook: + mntget(ve->proc_mnt); +err_vpid: + fini_venet(ve); + fini_ve_iptables(ve, init_mask); +err_iptables: + fini_ve_meminfo(ve); +err_meminf: + fini_ve_devpts(ve); +err_devpts: + fini_ve_shmem(ve); +err_shmem: + fini_ve_tty_drivers(ve); +err_tty: + fini_ve_cgroups(ve); +err_cgroup: + fini_ve_namespaces(ve, old_ns_net); + put_nsproxy(old_ns_net); + ve->ve_netns->sysfs_completion = &sysfs_completion; + put_net(ve->ve_netns); + wait_for_completion(&sysfs_completion); +err_netns: + /* + * If process hasn't become VE's init, proc_mnt won't be put during + * pidns death, so this mntput by hand is needed. If it has, we + * compensate with mntget above. + */ + mntput(ve->proc_mnt); + fini_ve_proc(ve); +err_proc: + /* free_ve_utsname() is called inside real_put_ve() */ + fini_ve_namespaces(ve, old_ns); + put_nsproxy(old_ns); + /* + * We need to compensate, because fini_ve_namespaces() assumes + * ve->ve_ns will continue to be used after, but VE will be freed soon + * (in kfree() sense). + */ + put_nsproxy(ve->ve_ns); +err_ns: + fini_ve_mibs(ve); +err_mibs: + fini_ve_sysfs(ve); +err_sysfs: + /* It is safe to restore current->envid here because + * ve_fairsched_detach does not use current->envid. */ + /* Really fairsched code uses current->envid in sys_fairsched_mknod + * only. It is correct if sys_fairsched_mknod is called from + * userspace. If sys_fairsched_mknod is called from + * ve_fairsched_attach, then node->envid and node->parent_node->envid + * are explicitly set to valid value after the call. 
*/ + /* FIXME */ + VE_TASK_INFO(tsk)->owner_env = old; + VE_TASK_INFO(tsk)->exec_env = old_exec; + + fini_ve_sched(ve); +err_sched: + (void)set_exec_env(old_exec); + + /* we can jump here having incorrect envid */ + VE_TASK_INFO(tsk)->owner_env = old; + fini_printk(ve); +err_log_wait: + /* cpustats will be freed in do_env_free */ + ve_list_del(ve); + up_write(&ve->op_sem); + + real_put_ve(ve); +err_struct: + printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err); + return err; + +err_exist: + free_ve_cpustats(ve); +err_cpu_stats: + kfree(ve); + goto err_struct; +} + + +/********************************************************************** + ********************************************************************** + * + * VE start/stop callbacks + * + ********************************************************************** + **********************************************************************/ + +int real_env_create(envid_t veid, unsigned flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + int status; + struct ve_struct *ve; + + if (!flags) { + status = get_exec_env()->veid; + goto out; + } + + status = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + status = -EINVAL; + if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) + goto out; + + status = -EINVAL; + ve = get_ve_by_id(veid); + if (ve) { + if (flags & VE_TEST) { + status = 0; + goto out_put; + } + if (flags & VE_EXCLUSIVE) { + status = -EACCES; + goto out_put; + } + if (flags & VE_CREATE) { + flags &= ~VE_CREATE; + flags |= VE_ENTER; + } + } else { + if (flags & (VE_TEST|VE_ENTER)) { + status = -ESRCH; + goto out; + } + } + + if (flags & VE_CREATE) { + status = do_env_create(veid, flags, class_id, data, datalen); + goto out; + } else if (flags & VE_ENTER) + status = do_env_enter(ve, flags); + + /* else: returning EINVAL */ + +out_put: + real_put_ve(ve); +out: + return status; +} +EXPORT_SYMBOL(real_env_create); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags) +{ + struct task_struct *tsk = current; + int err; + + VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); + + err = -EBUSY; + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_up; + if (ve->is_locked && !(flags & VE_SKIPLOCK)) + goto out_up; + err = -EINVAL; + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + goto out_up; + +#ifdef CONFIG_VZ_FAIRSCHED + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) + goto out_up; +#endif + ve_sched_attach(ve); + switch_ve_namespaces(ve, tsk); + ve_move_task(current, ve); + + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID. Do not fail, just leave + * it non-virtual. + */ + if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK)) + pid_ns_attach_task(ve->ve_ns->pid_ns, tsk); + + /* Unlike VE_CREATE, we do not setsid() in VE_ENTER. + * Process is allowed to be in an external group/session. + * If user space callers wants, it will do setsid() after + * VE_ENTER. 
+ */ + err = VE_TASK_INFO(tsk)->owner_env->veid; + tsk->did_ve_enter = 1; + +out_up: + up_read(&ve->op_sem); + return err; +} + +static void env_cleanup(struct ve_struct *ve) +{ + struct ve_struct *old_ve; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + VZTRACE("real_do_env_cleanup\n"); + + down_read(&ve->op_sem); + old_ve = set_exec_env(ve); + + ve_hook_iterate_fini(VE_SS_CHAIN, ve); + + fini_venet(ve); + + /* no new packets in flight beyond this point */ + + /* kill iptables */ + /* No skb belonging to VE can exist at this point as unregister_netdev + is an operation awaiting until ALL skb's gone */ + fini_ve_iptables(ve, ve->_iptables_modules); + + fini_ve_sched(ve); + + fini_ve_devpts(ve); + fini_ve_shmem(ve); + unregister_ve_tty_drivers(ve); + fini_ve_meminfo(ve); + + fini_ve_cgroups(ve); + + fini_ve_namespaces(ve, NULL); + ve->ve_netns->sysfs_completion = &sysfs_completion; + put_net(ve->ve_netns); + wait_for_completion(&sysfs_completion); + fini_ve_mibs(ve); + fini_ve_proc(ve); + fini_ve_sysfs(ve); + + (void)set_exec_env(old_ve); + fini_printk(ve); /* no printk can happen in ve context anymore */ + + ve_list_del(ve); + up_read(&ve->op_sem); + + real_put_ve(ve); +} + +static DECLARE_COMPLETION(vzmond_complete); +static volatile int stop_vzmond; + +static int vzmond_helper(void *arg) +{ + char name[18]; + struct ve_struct *ve; + + ve = (struct ve_struct *)arg; + snprintf(name, sizeof(name), "vzmond/%d", ve->veid); + daemonize(name); + env_cleanup(ve); + module_put_and_exit(0); +} + +static void do_pending_env_cleanups(void) +{ + int err; + struct ve_struct *ve; + + spin_lock(&ve_cleanup_lock); + while (1) { + if (list_empty(&ve_cleanup_list) || need_resched()) + break; + + ve = list_first_entry(&ve_cleanup_list, + struct ve_struct, cleanup_list); + list_del(&ve->cleanup_list); + spin_unlock(&ve_cleanup_lock); + + __module_get(THIS_MODULE); + err = kernel_thread(vzmond_helper, (void *)ve, 0); + if (err < 0) { + env_cleanup(ve); + module_put(THIS_MODULE); + } + + spin_lock(&ve_cleanup_lock); + } + spin_unlock(&ve_cleanup_lock); +} + +static inline int have_pending_cleanups(void) +{ + return !list_empty(&ve_cleanup_list); +} + +static int vzmond(void *arg) +{ + daemonize("vzmond"); + set_current_state(TASK_INTERRUPTIBLE); + + while (!stop_vzmond || have_pending_cleanups()) { + schedule(); + try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + + do_pending_env_cleanups(); + set_current_state(TASK_INTERRUPTIBLE); + if (have_pending_cleanups()) + __set_current_state(TASK_RUNNING); + } + + __set_task_state(current, TASK_RUNNING); + complete_and_exit(&vzmond_complete, 0); +} + +static int __init init_vzmond(void) +{ + int pid; + struct task_struct *tsk; + + pid = kernel_thread(vzmond, NULL, 0); + if (pid > 0) { + tsk = find_task_by_vpid(pid); + BUG_ON(tsk == NULL); + ve_cleanup_thread = tsk; + } + return pid; +} + +static void fini_vzmond(void) +{ + stop_vzmond = 1; + wake_up_process(ve_cleanup_thread); + wait_for_completion(&vzmond_complete); + ve_cleanup_thread = NULL; + WARN_ON(!list_empty(&ve_cleanup_list)); +} + +void real_do_env_free(struct ve_struct *ve) +{ + VZTRACE("real_do_env_free\n"); + + free_ve_tty_drivers(ve); + free_ve_filesystems(ve); + free_ve_cpustats(ve); + printk(KERN_INFO "CT: %d: stopped\n", VEID(ve)); + kfree(ve); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(real_do_env_free); + + +/********************************************************************** + ********************************************************************** + * + * VE TTY 
handling + * + ********************************************************************** + **********************************************************************/ + +static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, + struct ve_struct *ve) +{ + size_t size; + struct tty_driver *driver; + + /* FIXME: make it a normal way (or wait till ms version) */ + + driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC); + if (!driver) + goto out; + + memcpy(driver, base, sizeof(struct tty_driver)); + + driver->driver_state = NULL; + + size = base->num * 3 * sizeof(void *); + if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { + void **p; + p = kzalloc(size, GFP_KERNEL_UBC); + if (!p) + goto out_free; + + driver->ttys = (struct tty_struct **)p; + driver->termios = (struct ktermios **)(p + driver->num); + driver->termios_locked = (struct ktermios **) + (p + driver->num * 2); + } else { + driver->ttys = NULL; + driver->termios = NULL; + driver->termios_locked = NULL; + } + + driver->owner_env = ve; + driver->flags |= TTY_DRIVER_INSTALLED; + driver->refcount = 0; + + return driver; + +out_free: + kfree(driver); +out: + return NULL; +} + +static void free_ve_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + + clear_termios(driver); + kfree(driver->ttys); + kfree(driver); +} + +static int alloc_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + /* Traditional BSD devices */ + ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); + if (!ve->pty_driver) + goto out_mem; + + ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); + if (!ve->pty_slave_driver) + goto out_mem; + + ve->pty_driver->other = ve->pty_slave_driver; + ve->pty_slave_driver->other = ve->pty_driver; +#endif + +#ifdef CONFIG_UNIX98_PTYS + ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); + if (!ve->ptm_driver) + goto out_mem; + + ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); + if (!ve->pts_driver) + goto out_mem; + + ve->ptm_driver->other = ve->pts_driver; + ve->pts_driver->other = ve->ptm_driver; + + ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), + GFP_KERNEL_UBC); + if (!ve->allocated_ptys) + goto out_mem; + ida_init(ve->allocated_ptys); +#endif + return 0; + +out_mem: + free_ve_tty_drivers(ve); + return -ENOMEM; +} + +static void free_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + free_ve_tty_driver(ve->pty_driver); + free_ve_tty_driver(ve->pty_slave_driver); + ve->pty_driver = ve->pty_slave_driver = NULL; +#endif +#ifdef CONFIG_UNIX98_PTYS + free_ve_tty_driver(ve->ptm_driver); + free_ve_tty_driver(ve->pts_driver); + kfree(ve->allocated_ptys); + ve->ptm_driver = ve->pts_driver = NULL; + ve->allocated_ptys = NULL; +#endif +} + +static inline void __register_tty_driver(struct tty_driver *driver) +{ + list_add(&driver->tty_drivers, &tty_drivers); +} + +static inline void __unregister_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + list_del(&driver->tty_drivers); +} + +static int register_ve_tty_drivers(struct ve_struct* ve) +{ + mutex_lock(&tty_mutex); +#ifdef CONFIG_UNIX98_PTYS + __register_tty_driver(ve->ptm_driver); + __register_tty_driver(ve->pts_driver); +#endif +#ifdef CONFIG_LEGACY_PTYS + __register_tty_driver(ve->pty_driver); + __register_tty_driver(ve->pty_slave_driver); +#endif + mutex_unlock(&tty_mutex); + + return 0; +} + +static void unregister_ve_tty_drivers(struct ve_struct* ve) +{ + VZTRACE("unregister_ve_tty_drivers\n"); + + mutex_lock(&tty_mutex); +#ifdef CONFIG_LEGACY_PTYS + 
__unregister_tty_driver(ve->pty_driver); + __unregister_tty_driver(ve->pty_slave_driver); +#endif +#ifdef CONFIG_UNIX98_PTYS + __unregister_tty_driver(ve->ptm_driver); + __unregister_tty_driver(ve->pts_driver); +#endif + mutex_unlock(&tty_mutex); +} + +static int init_ve_tty_drivers(struct ve_struct *ve) +{ + int err; + + if ((err = alloc_ve_tty_drivers(ve))) + goto err_ttyalloc; + if ((err = register_ve_tty_drivers(ve))) + goto err_ttyreg; + return 0; + +err_ttyreg: + free_ve_tty_drivers(ve); +err_ttyalloc: + return err; +} + +static void fini_ve_tty_drivers(struct ve_struct *ve) +{ + unregister_ve_tty_drivers(ve); + free_ve_tty_drivers(ve); +} + +/* + * Free the termios and termios_locked structures because + * we don't want to get memory leaks when modular tty + * drivers are removed from the kernel. + */ +static void clear_termios(struct tty_driver *driver) +{ + int i; + struct ktermios *tp; + + if (driver->termios == NULL) + return; + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + } +} + + +/********************************************************************** + ********************************************************************** + * + * Pieces of VE network + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_NET +#include +#include +#include +#include +#include +#include +#endif + +static int ve_dev_add(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *dst_ve; + struct net *dst_net; + int err = -ESRCH; + + dst_ve = get_ve_by_id(veid); + if (dst_ve == NULL) + goto out; + + dst_net = dst_ve->ve_netns; + + rtnl_lock(); + read_lock(&dev_base_lock); + dev = __dev_get_by_name(&init_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, dst_net, dev_name, get_exec_ub()); +out_unlock: + rtnl_unlock(); + real_put_ve(dst_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +static int ve_dev_del(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *src_ve; + struct net *src_net; + int err = -ESRCH; + + src_ve = get_ve_by_id(veid); + if (src_ve == NULL) + goto out; + + src_net = src_ve->ve_netns; + + rtnl_lock(); + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(src_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, &init_net, dev_name, + netdev_bc(dev)->owner_ub); +out_unlock: + rtnl_unlock(); + real_put_ve(src_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +int real_ve_dev_map(envid_t veid, int op, char *dev_name) +{ + if (!capable(CAP_SETVEID)) + return -EPERM; + switch (op) { + case VE_NETDEV_ADD: + return ve_dev_add(veid, dev_name); + case VE_NETDEV_DEL: + return ve_dev_del(veid, dev_name); + default: + return -EINVAL; + } +} + +/********************************************************************** + ********************************************************************** + * + * VE information via /proc + * + ********************************************************************** + 
**********************************************************************/ +#ifdef CONFIG_PROC_FS +#if BITS_PER_LONG == 32 +#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) +#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" +#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" +#else +#define VESTAT_LINE_WIDTH (12 * 21) +#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" +#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" +#endif + +static int vestat_seq_show(struct seq_file *m, void *v) +{ + struct list_head *entry; + struct ve_struct *ve; + struct ve_struct *curve; + int cpu; + unsigned long user_ve, nice_ve, system_ve; + unsigned long long uptime; + cycles_t uptime_cycles, idle_time, strv_time, used; + + entry = (struct list_head *)v; + ve = list_entry(entry, struct ve_struct, ve_list); + + curve = get_exec_env(); + if (entry == ve_list_head.next || + (!ve_is_super(curve) && ve == curve)) { + /* print header */ + seq_printf(m, "%-*s\n", + VESTAT_LINE_WIDTH - 1, + "Version: 2.2"); + seq_printf(m, VESTAT_HEAD_FMT, "VEID", + "user", "nice", "system", + "uptime", "idle", + "strv", "uptime", "used", + "maxlat", "totlat", "numsched"); + } + + if (ve == get_ve0()) + return 0; + + user_ve = nice_ve = system_ve = 0; + idle_time = strv_time = used = 0; + + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + user_ve += st->user; + nice_ve += st->nice; + system_ve += st->system; + used += st->used_time; + idle_time += ve_sched_get_idle_time(ve, cpu); + } + uptime_cycles = get_cycles() - ve->start_cycles; + uptime = get_jiffies_64() - ve->start_jiffies; + + seq_printf(m, VESTAT_LINE_FMT, ve->veid, + user_ve, nice_ve, system_ve, + (unsigned long long)uptime, + (unsigned long long)idle_time, + (unsigned long long)strv_time, + (unsigned long long)uptime_cycles, + (unsigned long long)used, + (unsigned long long)ve->sched_lat_ve.last.maxlat, + (unsigned long long)ve->sched_lat_ve.last.totlat, + ve->sched_lat_ve.last.count); + return 0; +} + +void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct *curve; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + return seq_list_start(&ve_list_head, *pos); +} +EXPORT_SYMBOL(ve_seq_start); + +void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (!ve_is_super(get_exec_env())) + return NULL; + else + return seq_list_next(v, &ve_list_head, pos); +} +EXPORT_SYMBOL(ve_seq_next); + +void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} +EXPORT_SYMBOL(ve_seq_stop); + +static struct seq_operations vestat_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = vestat_seq_show +}; + +static int vestat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vestat_seq_op); +} + +static struct file_operations proc_vestat_operations = { + .open = vestat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static struct seq_operations devperms_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = devperms_seq_show, +}; + +static int devperms_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &devperms_seq_op); +} + +static struct file_operations proc_devperms_ops = { + .open = devperms_open, + .read 
= seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int vz_version_show(struct seq_file *file, void* v) +{ + static const char ver[] = VZVERSION "\n"; + + return seq_puts(file, ver); +} + +static int vz_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, vz_version_show, NULL); +} + +static struct file_operations proc_vz_version_oparations = { + .open = vz_version_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline unsigned long ve_used_mem(struct user_beancounter *ub) +{ + extern int glob_ve_meminfo; + return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held : + ub->ub_parms[UB_PRIVVMPAGES].held ; +} + +static inline void ve_mi_replace(struct meminfo *mi) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long meminfo_val; + unsigned long nodettram; + unsigned long usedmem; + + meminfo_val = get_exec_env()->meminfo_val; + + if(!meminfo_val) + return; /* No virtualization */ + + nodettram = mi->si.totalram; + ub = current->mm->mm_ub; + usedmem = ve_used_mem(ub); + + memset(mi, 0, sizeof(*mi)); + + mi->si.totalram = (meminfo_val > nodettram) ? + nodettram : meminfo_val; + mi->si.freeram = (mi->si.totalram > usedmem) ? + (mi->si.totalram - usedmem) : 0; +#else + return; +#endif +} + +static int meminfo_call(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + if (event != VIRTINFO_MEMINFO) + return old_ret; + + ve_mi_replace((struct meminfo *)arg); + + return NOTIFY_OK; +} + + +static struct vnotifier_block meminfo_notifier_block = { + .notifier_call = meminfo_call +}; + +static int __init init_vecalls_proc(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vestat", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_vestat_operations); + if (!de) + printk(KERN_WARNING "VZMON: can't make vestat proc entry\n"); + + de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_devperms_ops); + if (!de) + printk(KERN_WARNING "VZMON: can't make devperms proc entry\n"); + + de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir, + &proc_vz_version_oparations); + if (!de) + printk(KERN_WARNING "VZMON: can't make version proc entry\n"); + + virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); + return 0; +} + +static void fini_vecalls_proc(void) +{ + remove_proc_entry("version", proc_vz_dir); + remove_proc_entry("devperms", proc_vz_dir); + remove_proc_entry("vestat", proc_vz_dir); + virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); +} +#else +#define init_vecalls_proc() (0) +#define fini_vecalls_proc() do { } while (0) +#endif /* CONFIG_PROC_FS */ + + +/********************************************************************** + ********************************************************************** + * + * User ctl + * + ********************************************************************** + **********************************************************************/ + +int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VZCTL_MARK_ENV_TO_DOWN: { + /* Compatibility issue */ + err = 0; + } + break; + case VZCTL_SETDEVPERMS: { + /* Device type was mistakenly declared as dev_t + * in the old user-kernel interface. + * That's wrong, dev_t is a kernel internal type. + * I use `unsigned' not having anything better in mind. 
+ * 2001/08/11 SAW */ + struct vzctl_setdevperms s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_setdevperms(s.veid, s.type, + new_decode_dev(s.dev), s.mask); + } + break; +#ifdef CONFIG_INET + case VZCTL_VE_NETDEV: { + struct vzctl_ve_netdev d; + char *s; + err = -EFAULT; + if (copy_from_user(&d, (void __user *)arg, sizeof(d))) + break; + err = -ENOMEM; + s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); + if (s == NULL) + break; + err = -EFAULT; + if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { + s[IFNAMSIZ] = 0; + err = real_ve_dev_map(d.veid, d.op, s); + } + kfree(s); + } + break; +#endif + case VZCTL_ENV_CREATE: { + struct vzctl_env_create s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_env_create(s.veid, s.flags, s.class_id, + NULL, 0); + } + break; + case VZCTL_ENV_CREATE_DATA: { + struct vzctl_env_create_data s; + env_create_param_t *data; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err=-EINVAL; + if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || + s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || + s.data == 0) + break; + err = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + break; + + err = -EFAULT; + if (copy_from_user(data, (void __user *)s.data, + s.datalen)) + goto free_data; + err = real_env_create(s.veid, s.flags, s.class_id, + data, s.datalen); +free_data: + kfree(data); + } + break; + case VZCTL_GET_CPU_STAT: { + struct vzctl_cpustatctl s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_get_cpu_stat(s.veid, s.cpustat); + } + break; + case VZCTL_VE_MEMINFO: { + struct vzctl_ve_meminfo s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_set_meminfo(s.veid, s.val); + } + break; + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_vzcalls_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err; + + switch(cmd) { + case VZCTL_GET_CPU_STAT: { + /* FIXME */ + } + case VZCTL_COMPAT_ENV_CREATE_DATA: { + struct compat_vzctl_env_create_data cs; + struct vzctl_env_create_data __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.flags, &s->flags) || + put_user(cs.class_id, &s->class_id) || + put_user(compat_ptr(cs.data), &s->data) || + put_user(cs.datalen, &s->datalen)) + break; + err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA, + (unsigned long)s); + break; + } +#ifdef CONFIG_NET + case VZCTL_COMPAT_VE_NETDEV: { + struct compat_vzctl_ve_netdev cs; + struct vzctl_ve_netdev __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.op, &s->op) || + put_user(compat_ptr(cs.dev_name), &s->dev_name)) + break; + err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s); + break; + } +#endif + case VZCTL_COMPAT_VE_MEMINFO: { + struct compat_vzctl_ve_meminfo cs; + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = ve_set_meminfo(cs.veid, cs.val); + break; + } + default: + err = vzcalls_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo vzcalls = { + .type = VZCTLTYPE, + .ioctl = vzcalls_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzcalls_ioctl, +#endif + .owner = THIS_MODULE, +}; + + 
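/*
 * Illustrative sketch, not part of the patch: how userspace is expected
 * to drive the ioctl interface above.  VZCTL_ENV_CREATE and the
 * veid/flags/class_id layout of struct vzctl_env_create come from
 * vzcalls_ioctl() in this file; the device node path, the userspace
 * header name and the VE_CREATE/VE_EXCLUSIVE flag names are assumptions
 * (the character device itself is registered as "vzctl", major 126, by
 * kernel/ve/vzdev.c later in this patch).
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/vzcalluser.h>		// assumed header exporting VZCTL_*
 *
 *	static int create_ve(unsigned int veid)
 *	{
 *		struct vzctl_env_create c = {
 *			.veid     = veid,
 *			.flags    = VE_CREATE | VE_EXCLUSIVE,	// assumed flag names
 *			.class_id = 0,
 *		};
 *		int fd, err;
 *
 *		fd = open("/dev/vzctl", O_RDWR);
 *		if (fd < 0)
 *			return -1;
 *		err = ioctl(fd, VZCTL_ENV_CREATE, &c);	// handled by vzcalls_ioctl()
 *		close(fd);
 *		return err;
 *	}
 */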
+/********************************************************************** + ********************************************************************** + * + * Init/exit stuff + * + ********************************************************************** + **********************************************************************/ + +static int __init init_vecalls_symbols(void) +{ + KSYMRESOLVE(real_do_env_free); + KSYMMODRESOLVE(vzmon); + return 0; +} + +static void fini_vecalls_symbols(void) +{ + KSYMMODUNRESOLVE(vzmon); + KSYMUNRESOLVE(real_do_env_free); +} + +static inline __init int init_vecalls_ioctls(void) +{ + vzioctl_register(&vzcalls); + return 0; +} + +static inline void fini_vecalls_ioctls(void) +{ + vzioctl_unregister(&vzcalls); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *table_header; + +static ctl_table kernel_table[] = { + { + .procname = "ve_allow_kthreads", + .data = &ve_allow_kthreads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; + +static ctl_table root_table[] = { + {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table}, + { 0 } +}; + +static int init_vecalls_sysctl(void) +{ + table_header = register_sysctl_table(root_table); + if (!table_header) + return -ENOMEM ; + return 0; +} + +static void fini_vecalls_sysctl(void) +{ + unregister_sysctl_table(table_header); +} +#else +static int init_vecalls_sysctl(void) { return 0; } +static void fini_vecalls_sysctl(void) { ; } +#endif + +static int __init vecalls_init(void) +{ + int err; + + err = init_vecalls_sysctl(); + if (err) + goto out_vzmond; + + err = init_vzmond(); + if (err < 0) + goto out_sysctl; + + err = init_vecalls_symbols(); + if (err < 0) + goto out_sym; + + err = init_vecalls_proc(); + if (err < 0) + goto out_proc; + + err = init_vecalls_ioctls(); + if (err < 0) + goto out_ioctls; + + return 0; + +out_ioctls: + fini_vecalls_proc(); +out_proc: + fini_vecalls_symbols(); +out_sym: + fini_vzmond(); +out_sysctl: + fini_vecalls_sysctl(); +out_vzmond: + return err; +} + +static void vecalls_exit(void) +{ + fini_vecalls_ioctls(); + fini_vecalls_proc(); + fini_vecalls_symbols(); + fini_vzmond(); + fini_vecalls_sysctl(); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Control"); +MODULE_LICENSE("GPL v2"); + +module_init(vecalls_init) +module_exit(vecalls_exit) diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c new file mode 100644 index 0000000..8774e9c --- /dev/null +++ b/kernel/ve/veowner.c @@ -0,0 +1,149 @@ +/* + * kernel/ve/veowner.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +void prepare_ve0_process(struct task_struct *tsk) +{ + VE_TASK_INFO(tsk)->exec_env = get_ve0(); + VE_TASK_INFO(tsk)->owner_env = get_ve0(); + VE_TASK_INFO(tsk)->sleep_time = 0; + VE_TASK_INFO(tsk)->wakeup_stamp = 0; + VE_TASK_INFO(tsk)->sched_time = 0; + seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); + + if (tsk->pid) { + list_add_rcu(&tsk->ve_task_info.vetask_list, + &get_ve0()->vetask_lh); + atomic_inc(&get_ve0()->pcounter); + } +} + +/* + * ------------------------------------------------------------------------ + * proc entries + * ------------------------------------------------------------------------ + */ + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_vz_dir; +EXPORT_SYMBOL(proc_vz_dir); + +struct proc_dir_entry *glob_proc_vz_dir; +EXPORT_SYMBOL(glob_proc_vz_dir); + +static void prepare_proc(void) +{ + proc_vz_dir = proc_mkdir("vz", NULL); + if (!proc_vz_dir) + panic("Can't create /proc/vz dir\n"); + + glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root); + if (!proc_vz_dir) + panic("Can't create /proc/vz dir\n"); +} +#endif + +/* + * ------------------------------------------------------------------------ + * OpenVZ sysctl + * ------------------------------------------------------------------------ + */ +extern int ve_area_access_check; + +#ifdef CONFIG_INET +static struct ctl_table vz_ipv4_route_table[] = { + { + .procname = "src_check", + .data = &ip_rt_src_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { 0 } +}; + +static struct ctl_path net_ipv4_route_path[] = { + { .ctl_name = CTL_NET, .procname = "net", }, + { .ctl_name = NET_IPV4, .procname = "ipv4", }, + { .ctl_name = NET_IPV4_ROUTE, .procname = "route", }, + { } +}; +#endif + +static struct ctl_table vz_fs_table[] = { + { + .procname = "ve-area-access-check", + .data = &ve_area_access_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { 0 } +}; + +static struct ctl_path fs_path[] = { + { .ctl_name = CTL_FS, .procname = "fs", }, + { } +}; + +static void prepare_sysctl(void) +{ +#ifdef CONFIG_INET + register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table); +#endif + register_sysctl_paths(fs_path, vz_fs_table); +} + +/* + * ------------------------------------------------------------------------ + * XXX init_ve_system + * ------------------------------------------------------------------------ + */ + +void init_ve_system(void) +{ + struct task_struct *init_entry; + struct ve_struct *ve; + + ve = get_ve0(); + + init_entry = init_pid_ns.child_reaper; + /* if ve_move_task to VE0 (e.g. in cpt code) * + * occurs, ve_cap_bset on VE0 is required */ + ve->ve_cap_bset = CAP_INIT_EFF_SET; + + read_lock(&init_entry->fs->lock); + ve->root_path = init_entry->fs->root; + read_unlock(&init_entry->fs->lock); + +#ifdef CONFIG_PROC_FS + prepare_proc(); +#endif + prepare_sysctl(); +} diff --git a/kernel/ve/vzdev.c b/kernel/ve/vzdev.c new file mode 100644 index 0000000..cc4b1b7 --- /dev/null +++ b/kernel/ve/vzdev.c @@ -0,0 +1,154 @@ +/* + * kernel/ve/vzdev.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VZCTL_MAJOR 126 +#define VZCTL_NAME "vzctl" + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Interface"); +MODULE_LICENSE("GPL v2"); + +static LIST_HEAD(ioctls); +static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; + +static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd) +{ + struct vzioctlinfo *h; + + spin_lock(&ioctl_lock); + list_for_each_entry(h, &ioctls, list) { + if (h->type == _IOC_TYPE(cmd)) + goto found; + } + h = NULL; +found: + if (h && !try_module_get(h->owner)) + h = NULL; + spin_unlock(&ioctl_lock); + return h; +} + +static void vzctl_put_handler(struct vzioctlinfo *h) +{ + if (!h) + return; + + module_put(h->owner); +} + +long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOTTY; + h = vzctl_get_handler(cmd); + if (h && h->ioctl) + err = (*h->ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOIOCTLCMD; + h = vzctl_get_handler(cmd); + if (h && h->compat_ioctl) + err = (*h->compat_ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +void vzioctl_register(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_add(&inf->list, &ioctls); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_register); + +void vzioctl_unregister(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_del_init(&inf->list); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_unregister); + +/* + * Init/exit stuff. + */ +static struct file_operations vzctl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vzctl_ioctl, + .compat_ioctl = compat_vzctl_ioctl, +}; + +static struct class *vzctl_class; + +static void __exit vzctl_exit(void) +{ + device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); + class_destroy(vzctl_class); + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +} + +static int __init vzctl_init(void) +{ + int ret; + struct device *class_err; + + ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); + if (ret < 0) + goto out; + + vzctl_class = class_create(THIS_MODULE, "vzctl"); + if (IS_ERR(vzctl_class)) { + ret = PTR_ERR(vzctl_class); + goto out_cleandev; + } + + class_err = device_create(vzctl_class, NULL, + MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME); + if (IS_ERR(class_err)) { + ret = PTR_ERR(class_err); + goto out_rmclass; + } + + goto out; + +out_rmclass: + class_destroy(vzctl_class); +out_cleandev: + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +out: + return ret; +} + +module_init(vzctl_init) +module_exit(vzctl_exit); diff --git a/kernel/ve/vzevent.c b/kernel/ve/vzevent.c new file mode 100644 index 0000000..554f169 --- /dev/null +++ b/kernel/ve/vzevent.c @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define NETLINK_UEVENT 31 +#define VZ_EVGRP_ALL 0x01 + +/* + * NOTE: the original idea was to send events via kobject_uevent(), + * however, it turns out that it has negative consequences like + * start of /sbin/hotplug which tries to react on our events in inadequate manner. 
+ */ + +static struct sock *vzev_sock; + +static char *action_to_string(int action) +{ + switch (action) { + case KOBJ_MOUNT: + return "ve-mount"; + case KOBJ_UMOUNT: + return "ve-umount"; + case KOBJ_START: + return "ve-start"; + case KOBJ_STOP: + return "ve-stop"; + default: + return NULL; + } +} + +static int do_vzevent_send(int event, char *msg, int len) +{ + struct sk_buff *skb; + char *buf, *action; + int alen; + + action = action_to_string(event); + alen = strlen(action); + + skb = alloc_skb(len + 1 + alen, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + buf = skb_put(skb, len + 1 + alen); + memcpy(buf, action, alen); + buf[alen] = '@'; + memcpy(buf + alen + 1, msg, len); + (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL); + return 0; +} + +int vzevent_send(int event, const char *attrs_fmt, ...) +{ + va_list args; + int len, err; + struct ve_struct *ve; + char *page; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + goto out; + + va_start(args, attrs_fmt); + len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args); + va_end(args); + + ve = set_exec_env(get_ve0()); + err = do_vzevent_send(event, page, len); + (void)set_exec_env(ve); + free_page((unsigned long)page); +out: + return err; +} +EXPORT_SYMBOL(vzevent_send); + +static int ve_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_START, "%d", ve->veid); + return 0; +} + +static void ve_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_STOP, "%d", ve->veid); +} + +static struct ve_hook ve_start_stop_hook = { + .init = ve_start, + .fini = ve_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_AFTERALL, +}; + +static int __init init_vzevent(void) +{ + vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE); + if (vzev_sock == NULL) + return -ENOMEM; + ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook); + return 0; +} + +static void __exit exit_vzevent(void) +{ + ve_hook_unregister(&ve_start_stop_hook); + sock_release(vzev_sock->sk_socket); +} + +MODULE_LICENSE("GPL"); + +module_init(init_vzevent); +module_exit(exit_vzevent); diff --git a/kernel/ve/vzwdog.c b/kernel/ve/vzwdog.c new file mode 100644 index 0000000..c9a4024 --- /dev/null +++ b/kernel/ve/vzwdog.c @@ -0,0 +1,288 @@ +/* + * kernel/ve/vzwdog.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Staff regading kernel thread polling VE validity */ +static int sleep_timeout = 60; +static struct task_struct *wdog_thread_tsk; + +extern void show_mem(void); + +static struct file *intr_file; +static char page[PAGE_SIZE]; + +static void parse_irq_list(int len) +{ + int i, k, skip; + for (i = 0; i < len; ) { + k = i; + while (i < len && page[i] != '\n' && page[i] != ':') + i++; + skip = 0; + if (i < len && page[i] != '\n') { + i++; /* skip ':' */ + while (i < len && (page[i] == ' ' || page[i] == '0')) + i++; + skip = (i < len && (page[i] < '0' || page[i] > '9')); + while (i < len && page[i] != '\n') + i++; + } + if (!skip) + printk("%.*s\n", i - k, page + k); + if (i < len) + i++; /* skip '\n' */ + } +} + +extern loff_t vfs_llseek(struct file *file, loff_t, int); +extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *); +extern struct file *filp_open(const char *filename, int flags, int mode); +extern int filp_close(struct file *filp, fl_owner_t id); +static void show_irq_list(void) +{ + mm_segment_t fs; + int r; + + fs = get_fs(); + set_fs(KERNEL_DS); + vfs_llseek(intr_file, 0, 0); + r = vfs_read(intr_file, (void __user *)page, sizeof(page), + &intr_file->f_pos); + set_fs(fs); + + if (r > 0) + parse_irq_list(r); +} + +static void show_alloc_latency(void) +{ + static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { + "A0", + "L0", + "H0", + "L1", + "H1" + }; + int i; + + printk("lat: "); + for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { + struct kstat_lat_struct *p; + cycles_t maxlat, avg0, avg1, avg2; + + p = &kstat_glob.alloc_lat[i]; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("%s %Lu (%Lu %Lu %Lu)", + alloc_descr[i], + (unsigned long long)maxlat, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); + } + printk("\n"); +} + +static void show_schedule_latency(void) +{ + struct kstat_lat_pcpu_struct *p; + cycles_t maxlat, totlat, avg0, avg1, avg2; + unsigned long count; + + p = &kstat_glob.sched_lat; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + totlat = p->last.totlat; + count = p->last.count; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", + (unsigned long long)maxlat, + (unsigned long long)totlat, + count, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); +} + +static void show_header(void) +{ + struct timeval tv; + + do_gettimeofday(&tv); + preempt_disable(); + printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", + tv.tv_sec, (long)tv.tv_usec, + (unsigned long long)get_jiffies_64(), + smp_processor_id()); +#ifdef CONFIG_FAIRSCHED + printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", + cycles_per_jiffy, HZ); +#else + printk("*** jiffies_per_second %u ***\n", HZ); +#endif + preempt_enable(); +} + +static void show_pgdatinfo(void) +{ + pg_data_t *pgdat; + + printk("pgdat:"); + for_each_online_pgdat(pgdat) { + printk(" %d: %lu,%lu,%lu", + pgdat->node_id, + pgdat->node_start_pfn, + pgdat->node_present_pages, + pgdat->node_spanned_pages); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(",%p", pgdat->node_mem_map); +#endif + } + printk("\n"); +} + +static int 
show_partition_io(struct device *dev, void *x) +{ + char *name; + char buf[BDEVNAME_SIZE]; + struct gendisk *gd; + + gd = dev_to_disk(dev); + + name = disk_name(gd, 0, buf); + if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && + isdigit(name[4])) + return 0; + + if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && + isdigit(name[3])) + return 0; + + printk("(%u,%u) %s r(%lu %lu %lu) w(%lu %lu %lu)\n", + gd->major, gd->first_minor, + name, + disk_stat_read(gd, ios[READ]), + disk_stat_read(gd, sectors[READ]), + disk_stat_read(gd, merges[READ]), + disk_stat_read(gd, ios[WRITE]), + disk_stat_read(gd, sectors[WRITE]), + disk_stat_read(gd, merges[WRITE])); + + return 0; +} + +static void show_diskio(void) +{ + printk("disk_io: "); + class_for_each_device(&block_class, NULL, NULL, show_partition_io); + printk("\n"); +} + +static void show_nrprocs(void) +{ + unsigned long _nr_running, _nr_sleeping, + _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; + + _nr_running = nr_running(); + _nr_unint = nr_uninterruptible(); + _nr_sleeping = nr_sleeping(); + _nr_zombie = nr_zombie; + _nr_dead = atomic_read(&nr_dead); + _nr_stopped = nr_stopped(); + + printk("VEnum: %d, proc R %lu, S %lu, D %lu, " + "Z %lu, X %lu, T %lu (tot %d)\n", + nr_ve, _nr_running, _nr_sleeping, _nr_unint, + _nr_zombie, _nr_dead, _nr_stopped, nr_threads); +} + +static void wdog_print(void) +{ + show_header(); + show_irq_list(); + show_pgdatinfo(); + show_mem(); + show_diskio(); + show_schedule_latency(); + show_alloc_latency(); + show_nrprocs(); +} + +static int wdog_loop(void* data) +{ + while (1) { + wdog_print(); + try_to_freeze(); + + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule_timeout(sleep_timeout*HZ); + } + return 0; +} + +static int __init wdog_init(void) +{ + struct file *file; + + file = filp_open("/proc/interrupts", 0, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + intr_file = file; + + wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog"); + if (IS_ERR(wdog_thread_tsk)) { + filp_close(intr_file, NULL); + return -EBUSY; + } + return 0; +} + +static void __exit wdog_exit(void) +{ + kthread_stop(wdog_thread_tsk); + filp_close(intr_file, NULL); +} + +module_param(sleep_timeout, int, 0660); +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo WDOG"); +MODULE_LICENSE("GPL v2"); + +module_init(wdog_init) +module_exit(wdog_exit) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b50481..7194076 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -128,6 +128,15 @@ config DEBUG_SECTION_MISMATCH - Enable verbose reporting from modpost to help solving the section mismatches reported. +config SYSRQ_DEBUG + bool "Debugging via sysrq keys" + depends on MAGIC_SYSRQ + default y + help + Say Y if you want to extend functionality of magic key. It will + provide you with some debugging facilities such as dumping and + writing memory, resolving symbols and some other. 
+ config DEBUG_KERNEL bool "Kernel debugging" help diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 3f91472..b3e8c6b 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -38,6 +38,8 @@ static const char *kobject_actions[] = { [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", + [KOBJ_START] = "start", + [KOBJ_STOP] = "stop", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", }; diff --git a/lib/show_mem.c b/lib/show_mem.c index 238e72a..57c038d 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -8,6 +8,7 @@ #include #include #include +#include void show_mem(void) { @@ -61,3 +62,4 @@ void show_mem(void) quicklist_total_size()); #endif } +EXPORT_SYMBOL_GPL(show_mem); diff --git a/mm/filemap.c b/mm/filemap.c index 876bc59..58eead0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page) mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + ub_io_release_debug(page); mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167df..82a9fc8 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * We do use our own empty page to avoid interference with other users @@ -194,6 +195,8 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page, vma); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff --git a/mm/fremap.c b/mm/fremap.c index 7881638..b043155 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -21,6 +21,8 @@ #include #include +#include + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -36,6 +38,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page, vma); + pb_remove_ref(page, mm); page_cache_release(page); update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); @@ -62,8 +65,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { zap_pte(mm, vma, addr, pte); + ub_unused_privvm_inc(mm, vma); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); /* @@ -240,4 +245,5 @@ out: return err; } +EXPORT_SYMBOL_GPL(sys_remap_file_pages); diff --git a/mm/memory.c b/mm/memory.c index 1002f47..cf94817 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -62,6 +65,11 @@ #include #include +#include +#include +#include +#include + #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -118,18 +126,21 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); } +EXPORT_SYMBOL_GPL(pgd_clear_bad); void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +EXPORT_SYMBOL_GPL(pud_clear_bad); void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } +EXPORT_SYMBOL_GPL(pmd_clear_bad); /* * Note: this doesn't free the actual pages themselves. 
That @@ -340,6 +351,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) pte_free(mm, new); return 0; } +EXPORT_SYMBOL_GPL(__pte_alloc); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { @@ -481,6 +493,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, out: return pfn_to_page(pfn); } +EXPORT_SYMBOL_GPL(vm_normal_page); /* * copy one vm_area from one task to the other. Assumes the page tables @@ -491,7 +504,7 @@ out: static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + unsigned long addr, int *rss, struct page_beancounter **pbc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -546,6 +559,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page, vma, addr); + pb_dup_ref(page, dst_mm, pbc); rss[!!PageAnon(page)]++; } @@ -553,20 +567,35 @@ out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); } +#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) +#ifdef CONFIG_BEANCOUNTERS +#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) +#else +#define same_ub(mm1, mm2) 1 +#endif + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[2], rss_tot; + struct page_beancounter *pbc; + int err; + err = -ENOMEM; + pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL; again: + if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) + goto out; rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) - return -ENOMEM; + goto out; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -587,23 +616,32 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss, + &pbc); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); + rss_tot = rss[0] + rss[1]; + ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(dst_pte - 1, dst_ptl); cond_resched(); if (addr != end) goto again; - return 0; + + err = 0; +out: + pb_free_list(&pbc); + return err; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + pud_t *dst_pud, pud_t *src_pud, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -618,14 +656,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 
+ pgd_t *dst_pgd, pgd_t *src_pgd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -640,19 +680,21 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) +int __copy_page_range(struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, + unsigned long addr, size_t size) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = vma->vm_mm; pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long end = addr + size; int ret; /* @@ -686,7 +728,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + dst_vma, vma, addr, next))) { ret = -ENOMEM; break; } @@ -697,6 +739,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, vma->vm_start, end); return ret; } +EXPORT_SYMBOL_GPL(__copy_page_range); + +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *dst_vma, struct vm_area_struct *vma) +{ + if (dst_vma->vm_mm != dst) + BUG(); + if (vma->vm_mm != src) + BUG(); + return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); +} static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, @@ -708,6 +761,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; int file_rss = 0; int anon_rss = 0; + int rss; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -762,6 +816,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, file_rss--; } page_remove_rmap(page, vma); + pb_remove_ref(page, mm); tlb_remove_page(tlb, page); continue; } @@ -776,6 +831,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + rss = -(file_rss + anon_rss); + ub_unused_privvm_add(mm, vma, rss); add_mm_rss(mm, file_rss, anon_rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1768,6 +1825,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int reuse = 0, ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -1839,6 +1897,7 @@ reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ClearPageCheckpointed(old_page); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; @@ -1852,6 +1911,9 @@ reuse: gotten: pte_unmap_unlock(page_table, ptl); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; VM_BUG_ON(old_page == ZERO_PAGE(0)); @@ -1870,12 +1932,15 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { + 
pb_remove_ref(old_page, mm); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } - } else + } else { + ub_unused_privvm_dec(mm, vma); inc_mm_counter(mm, anon_rss); + } flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1890,6 +1955,7 @@ gotten: update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + pb_add_ref(new_page, mm, &pbc); if (old_page) { /* @@ -1927,6 +1993,7 @@ gotten: page_cache_release(new_page); if (old_page) page_cache_release(old_page); + pb_free(&pbc); unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { @@ -1949,6 +2016,8 @@ unlock: oom_free_new: page_cache_release(new_page); oom: + pb_free(&pbc); +oom_nopb: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; @@ -2256,10 +2325,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int ret = 0; + struct page_beancounter *pbc; + cycles_t start; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; + goto out_nostat; + if (unlikely(pb_alloc(&pbc))) + return VM_FAULT_OOM; + + start = get_cycles(); entry = pte_to_swp_entry(orig_pte); if (is_migration_entry(entry)) { migration_entry_wait(mm, pmd, address); @@ -2313,6 +2388,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* The page isn't present yet, go ahead with the fault. */ inc_mm_counter(mm, anon_rss); + ub_percpu_inc(mm->mm_ub, swapin); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2322,10 +2398,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); + try_to_remove_exclusive_swap_page(page); unlock_page(page); if (write_access) { @@ -2340,10 +2417,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock: pte_unmap_unlock(page_table, ptl); out: + pb_free(&pbc); + spin_lock_irq(&kstat_glb_lock); + KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); + spin_unlock_irq(&kstat_glb_lock); +out_nostat: return ret; out_nomap: mem_cgroup_uncharge_page(page); pte_unmap_unlock(page_table, ptl); + pb_free(&pbc); unlock_page(page); page_cache_release(page); return ret; @@ -2361,10 +2444,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; spinlock_t *ptl; pte_t entry; + struct page_beancounter *pbc; /* Allocate our own private page. 
*/ pte_unmap(page_table); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2384,11 +2471,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); page_add_new_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); unlock: + pb_free(&pbc); pte_unmap_unlock(page_table, ptl); return 0; release: @@ -2398,6 +2488,8 @@ release: oom_free_page: page_cache_release(page); oom: + pb_free(&pbc); +oom_nopb: return VM_FAULT_OOM; } @@ -2424,6 +2516,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int anon = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; struct vm_fault vmf; int ret; int page_mkwrite = 0; @@ -2433,9 +2526,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.flags = flags; vmf.page = NULL; + ret = VM_FAULT_OOM; + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) - return ret; + goto out_fault; /* * For consistency in subsequent calls, make the faulted page always @@ -2516,6 +2613,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ /* Only go through if we didn't race with anybody else... */ if (likely(pte_same(*page_table, orig_pte))) { + struct user_beancounter *ub; + flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) @@ -2533,6 +2632,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } + ub = page_ub(page); + if (ub != NULL && +#ifdef CONFIG_BC_IO_ACCOUNTING + !((unsigned long)ub & PAGE_IO_MARK) && +#endif + ub->ub_magic == UB_MAGIC) { + /* + * WOW: Page was already charged as page_ub. This may + * happens for example then some driver export its low + * memory pages to user space. We can't account page as + * page_ub and page_bp at the same time. So uncharge + * page from UB counter. 
+ */ + WARN_ON_ONCE(1); + ub_page_uncharge(page, 0); + } + + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); @@ -2558,7 +2676,9 @@ out_unlocked: set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } - +out_fault: + pb_free(&pbc); +oom_nopb: return ret; } @@ -2684,6 +2804,27 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; +#ifdef CONFIG_VZ_GENCALLS + do { + int ret; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + + tbc = &current->task_bc; + if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) && + tbc->pgfault_allot) { + tbc->pgfault_allot--; + break; /* skip notifier */ + } +#endif + ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN, + (void *)1); + if (ret & NOTIFY_FAIL) + return VM_FAULT_SIGBUS; + if (ret & NOTIFY_OK) + return VM_FAULT_MINOR; /* retry */ + } while (0); +#endif __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); @@ -2728,6 +2869,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) } #endif /* __PAGETABLE_PUD_FOLDED */ +EXPORT_SYMBOL_GPL(__pud_alloc); + #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. @@ -2758,6 +2901,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +EXPORT_SYMBOL_GPL(__pmd_alloc); + int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -2787,6 +2932,8 @@ int make_pages_present(unsigned long addr, unsigned long end) return ret == len ? 0 : -ENOMEM; } +EXPORT_SYMBOL(make_pages_present); + #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b..0e1a6bf 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, init_waitqueue_head(&pool->wait); pool->alloc = alloc_fn; pool->free = free_fn; + if (alloc_fn == mempool_alloc_slab) + kmem_mark_nocharge((struct kmem_cache *)pool_data); /* * First pre-allocate the guaranteed number of buffers.
@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ gfp_mask &= ~__GFP_UBC;
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
+ gfp_mask &= ~__GFP_UBC;
gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93..0488f60 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,12 @@
#include
#include
#include
+#include
#include
#include
#include
#include
+#include
int can_do_mlock(void)
{
@@ -36,6 +38,14 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out;
}
+ if (newflags & VM_LOCKED) {
+ ret = ub_locked_charge(mm, end - start);
+ if (ret < 0) {
+ *prev = vma;
+ goto out;
+ }
+ }
+
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -49,13 +59,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (start != vma->vm_start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
- goto out;
+ goto out_uncharge;
}
if (end != vma->vm_end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
- goto out;
+ goto out_uncharge;
}
success:
@@ -74,11 +84,17 @@ success:
pages = -pages;
if (!(newflags & VM_IO))
ret = make_pages_present(start, end);
- }
+ } else
+ ub_locked_uncharge(mm, end - start);
mm->locked_vm -= pages;
out:
return ret;
+
+out_uncharge:
+ if (newflags & VM_LOCKED)
+ ub_locked_uncharge(mm, end - start);
+ goto out;
}
static int do_mlock(unsigned long start, size_t len, int on)
{
@@ -155,6 +171,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL_GPL(sys_mlock);
asmlinkage long sys_munlock(unsigned long start, size_t len)
{
@@ -167,6 +184,7 @@ asmlinkage long sys_munlock(unsigned long start, size_t len)
up_write(&current->mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL_GPL(sys_munlock);
static int do_mlockall(int flags)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68..0e47a12 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
#include
#include
#include
+#include
#include
#include
@@ -39,10 +40,13 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
+#include
+
#ifndef arch_rebalance_pgtables
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft);
static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long start, unsigned long end);
@@ -108,6 +112,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
vm_acct_memory(pages);
+#ifdef CONFIG_BEANCOUNTERS
+ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM,
+ (void *)pages)
+ & (NOTIFY_OK | NOTIFY_FAIL)) {
+ case NOTIFY_OK:
+ return 0;
+ case NOTIFY_FAIL:
+ vm_unacct_memory(pages);
+ return -ENOMEM;
+ }
+#endif
+
/*
* Sometimes we want to use more memory than we have
*/
@@ -232,6 +248,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
struct vm_area_struct *next = vma->vm_next;
might_sleep();
+
+ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
+ vma->vm_flags, vma->vm_file);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file) {
@@ -289,7 +308,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
goto out;
/* Ok, looks good - let it rip. */
- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
@@ -1100,6 +1119,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+ unsigned long ub_charged = 0;
/* Clear old maps */
error = -ENOMEM;
@@ -1134,6 +1154,11 @@ munmap_back:
}
}
+ if (ub_memory_charge(mm, len, vm_flags, file,
+ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD)))
+ goto charge_error;
+ ub_charged = 1;
+
/*
* Can we just expand an old private anonymous mapping?
* The VM_SHARED test is necessary because shmem_zero_setup
@@ -1149,7 +1174,8 @@ munmap_back:
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
+ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0));
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
@@ -1179,6 +1205,19 @@ munmap_back:
goto unmap_and_free_vma;
if (vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
+ if (vm_flags != vma->vm_flags) {
+ /*
+ * ->vm_flags has been changed in f_op->mmap method.
+ * We have to recharge ub memory.
+ */
+ ub_memory_uncharge(mm, len, vm_flags, file);
+ if (ub_memory_charge(mm, len, vma->vm_flags, file,
+ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) {
+ ub_charged = 0;
+ error = -ENOMEM;
+ goto unmap_and_free_vma;
+ }
+ }
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
@@ -1243,6 +1282,9 @@ unmap_and_free_vma:
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
+ if (ub_charged)
+ ub_memory_uncharge(mm, len, vm_flags, file);
+charge_error:
if (charged)
vm_unacct_memory(charged);
return error;
@@ -1565,12 +1607,16 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
+ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
+ vma->vm_file, UB_SOFT))
+ goto fail_charge;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory(grow))
- return -ENOMEM;
+ goto fail_sec;
/* Ok, everything looks good - let it rip */
mm->total_vm += grow;
@@ -1578,6 +1624,11 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
+
+fail_sec:
+ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
+fail_charge:
+ return -ENOMEM;
}
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
@@ -1862,6 +1913,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
return 0;
}
+EXPORT_SYMBOL_GPL(split_vma);
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
@@ -1955,7 +2007,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
@@ -2021,8 +2073,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (ub_memory_charge(mm, len, flags, NULL, soft))
+ goto fail_charge;
+
if (security_vm_enough_memory(len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto fail_sec;
/* Can we just expand an old private anonymous mapping? */
if (vma_merge(mm, prev, addr, addr + len, flags,
@@ -2032,11 +2087,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
/*
* create a vma struct for an anonymous mapping
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
- }
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
+ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0));
+ if (!vma)
+ goto fail_alloc;
vma->vm_mm = mm;
vma->vm_start = addr;
@@ -2052,8 +2106,19 @@ out:
make_pages_present(addr, addr + len);
}
return addr;
+
+fail_alloc:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+fail_sec:
+ ub_memory_uncharge(mm, len, flags, NULL);
+fail_charge:
+ return -ENOMEM;
}
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+ return __do_brk(addr, len, UB_SOFT);
+}
EXPORT_SYMBOL(do_brk);
/* Release all mmaps. */
@@ -2231,10 +2296,11 @@ static void special_mapping_close(struct vm_area_struct *vma)
{
}
-static struct vm_operations_struct special_mapping_vmops = {
+struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
};
+EXPORT_SYMBOL_GPL(special_mapping_vmops);
/*
* Called with mm->mmap_sem held for writing.
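The mm/mmap.c changes above all follow one accounting discipline: a mapping is charged to the owning beancounter with ub_memory_charge() before any state is committed (mmap_region(), acct_stack_growth(), __do_brk()), and every later failure path, as well as remove_vma() when the mapping finally goes away, undoes it with ub_memory_uncharge(). The real helpers also take a strictness argument (UB_SOFT for MAP_EXECPRIO requests, UB_HARD otherwise), and mmap_region() re-charges when f_op->mmap has changed ->vm_flags. The fragment below is only a minimal user-space sketch of that charge-before-commit / uncharge-on-error pattern; struct beancounter, bc_charge(), bc_uncharge() and map_region() are invented names for illustration, not the interfaces added by this patch.

#include <stdio.h>
#include <errno.h>

/* Toy resource counter standing in for a per-container beancounter. */
struct beancounter {
    unsigned long held;     /* bytes currently charged */
    unsigned long limit;    /* hard limit on charged bytes */
};

/* Charge 'size' bytes against 'bc'; fail with no side effects when over the limit. */
static int bc_charge(struct beancounter *bc, unsigned long size)
{
    if (bc->held + size > bc->limit)
        return -ENOMEM;
    bc->held += size;
    return 0;
}

static void bc_uncharge(struct beancounter *bc, unsigned long size)
{
    bc->held -= size;
}

/*
 * mmap_region()-style flow: charge first, then do the work that can fail,
 * and roll the charge back on every error path.
 */
static int map_region(struct beancounter *bc, unsigned long len, int setup_fails)
{
    int err = bc_charge(bc, len);

    if (err)
        return err;             /* nothing charged yet, nothing to undo */
    if (setup_fails) {          /* stands in for vma allocation/setup errors */
        bc_uncharge(bc, len);   /* mirror of the unacct_error/charge_error labels */
        return -ENOMEM;
    }
    return 0;                   /* charge stays until the region is removed */
}

int main(void)
{
    struct beancounter bc = { .held = 0, .limit = 1UL << 20 };

    printf("ok:   %d held=%lu\n", map_region(&bc, 4096, 0), bc.held);
    printf("fail: %d held=%lu\n", map_region(&bc, 4096, 1), bc.held);
    return 0;
}

Charging before the object exists keeps the error handling trivial: each error label only has to undo the charge, which is exactly what the out_uncharge, charge_error, fail_sec and fail_charge labels introduced above do.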
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 16ce8b9..e9a5958 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -13,6 +13,7 @@ struct pglist_data *first_online_pgdat(void)
{
return NODE_DATA(first_online_node);
}
+EXPORT_SYMBOL_GPL(first_online_pgdat);
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
@@ -22,6 +23,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
+EXPORT_SYMBOL_GPL(next_online_pgdat);
/*
* next_zone - helper magic for for_each_zone()
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f..93d0e7c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -9,6 +9,7 @@
*/
#include
+#include
#include
#include
#include
@@ -27,6 +28,8 @@
#include
#include
+#include
+
#ifndef pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
@@ -143,6 +146,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long charged = 0;
pgoff_t pgoff;
int error;
+ unsigned long ch_size;
+ int ch_dir;
int dirty_accountable = 0;
if (newflags == oldflags) {
@@ -150,6 +155,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
return 0;
}
+ error = -ENOMEM;
+ ch_size = nrpages - pages_in_vma_range(vma, start, end);
+ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma);
+ if (ch_dir == PRIVVM_ERROR)
+ goto fail_ch;
+
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
@@ -160,7 +171,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
- return -ENOMEM;
+ goto fail_sec;
newflags |= VM_ACCOUNT;
}
}
@@ -212,10 +223,16 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ if (ch_dir == PRIVVM_TO_SHARED)
+ __ub_unused_privvm_dec(mm, ch_size);
return 0;
fail:
vm_unacct_memory(charged);
+fail_sec:
+ if (ch_dir == PRIVVM_TO_PRIVATE)
+ __ub_unused_privvm_dec(mm, ch_size);
+fail_ch:
return error;
}
@@ -317,3 +334,4 @@ out:
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL_GPL(sys_mprotect);
diff --git a/mm/mremap.c b/mm/mremap.c
index 1a77439..107144a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,8 @@
#include
#include
+#include
+
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -173,17 +175,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long hiwater_vm;
int split = 0;
+ if (ub_memory_charge(mm, new_len, vm_flags,
+ vma->vm_file, UB_HARD))
+ goto err;
+
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/ if (mm->map_count >= sysctl_max_map_count - 3) - return -ENOMEM; + goto err_nomem; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) - return -ENOMEM; + goto err_nomem; moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { @@ -242,7 +248,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_addr + new_len); } - return new_addr; + if (new_addr != -ENOMEM) + return new_addr; + +err_nomem: + ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); +err: + return -ENOMEM; } /* @@ -370,7 +382,15 @@ unsigned long do_mremap(unsigned long addr, max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? */ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long len; + int pages; + + len = new_len - old_len; + pages = len >> PAGE_SHIFT; + ret = -ENOMEM; + if (ub_memory_charge(mm, len, vma->vm_flags, + vma->vm_file, UB_HARD)) + goto out; vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 64e5b4b..365405b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -28,6 +30,9 @@ #include #include +#include +#include + int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; @@ -200,16 +205,16 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, +struct task_struct *select_bad_process(struct user_beancounter *ub, struct mem_cgroup *mem) { struct task_struct *g, *p; struct task_struct *chosen = NULL; struct timespec uptime; - *ppoints = 0; + unsigned long chosen_points = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + do_each_thread_all(g, p) { unsigned long points; /* @@ -223,6 +228,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, continue; if (mem && !task_in_mem_cgroup(p, mem)) continue; + if (ub_oom_task_skip(ub, p)) + continue; /* * This task already has access to memory reserves and is @@ -251,18 +258,18 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + chosen_points = ULONG_MAX; } if (p->oomkilladj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > chosen_points || !chosen) { chosen = p; - *ppoints = points; + chosen_points = points; } - } while_each_thread(g, p); + } while_each_thread_all(g, p); return chosen; } @@ -286,7 +293,7 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * total_vm and rss sizes do not exist for tasks with a * detached mm so there's no need to report them. 
@@ -302,7 +309,7 @@ static void dump_tasks(const struct mem_cgroup *mem) get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, p->comm); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); } /* @@ -337,13 +344,16 @@ static void __oom_kill_task(struct task_struct *p, int verbose) set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); + ub_oom_task_killed(p); } static int oom_kill_task(struct task_struct *p) { struct mm_struct *mm; + struct user_beancounter *ub; struct task_struct *g, *q; + task_lock(p); mm = p->mm; /* WARNING: mm may not be dereferenced since we did not obtain its @@ -355,16 +365,21 @@ static int oom_kill_task(struct task_struct *p) * However, this is of no concern to us. */ - if (mm == NULL) + if (mm == NULL) { + task_unlock(p); return 1; + } + + ub = get_beancounter(mm_ub(mm)); + task_unlock(p); /* * Don't kill the process if any threads are set to OOM_DISABLE */ - do_each_thread(g, q) { + do_each_thread_all(g, q) { if (q->mm == mm && q->oomkilladj == OOM_DISABLE) return 1; - } while_each_thread(g, q); + } while_each_thread_all(g, q); __oom_kill_task(p, 1); @@ -373,17 +388,18 @@ static int oom_kill_task(struct task_struct *p) * but are in a different thread group. Don't let them have access * to memory reserves though, otherwise we might deplete all memory. */ - do_each_thread(g, q) { + do_each_thread_all(g, q) { if (q->mm == mm && !same_thread_group(q, p)) force_sig(SIGKILL, q); - } while_each_thread(g, q); + } while_each_thread_all(g, q); + ub_oom_mm_killed(ub); + put_beancounter(ub); return 0; } -static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, - const char *message) +int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem, const char *message) { struct task_struct *c; @@ -406,8 +422,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + printk(KERN_ERR "%s: kill process %d (%s) or a child\n", + message, task_pid_nr(p), p->comm); /* Try to kill a child first */ list_for_each_entry(c, &p->children, sibling) { @@ -522,9 +538,9 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct task_struct *p; - unsigned long points = 0; unsigned long freed = 0; enum oom_constraint constraint; + struct user_beancounter *ub; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) @@ -534,16 +550,34 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) if (sysctl_panic_on_oom == 2) panic("out of memory. Compulsory panic_on_oom is selected.\n"); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) + & (NOTIFY_OK | NOTIFY_FAIL)) + return; + + ub = NULL; + if (ub_oom_lock()) + goto out_oom_lock; + + read_lock(&tasklist_lock); + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + dump_stack(); + show_mem(); + show_slab_info(); + } + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. 
*/ constraint = constrained_alloc(zonelist, gfp_mask); - read_lock(&tasklist_lock); switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, points, NULL, + oom_kill_process(current, gfp_mask, order, NULL, "No available memory (MPOL_BIND)"); break; @@ -553,27 +587,33 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) /* Fall-through */ case CONSTRAINT_CPUSET: if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, points, NULL, + oom_kill_process(current, gfp_mask, order, NULL, "Out of memory (oom_kill_allocating_task)"); break; } retry: + put_beancounter(ub); + /* * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points, NULL); + ub = ub_oom_select_worst(); + p = select_bad_process(ub, NULL); if (PTR_ERR(p) == -1UL) goto out; /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { + if (ub != NULL) + goto retry; read_unlock(&tasklist_lock); + ub_oom_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, gfp_mask, order, points, NULL, + if (oom_kill_process(p, gfp_mask, order, NULL, "Out of memory")) goto retry; @@ -582,7 +622,10 @@ retry: out: read_unlock(&tasklist_lock); + ub_oom_unlock(); + put_beancounter(ub); +out_oom_lock: /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 24de8b6..7e20345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -35,6 +35,9 @@ #include #include +#include +#include + /* * The maximum number of pages to writeout in a single bdflush/kupdate * operation. We do this so we don't hold I_SYNC against an inode for @@ -903,6 +906,7 @@ retry: scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + struct user_beancounter *old_ub; /* * At this point we hold neither mapping->tree_lock nor @@ -933,7 +937,9 @@ retry: continue; } + old_ub = bc_io_switch_context(page); ret = (*writepage)(page, wbc, data); + bc_io_restore_context(old_ub); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); @@ -1032,12 +1038,15 @@ int write_one_page(struct page *page, int wait) .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; + struct user_beancounter *old_ub; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); + old_ub = bc_io_switch_context(page); + if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); @@ -1050,6 +1059,9 @@ int write_one_page(struct page *page, int wait) } else { unlock_page(page); } + + bc_io_restore_context(old_ub); + return ret; } EXPORT_SYMBOL(write_one_page); @@ -1081,6 +1093,9 @@ int __set_page_dirty_no_writeback(struct page *page) */ int __set_page_dirty_nobuffers(struct page *page) { + int acct; + + acct = 0; if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -1088,6 +1103,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (!mapping) return 1; + acct = 0; spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ @@ -1097,12 +1113,14 @@ int __set_page_dirty_nobuffers(struct page *page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); + acct = 1; } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1241,6 +1259,7 @@ int clear_page_dirty_for_io(struct page *page) dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + ub_io_release_context(page, PAGE_CACHE_SIZE); return 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27b8681..0654364 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,6 +51,9 @@ #include #include "internal.h" +#include +#include + /* * Array of node states. */ @@ -102,6 +105,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 32, }; +EXPORT_SYMBOL(nr_swap_pages); EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -460,8 +464,11 @@ static inline int free_pages_check(struct page *page) (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); - if (PageDirty(page)) + if (PageDirty(page)) { + ub_io_release_context(page, 0); __ClearPageDirty(page); + } else + ub_io_release_debug(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need @@ -527,6 +534,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + ub_page_uncharge(page, order); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); @@ -987,6 +995,7 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + ub_page_uncharge(page, 0); local_irq_save(flags); __count_vm_event(PGFREE); if (cold) @@ -1434,6 +1443,31 @@ try_next_zone: return page; } +extern unsigned long cycles_per_jiffy; +static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order, + struct page *page, cycles_t time) +{ +#ifdef CONFIG_VE + int ind; + unsigned long flags; + + time = (jiffies - time) * cycles_per_jiffy; + if (!(gfp_mask & __GFP_WAIT)) + ind = 0; + else if (!(gfp_mask & __GFP_HIGHMEM)) + ind = (order > 0 ? 2 : 1); + else + ind = (order > 0 ? 4 : 3); + spin_lock_irqsave(&kstat_glb_lock, flags); + KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); + if (!page) + kstat_glob.alloc_fails[ind]++; + spin_unlock_irqrestore(&kstat_glb_lock, flags); +#endif +} + +int alloc_fail_warn; + /* * This is the 'heart' of the zoned buddy allocator. 
*/ @@ -1452,6 +1486,7 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + cycles_t start; might_sleep_if(wait); @@ -1469,6 +1504,7 @@ restart: return NULL; } + start = jiffies; page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) @@ -1625,19 +1661,32 @@ nofail_alloc: do_retry = 1; } if (do_retry) { + if (total_swap_pages > 0 && nr_swap_pages == 0) { + out_of_memory(zonelist, gfp_mask, order); + goto restart; + } congestion_wait(WRITE, HZ/50); goto rebalance; } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + __alloc_collect_stats(gfp_mask, order, NULL, start); + if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && + printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); show_mem(); } + return NULL; + got_pg: + __alloc_collect_stats(gfp_mask, order, page, start); + if (ub_page_charge(page, order, gfp_mask)) { + __free_pages(page, order); + page = NULL; + } return page; } EXPORT_SYMBOL(__alloc_pages_internal); diff --git a/mm/rmap.c b/mm/rmap.c index 0383acf..3523db0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -51,6 +51,9 @@ #include #include +#include +#include + #include struct kmem_cache *anon_vma_cachep; @@ -94,6 +97,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) } return 0; } +EXPORT_SYMBOL_GPL(anon_vma_prepare); void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { @@ -119,6 +123,7 @@ void anon_vma_link(struct vm_area_struct *vma) spin_unlock(&anon_vma->lock); } } +EXPORT_SYMBOL_GPL(anon_vma_link); void anon_vma_unlink(struct vm_area_struct *vma) { @@ -150,14 +155,14 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor); } /* * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -176,12 +181,14 @@ out: rcu_read_unlock(); return NULL; } +EXPORT_SYMBOL_GPL(page_lock_anon_vma); -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(page_unlock_anon_vma); /* * At what user virtual address is page expected in @vma? @@ -676,6 +683,12 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) } mem_cgroup_uncharge_page(page); + /* + * Well, when a page is unmapped, we cannot keep PG_checkpointed + * flag, it is not accessible via process VM and we have no way + * to reset its state + */ + ClearPageCheckpointed(page); __dec_zone_page_state(page, PageAnon(page) ? 
NR_ANON_PAGES : NR_FILE_MAPPED); /* @@ -776,6 +789,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, page_remove_rmap(page, vma); + ub_unused_privvm_inc(mm, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); page_cache_release(page); out_unmap: @@ -866,6 +882,9 @@ static void try_to_unmap_cluster(unsigned long cursor, set_page_dirty(page); page_remove_rmap(page, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c index 04fb4f1..87b813a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -55,6 +55,8 @@ #include #include +#include + /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994 @@ -193,7 +195,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; -static const struct file_operations shmem_file_operations; +const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; @@ -256,7 +258,7 @@ static void shmem_free_inode(struct super_block *sb) * * It has to be called with the spinlock held. */ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long swp_freed) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; @@ -266,6 +268,8 @@ static void shmem_recalc_inode(struct inode *inode) info->alloced -= freed; shmem_unacct_blocks(info->flags, freed); shmem_free_blocks(inode, freed); + if (freed > swp_freed) + ub_tmpfs_respages_sub(info, freed - swp_freed); } } @@ -370,6 +374,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns struct page *page = kmap_atomic_to_page(entry); set_page_private(page, page_private(page) + incdec); } + + if (incdec == 1) + ub_tmpfs_respages_dec(info); + else + ub_tmpfs_respages_inc(info); } /** @@ -386,14 +395,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct page *page = NULL; swp_entry_t *entry; + unsigned long ub_val; if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return ERR_PTR(-EINVAL); + ub_val = 0; + if (info->next_index <= index) { + ub_val = index + 1 - info->next_index; + if (ub_shmpages_charge(info, ub_val)) + return ERR_PTR(-ENOSPC); + } + while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); + if (sgp == SGP_READ) { + entry = shmem_swp_map(ZERO_PAGE(0)); + goto out; + } /* * Test free_blocks against 1 not 0, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: @@ -403,7 +422,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); - return ERR_PTR(-ENOSPC); + entry = ERR_PTR(-ENOSPC); + goto out; } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; @@ -411,31 +431,43 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = 
shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | + __GFP_UBC); if (page) set_page_private(page, 0); spin_lock(&info->lock); if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); + entry = ERR_PTR(-ENOMEM); + goto out_block; } if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { entry = ERR_PTR(-EINVAL); - break; + goto out_dir; } - if (info->next_index <= index) + if (info->next_index <= index) { + ub_val = 0; info->next_index = index + 1; + } } if (page) { /* another task gave its page, or truncated the file */ shmem_free_blocks(inode, 1); shmem_dir_free(page); } - if (info->next_index <= index && !IS_ERR(entry)) + if (info->next_index <= index) info->next_index = index + 1; return entry; + +out_dir: + shmem_dir_free(page); +out_block: + shmem_free_blocks(inode, 1); +out: + if (ub_val) + ub_shmpages_uncharge(info, ub_val); + return entry; } /** @@ -543,6 +575,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) return; spin_lock(&info->lock); + ub_shmpages_uncharge(info, info->next_index - idx); info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; @@ -729,7 +762,7 @@ done2: info->swapped -= nr_swaps_freed; if (nr_pages_to_free) shmem_free_blocks(inode, nr_pages_to_free); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, nr_swaps_freed); spin_unlock(&info->lock); /* @@ -812,6 +845,7 @@ static void shmem_delete_inode(struct inode *inode) } } BUG_ON(inode->i_blocks); + shmi_ub_put(info); shmem_free_inode(inode->i_sb); clear_inode(inode); } @@ -995,6 +1029,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) out: return found; /* 0 or 1 or -ENOMEM */ } +#ifdef CONFIG_BEANCOUNTERS +#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) +#else +#define shm_get_swap_page(info) (get_swap_page(NULL)) +#endif + /* * Move the page from the page cache to the swap cache. */ @@ -1025,7 +1065,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * want to check if there's a redundant swappage to be discarded. */ if (wbc->for_reclaim) - swap = get_swap_page(); + swap = shm_get_swap_page(info); else swap.val = 0; @@ -1043,7 +1083,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { remove_from_page_cache(page); @@ -1081,6 +1121,54 @@ redirty: return 0; } +/* Insert a swap entry to shmem inode address space. 
*/ +int shmem_insertpage(struct inode * inode, unsigned long index, + swp_entry_t swap) +{ + struct shmem_inode_info *info; + swp_entry_t *entry; + int err; + + info = SHMEM_I(inode); + + spin_lock(&info->lock); + shmem_recalc_inode(inode, 0); + entry = shmem_swp_alloc(info, index, SGP_WRITE); + err = PTR_ERR(entry); + if (IS_ERR(entry)) + goto unlock; + + err = -EBUSY; + if (entry->val) + goto unlock_unmap; + + err = -EINVAL; + if (!swap_duplicate(swap)) + goto unlock_unmap; + + info->alloced++; + ub_tmpfs_respages_inc(info); + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_swp_set(info, entry, swap.val); + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + if (list_empty(&info->swaplist)) { + mutex_lock(&shmem_swaplist_mutex); + /* move instead of add in case we're racing */ + list_move_tail(&info->swaplist, &shmem_swaplist); + mutex_unlock(&shmem_swaplist_mutex); + } + return 0; + +unlock_unmap: + shmem_swp_unmap(entry); +unlock: + spin_unlock(&info->lock); + return err; +} +EXPORT_SYMBOL(shmem_insertpage); + + #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1223,7 +1311,7 @@ repeat: } spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) { spin_unlock(&info->lock); @@ -1417,6 +1505,7 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); + ub_tmpfs_respages_inc(info); if (sgp == SGP_DIRTY) set_page_dirty(filepage); } @@ -1518,6 +1607,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); + shmi_ub_set(info, get_exec_ub()); spin_lock_init(&info->lock); INIT_LIST_HEAD(&info->swaplist); @@ -2388,7 +2478,7 @@ static const struct address_space_operations shmem_aops = { .migratepage = migrate_page, }; -static const struct file_operations shmem_file_operations = { +const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, @@ -2401,6 +2491,7 @@ static const struct file_operations shmem_file_operations = { .splice_write = generic_file_splice_write, #endif }; +EXPORT_SYMBOL_GPL(shmem_file_operations); static const struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, @@ -2470,6 +2561,10 @@ static struct vm_operations_struct shmem_vm_ops = { #endif }; +int is_shmem_mapping(struct address_space *map) +{ + return (map != NULL && map->a_ops == &shmem_aops); +} static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) @@ -2477,13 +2572,19 @@ static int shmem_get_sb(struct file_system_type *fs_type, return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } -static struct file_system_type tmpfs_fs_type = { +struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; +EXPORT_SYMBOL(tmpfs_fs_type); + +#ifdef CONFIG_VE +#define shm_mnt (get_exec_env()->shmem_mnt) +#else static struct vfsmount *shm_mnt; +#endif static int __init init_tmpfs(void) { @@ -2524,6 +2625,36 @@ out4: } module_init(init_tmpfs) +static inline int shm_charge_ahead(struct inode *inode) +{ +#ifdef CONFIG_BEANCOUNTERS + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + swp_entry_t *entry; + + if (!inode->i_size) + return 0; + idx = (inode->i_size - 1) >> 
PAGE_CACHE_SHIFT; + /* + * Just touch info to allocate space for entry and + * make all UBC checks + */ + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, SGP_CACHE); + if (IS_ERR(entry)) + goto err; + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + return 0; + +err: + spin_unlock(&info->lock); + return PTR_ERR(entry); +#else + return 0; +#endif +} + /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps @@ -2570,6 +2701,9 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + error = shm_charge_ahead(inode); + if (error) + goto close_file; init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &shmem_file_operations); return file; @@ -2582,6 +2716,7 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping @@ -2598,6 +2733,8 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); + else if (vma->vm_flags & VM_WRITE) + __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; return 0; diff --git a/mm/slab.c b/mm/slab.c index e76eee4..7cd5a15 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -111,30 +111,14 @@ #include #include #include +#include +#include #include #include #include -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif +#include /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -169,19 +153,21 @@ #endif /* Legal flag mask for kmem_cache_create(). 
*/ -#if DEBUG +#if SLAB_DEBUG # define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE | \ SLAB_DEBUG_OBJECTS) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE | \ SLAB_DEBUG_OBJECTS) #endif @@ -466,12 +452,14 @@ struct kmem_cache { #define REAPTIMEOUT_CPUC (2*HZ) #define REAPTIMEOUT_LIST3 (4*HZ) -#if STATS +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_INC_SHRUNK(x) ((x)->shrunk++) + +#if SLAB_STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -494,8 +482,6 @@ struct kmem_cache { #define STATS_INC_ACTIVE(x) do { } while (0) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -508,7 +494,7 @@ struct kmem_cache { #define STATS_INC_FREEMISS(x) do { } while (0) #endif -#if DEBUG +#if SLAB_DEBUG /* * memory layout of objects: @@ -640,6 +626,8 @@ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include CACHE(ULONG_MAX) +#include + CACHE(ULONG_MAX) #undef CACHE }; EXPORT_SYMBOL(malloc_sizes); @@ -653,10 +641,17 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include + {NULL,}, +#undef CACHE +#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" }, +#include {NULL,} #undef CACHE }; +int malloc_cache_num; +EXPORT_SYMBOL(malloc_cache_num); + static struct arraycache_init initarray_cache __initdata = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = @@ -733,6 +728,7 @@ static inline void init_lock_keys(void) */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; +static spinlock_t cache_chain_lock; /* * chicken and egg problem: delay the per-cpu array allocation @@ -765,7 +761,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, { struct cache_sizes *csizep = malloc_sizes; -#if DEBUG + if (gfpflags & __GFP_UBC) + csizep += malloc_cache_num; +#if SLAB_DEBUG /* This happens if someone tries to call * kmem_cache_create(), or __kmalloc(), before * the generic caches are initialized. @@ -795,9 +793,98 @@ static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) return __find_general_cachep(size, gfpflags); } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *) (slabp + 1); +} + +#ifdef CONFIG_BEANCOUNTERS +#define init_slab_ubps(cachep, slabp) do { \ + if (!((cachep)->flags & SLAB_UBC)) \ + break; \ + memset(slab_ubcs(cachep, slabp), 0, \ + (cachep)->num * sizeof(void *)); \ + } while (0) + +#define UB_ALIGN(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 1) +#define UB_EXTRA(flags) (flags & SLAB_UBC ? sizeof(void *) : 0) +#define set_cache_objuse(cachep) do { \ + (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ + (cachep)->num - 1) / (cachep)->num; \ + if (!OFF_SLAB(cachep)) \ + break; \ + (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ + (cachep)->num - 1) / (cachep)->num; \ + } while (0) + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return virt_to_cache(obj)->objuse; +} + +int kmem_dname_objuse(void *obj) +{ + return virt_to_cache(obj)->objuse; +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ + return (cachep->grown - cachep->reaped - cachep->shrunk) + << cachep->gfporder; +} + +#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ + (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ + sizeof(void *)))) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj) +{ + struct slab *slabp; + int objnr; + + BUG_ON(!(cachep->flags & SLAB_UBC)); + slabp = virt_to_slab(obj); + objnr = (obj - slabp->s_mem) / cachep->buffer_size; + return slab_ubcs(cachep, slabp) + objnr; +} + +struct user_beancounter *slab_ub(void *obj) +{ + return *ub_slab_ptr(virt_to_cache(obj), obj); +} + +EXPORT_SYMBOL(slab_ub); + +#else +#define UB_ALIGN(flags) 1 +#define UB_EXTRA(flags) 0 +#define set_cache_objuse(c) do { } while (0) +#define init_slab_ubps(c, s) do { } while (0) +#endif + +static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) +{ + size_t size_noub; + + size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); + return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); +} + +static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); } /* @@ -842,20 +929,23 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into account. */ nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + (buffer_size + sizeof(kmem_bufctl_t) + + UB_EXTRA(flags)); /* * This calculated number will be either the right * amount, or one greater than what we want. 
*/ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) + if (slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size) nr_objs--; + BUG_ON(slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size); if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); + mgmt_size = slab_mgmt_size(nr_objs, align, flags); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1403,6 +1493,7 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, cachep->nodelists[nodeid] = ptr; local_irq_enable(); } +static int offslab_limit; /* * For setting up all the kmem_list3s for cache whose buffer_size is same as @@ -1476,6 +1567,7 @@ void __init kmem_cache_init(void) /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); + spin_lock_init(&cache_chain_lock); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; @@ -1487,7 +1579,7 @@ void __init kmem_cache_init(void) */ cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG +#if SLAB_DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, @@ -1534,6 +1626,7 @@ void __init kmem_cache_init(void) slab_early_init = 0; + for (i = 0; i < 2; i++) { while (sizes->cs_size != ULONG_MAX) { /* * For performance, all the general caches are L1 aligned. @@ -1546,21 +1639,30 @@ void __init kmem_cache_init(void) sizes->cs_cachep = kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS|SLAB_PANIC| + (i ? SLAB_UBC : 0)|SLAB_NO_CHARGE, NULL); } + if (!(OFF_SLAB(sizes->cs_cachep))) + offslab_limit = sizes->cs_size; #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_create( - names->name_dma, + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE| SLAB_PANIC, NULL); #endif sizes++; names++; } + + sizes++; + names++; + if (!i) + malloc_cache_num = sizes - malloc_sizes; + } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; @@ -1730,7 +1832,7 @@ static void kmem_rcu_free(struct rcu_head *head) kmem_cache_free(cachep->slabp_cache, slab_rcu); } -#if DEBUG +#if SLAB_DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1807,7 +1909,7 @@ static void dump_line(char *data, int offset, int limit) } #endif -#if DEBUG +#if SLAB_DEBUG static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) { @@ -1900,7 +2002,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) } #endif -#if DEBUG +#if SLAB_DEBUG static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { int i; @@ -2000,7 +2102,6 @@ static void __kmem_cache_destroy(struct kmem_cache *cachep) static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -2013,15 +2114,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, continue; if (flags & CFLGS_OFF_SLAB) { - /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). 
- */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + int slab_size; - if (num > offslab_limit) + slab_size = slab_mgmt_size_noalign(num, flags); + if (slab_size > offslab_limit) break; } @@ -2184,9 +2280,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, } } -#if DEBUG +#if SLAB_DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ -#if FORCED_DEBUG +#if SLAB_FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with * large objects, if the increased size would increase the object size @@ -2271,7 +2367,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep) goto oops; -#if DEBUG +#if SLAB_DEBUG cachep->obj_size = size; /* @@ -2293,7 +2389,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) +#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - size; @@ -2325,8 +2421,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = slab_mgmt_size(cachep->num, align, flags); /* * If the slab has been placed off-slab, and we have enough space then @@ -2339,8 +2434,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + slab_size = slab_mgmt_size_noalign(cachep->num, flags); } cachep->colour_off = cache_line_size(); @@ -2377,7 +2471,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* cache setup completed, link it into the list */ + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); + set_cache_objuse(cachep); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", @@ -2388,7 +2485,7 @@ oops: } EXPORT_SYMBOL(kmem_cache_create); -#if DEBUG +#if SLAB_DEBUG static void check_irq_off(void) { BUG_ON(!irqs_disabled()); @@ -2484,10 +2581,11 @@ static int drain_freelist(struct kmem_cache *cache, } slabp = list_entry(p, struct slab, list); -#if DEBUG +#if SLAB_DEBUG BUG_ON(slabp->inuse); #endif list_del(&slabp->list); + STATS_INC_SHRUNK(cache); /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2570,10 +2668,14 @@ void kmem_cache_destroy(struct kmem_cache *cachep) /* * the chain is never empty, cache_cache is never destroyed */ + spin_lock(&cache_chain_lock); list_del(&cachep->next); + spin_unlock(&cache_chain_lock); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); mutex_unlock(&cache_chain_mutex); put_online_cpus(); return; @@ -2582,6 +2684,8 @@ void kmem_cache_destroy(struct kmem_cache *cachep) if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) synchronize_rcu(); + + ub_kmemcache_free(cachep); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); put_online_cpus(); @@ -2608,7 +2712,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. 
*/ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags & ~GFP_THISNODE, nodeid); + local_flags & (~(__GFP_UBC | GFP_THISNODE)), + nodeid); if (!slabp) return NULL; } else { @@ -2620,14 +2725,10 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; slabp->free = 0; + init_slab_ubps(cachep, slabp); return slabp; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) -{ - return (kmem_bufctl_t *) (slabp + 1); -} - static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp) { @@ -2635,7 +2736,7 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, slabp, i); -#if DEBUG +#if SLAB_DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) poison_obj(cachep, objp, POISON_FREE); @@ -2693,7 +2794,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG +#if SLAB_DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(slabp->nodeid != nodeid); #endif @@ -2707,7 +2808,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, { unsigned int objnr = obj_to_index(cachep, slabp, objp); -#if DEBUG +#if SLAB_DEBUG /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); @@ -2795,7 +2896,7 @@ static int cache_grow(struct kmem_cache *cachep, * 'nodeid'. */ if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); + objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid); if (!objp) goto failed; @@ -2828,7 +2929,7 @@ failed: return 0; } -#if DEBUG +#if SLAB_DEBUG /* * Perform extra freeing checks: @@ -3041,12 +3142,12 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); -#if DEBUG +#if SLAB_DEBUG kmem_flagcheck(cachep, flags); #endif } -#if DEBUG +#if SLAB_DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, void *caller) { @@ -3462,9 +3563,14 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); + if (objp && should_charge(cachep, flags) && + ub_slab_charge(cachep, objp, flags)) { + kmem_cache_free(cachep, objp); + objp = NULL; + } + local_irq_restore(save_flags); if (unlikely((flags & __GFP_ZERO) && objp)) memset(objp, 0, obj_size(cachep)); @@ -3498,6 +3604,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, /* fixup slab chains */ if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { + STATS_INC_SHRUNK(cachep); l3->free_objects -= cachep->num; /* No need to drop any previously held * lock here, even if we have a off-slab slab @@ -3526,7 +3633,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int node = numa_node_id(); batchcount = ac->batchcount; -#if DEBUG +#if SLAB_DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); @@ -3547,7 +3654,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) free_block(cachep, ac->entry, batchcount, node); free_done: -#if STATS +#if SLAB_STATS { int i = 0; struct list_head *p; @@ -3581,6 +3688,9 @@ static inline void 
__cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + if (should_uncharge(cachep)) + ub_slab_uncharge(cachep, objp); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -3989,7 +4099,7 @@ static int enable_cpucache(struct kmem_cache *cachep) if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#if DEBUG +#if SLAB_DEBUG /* * With debugging enabled, large batchcount lead to excessively long * periods with disabled local interrupts. Limit the batchcount @@ -4057,6 +4167,7 @@ static void cache_reap(struct work_struct *w) /* Give up. Setup the next iteration. */ goto out; + {KSTAT_PERF_ENTER(cache_reap) list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); @@ -4097,6 +4208,7 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); + KSTAT_PERF_LEAVE(cache_reap)} out: /* Set up the next iteration */ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); @@ -4110,7 +4222,7 @@ static void print_slabinfo_header(struct seq_file *m) * Output format version, so at least we can change it * without _too_ many complaints. */ -#if STATS +#if SLAB_STATS seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4119,14 +4231,82 @@ static void print_slabinfo_header(struct seq_file *m) " "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); -#if STATS +#if SLAB_STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); } +#define SHOW_TOP_SLABS 10 + +static unsigned long get_cache_size(struct kmem_cache *cachep) +{ + unsigned long flags; + unsigned long slabs; + struct kmem_list3 *l3; + struct list_head *lh; + int node; + + slabs = 0; + + for_each_online_node (node) { + l3 = cachep->nodelists[node]; + if (l3 == NULL) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each (lh, &l3->slabs_full) + slabs++; + list_for_each (lh, &l3->slabs_partial) + slabs++; + list_for_each (lh, &l3->slabs_free) + slabs++; + spin_unlock_irqrestore(&l3->list_lock, flags); + } + + return slabs * (PAGE_SIZE << cachep->gfporder) + + (OFF_SLAB(cachep) ? 
+ cachep->slabp_cache->buffer_size * slabs : 0); +} + +void show_slab_info(void) +{ + int i, j; + unsigned long size; + struct kmem_cache *ptr; + unsigned long sizes[SHOW_TOP_SLABS]; + struct kmem_cache *top[SHOW_TOP_SLABS]; + + memset(top, 0, sizeof(top)); + memset(sizes, 0, sizeof(sizes)); + + printk("Top %d caches:\n", SHOW_TOP_SLABS); + + spin_lock(&cache_chain_lock); + list_for_each_entry (ptr, &cache_chain, next) { + size = get_cache_size(ptr); + + j = 0; + for (i = 1; i < SHOW_TOP_SLABS; i++) + if (sizes[i] < sizes[j]) + j = i; + + if (size > sizes[j]) { + sizes[j] = size; + top[j] = ptr; + } + } + + for (i = 0; i < SHOW_TOP_SLABS; i++) + if (top[i]) + printk("%-21s: size %10lu objsize %10u\n", + top[i]->name, sizes[i], + top[i]->buffer_size); + spin_unlock(&cache_chain_lock); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; @@ -4205,19 +4385,20 @@ static int s_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", active_slabs, num_slabs, shared_avail); -#if STATS +#if SLAB_STATS { /* list3 stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; unsigned long reaped = cachep->reaped; + unsigned long shrunk = cachep->shrunk; unsigned long errors = cachep->errors; unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; @@ -4225,9 +4406,10 @@ static int s_show(struct seq_file *m, void *p) unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + node_frees, overflows, shrunk); } /* cpu stats */ { diff --git a/mm/slub.c b/mm/slub.c index 0c83e6a..713fb2c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,8 @@ #include #include +#include + /* * Lock order: * 1. slab_lock(page) @@ -137,9 +139,11 @@ /* * Set of flags that will prevent slab merging + * + * FIXME - think over how to allow merging accountable slubs */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_UBC) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA) @@ -305,6 +309,95 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & ((1 << 16) - 1); } +#ifdef CONFIG_BEANCOUNTERS +static inline void inc_cache_grown(struct kmem_cache *s) +{ + atomic_inc(&s->grown); +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ + atomic_dec(&s->grown); +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ + return atomic_read(&cachep->grown) << cachep->oo.x; /* XXX huh? 
*/ +} + +static void __flush_cpu_slab(struct kmem_cache *s, int cpu); + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return kmem_cache_objuse(virt_to_head_page(obj)->slab); +} + +EXPORT_SYMBOL(kmem_obj_objuse); + +int kmem_dname_objuse(void *obj) +{ + struct kmem_cache *s; + + /* + * Allocations larger than PAGE_SIZE/2 go directly through + * __get_free_pages() and aren't associated with any cache. + */ + s = virt_to_head_page(obj)->slab; + if (!s) + return PAGE_SIZE; + return kmem_cache_objuse(s); +} + +#define page_ubs(pg) (pg->bc.slub_ubs) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj) +{ + struct page *pg; + + BUG_ON(!(s->flags & SLAB_UBC)); + pg = virt_to_head_page(obj); + return page_ubs(pg) + slab_index(obj, s, page_address(pg)); +} + +EXPORT_SYMBOL(ub_slab_ptr); + +struct user_beancounter *slab_ub(void *obj) +{ + struct page *pg; + + pg = virt_to_head_page(obj); + BUG_ON(!(pg->slab->flags & SLAB_UBC)); + return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))]; +} + +EXPORT_SYMBOL(slab_ub); + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} +#else +static inline void inc_cache_grown(struct kmem_cache *s) +{ +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ +} +#endif + +void show_slab_info(void) +{ + /* FIXME - show it */ +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -1073,6 +1166,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; + flags &= ~__GFP_UBC; flags |= s->allocflags; page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, @@ -1095,9 +1189,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); + inc_cache_grown(s); return page; } +static void __free_slab(struct kmem_cache *s, struct page *page); + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -1120,6 +1217,18 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; +#ifdef CONFIG_BEANCOUNTERS + if (s->flags & SLAB_UBC) { + BUG_ON(page_ubs(page) != NULL); + page_ubs(page) = kzalloc(page->objects * sizeof(void *), + flags & ~__GFP_UBC); + if (page_ubs(page) == NULL) { + __free_slab(s, page); + page = NULL; + goto out; + } + } +#endif inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; @@ -1169,6 +1278,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); +#ifdef CONFIG_BEANCOUNTERS + if (page_ubs(page) != NULL) { + BUG_ON(!(s->flags & SLAB_UBC)); + kfree(page_ubs(page)); + page_ubs(page) = NULL; + } +#endif __free_pages(page, order); } @@ -1191,6 +1307,8 @@ static void free_slab(struct kmem_cache *s, struct page *page) call_rcu(head, rcu_free_slab); } else __free_slab(s, page); + + dec_cache_grown(s); } static void discard_slab(struct kmem_cache *s, struct page *page) @@ -1602,6 +1720,13 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, c->freelist = object[c->offset]; stat(c, ALLOC_FASTPATH); } + + if (object && should_charge(s, gfpflags) && + ub_slab_charge(s, object, gfpflags)) { + kmem_cache_free(s, object); + object = NULL; + } + local_irq_restore(flags); if (unlikely((gfpflags & __GFP_ZERO) && object)) @@ 
-1712,6 +1837,9 @@ static __always_inline void slab_free(struct kmem_cache *s, local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); + + if (should_uncharge(s)) + ub_slab_uncharge(s, x); if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { @@ -2315,6 +2443,9 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif +#ifdef CONFIG_BEANCOUNTERS + s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo)); +#endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2469,6 +2600,10 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); +#ifdef CONFIG_BEANCOUNTERS +struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +EXPORT_SYMBOL(ub_kmalloc_caches); +#endif static int __init setup_slub_min_order(char *str) { @@ -2510,6 +2645,11 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, { unsigned int flags = 0; + if (gfp_flags & __GFP_UBC) { + flags = SLAB_UBC | SLAB_NO_CHARGE; + gfp_flags &= ~__GFP_UBC; + } + if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; @@ -2639,11 +2779,14 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) index = fls(size - 1); #ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) + if (unlikely((flags & SLUB_DMA))) { + BUG_ON(flags & __GFP_UBC); return dma_kmalloc_cache(index, flags); + } #endif - return &kmalloc_caches[index]; + + return __kmalloc_cache(flags, index); } void *__kmalloc(size_t size, gfp_t flags) @@ -2957,6 +3100,11 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", sizeof(struct kmem_cache_node), GFP_KERNEL); kmalloc_caches[0].refcount = -1; +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc", + sizeof(struct kmem_cache_node), GFP_KERNEL_UBC); + ub_kmalloc_caches[0].refcount = -1; +#endif caches++; hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -2969,15 +3117,27 @@ void __init kmem_cache_init(void) if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[1], + "kmalloc-96-ubc", 96, GFP_KERNEL_UBC); +#endif caches++; create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[2], + "kmalloc-192-ubc", 192, GFP_KERNEL_UBC); +#endif caches++; } for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[i], + "kmalloc-ubc", 1 << i, GFP_KERNEL_UBC); +#endif caches++; } @@ -3012,9 +3172,14 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { kmalloc_caches[i]. 
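get_slab() above now returns __kmalloc_cache(flags, index), but that helper is not defined anywhere in this hunk. A plausible reconstruction, assuming it merely selects the charged "-ubc" twin cache when __GFP_UBC is set, could look like the fragment below; this is a guess at its shape, not the patch's actual definition.

static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int index)
{
#ifdef CONFIG_BEANCOUNTERS
	/* charged allocations come from the ub_kmalloc_caches twins */
	if (flags & __GFP_UBC)
		return &ub_kmalloc_caches[index];
#endif
	return &kmalloc_caches[index];
}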
name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); +#ifdef CONFIG_BEANCOUNTERS + ub_kmalloc_caches[i].name = + kasprintf(GFP_KERNEL, "kmalloc-%d-ubc", 1 << i); +#endif + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -4280,6 +4445,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (s->flags & SLAB_UBC) + *p++ = 'b'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); diff --git a/mm/swap.c b/mm/swap.c index 9e0cb31..be5fc52 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,6 +209,7 @@ void lru_cache_add_active(struct page *page) __pagevec_lru_add_active(pvec); put_cpu_var(lru_add_active_pvecs); } +EXPORT_SYMBOL(lru_cache_add_active); /* * Drain pages out of the cpu's pagevecs. @@ -244,6 +245,8 @@ void lru_add_drain(void) put_cpu(); } +EXPORT_SYMBOL(lru_add_drain); + #ifdef CONFIG_NUMA static void lru_add_drain_per_cpu(struct work_struct *dummy) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 797c383..e54c04c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -20,6 +20,9 @@ #include +#include +#include + /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_page_list, to make sync_page look nicer, and to allow @@ -44,6 +47,7 @@ struct address_space swapper_space = { .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, }; +EXPORT_SYMBOL(swapper_space); #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -101,6 +105,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) return error; } +EXPORT_SYMBOL(add_to_swap_cache); + /* * This must be called only on pages that have * been verified to be in the swap cache. @@ -137,7 +143,14 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) BUG_ON(!PageUptodate(page)); for (;;) { - entry = get_swap_page(); + struct user_beancounter *ub; + + ub = pb_grab_page_ub(page); + if (IS_ERR(ub)) + return 0; + + entry = get_swap_page(ub); + put_beancounter(ub); if (!entry.val) return 0; @@ -321,6 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +EXPORT_SYMBOL(read_swap_cache_async); + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e330f2..7ad8b1a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,6 +33,8 @@ #include #include +#include + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long total_swap_pages; @@ -44,9 +46,13 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +struct swap_info_struct swap_info[MAX_SWAPFILES]; +EXPORT_SYMBOL(total_swap_pages); +EXPORT_SYMBOL(swap_lock); +EXPORT_SYMBOL(swap_list); +EXPORT_SYMBOL(swap_info); static DEFINE_MUTEX(swapon_mutex); @@ -173,7 +179,7 @@ no_page: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct user_beancounter *ub) { struct swap_info_struct *si; pgoff_t offset; @@ -194,6 +200,8 @@ swp_entry_t get_swap_page(void) wrapped++; } + if (si->flags & SWP_READONLY) + continue; if (!si->highest_bit) continue; if (!(si->flags & SWP_WRITEOK)) @@ -203,6 +211,7 @@ swp_entry_t get_swap_page(void) offset = 
scan_swap_map(si); if (offset) { spin_unlock(&swap_lock); + ub_swapentry_inc(si, offset, ub); return swp_entry(type, offset); } next = swap_list.next; @@ -214,6 +223,8 @@ noswap: return (swp_entry_t) {0}; } +EXPORT_SYMBOL(get_swap_page); + swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -221,7 +232,7 @@ swp_entry_t get_swap_page_of_type(int type) spin_lock(&swap_lock); si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) { nr_swap_pages--; offset = scan_swap_map(si); if (offset) { @@ -278,6 +289,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) count--; p->swap_map[offset] = count; if (!count) { + ub_swapentry_dec(p, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -306,6 +318,8 @@ void swap_free(swp_entry_t entry) } } +EXPORT_SYMBOL(swap_free); + /* * How many references to page are currently swapped out? */ @@ -387,6 +401,55 @@ int remove_exclusive_swap_page(struct page *page) return retval; } +int try_to_remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_count(page) != 2) /* 2: us + cache */ + return 0; + + entry.val = page->private; + p = swap_info_get(entry); + if (!p) + return 0; + + if (!vm_swap_full() && + (p->flags & (SWP_ACTIVE|SWP_READONLY)) == SWP_ACTIVE) { + spin_unlock(&swap_lock); + return 0; + } + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + spin_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == 2) && !PageWriteback(page)) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock_irq(&swapper_space.tree_lock); + } + spin_unlock(&swap_lock); + + if (retval) { + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. @@ -426,6 +489,7 @@ void free_swap_and_cache(swp_entry_t entry) page_cache_release(page); } } +EXPORT_SYMBOL(free_swap_and_cache); #ifdef CONFIG_HIBERNATION /* @@ -509,11 +573,13 @@ unsigned int count_swap_pages(int type, int free) * force COW, vm_page_prot omits write permission from any private vma. 
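ub_swapentry_inc()/ub_swapentry_dec() above keep a per-beancounter count of swap entries in use, and get_swap_page() now skips devices marked SWP_READONLY when allocating. The toy program below shows the same bookkeeping in plain C; the flag values, field names and counters are made up for illustration only.

#include <stdio.h>

#define SWP_WRITEOK   0x1
#define SWP_READONLY  0x2    /* illustrative values, not the kernel's */

struct ub_model      { long swap_entries; };
struct swapdev_model { int flags; long used, total; };

/* Pick the first usable device and charge the entry to 'ub'. */
static int alloc_swap_entry(struct swapdev_model *devs, int n, struct ub_model *ub)
{
    for (int i = 0; i < n; i++) {
        if (devs[i].flags & SWP_READONLY)
            continue;                    /* readonly swap: never allocate here */
        if (!(devs[i].flags & SWP_WRITEOK))
            continue;
        if (devs[i].used >= devs[i].total)
            continue;
        devs[i].used++;
        ub->swap_entries++;              /* ub_swapentry_inc() analogue */
        return i;
    }
    return -1;
}

static void free_swap_entry(struct swapdev_model *dev, struct ub_model *ub)
{
    dev->used--;
    ub->swap_entries--;                  /* ub_swapentry_dec() analogue */
}

int main(void)
{
    struct swapdev_model devs[2] = {
        { SWP_WRITEOK | SWP_READONLY, 0, 100 },   /* skipped: readonly */
        { SWP_WRITEOK,                0, 100 },
    };
    struct ub_model ub = { 0 };

    int dev = alloc_swap_entry(devs, 2, &ub);
    printf("allocated on device %d, ub now holds %ld entries\n", dev, ub.swap_entries);
    free_swap_entry(&devs[dev], &ub);
    printf("after free, ub holds %ld entries\n", ub.swap_entries);
    return 0;
}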
*/ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { spinlock_t *ptl; pte_t *pte; int ret = 1; + struct mm_struct *mm = vma->vm_mm; if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) ret = -ENOMEM; @@ -526,9 +592,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(mm, anon_rss); + ub_unused_privvm_dec(mm, vma); + pb_add_ref(page, mm, pb); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at(mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); swap_free(entry); @@ -544,7 +612,8 @@ out: static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; @@ -567,7 +636,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ if (unlikely(pte_same(*pte, swp_pte))) { pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); + ret = unuse_pte(vma, pmd, addr, entry, page, pb); if (ret) goto out; pte = pte_offset_map(pmd, addr); @@ -580,7 +649,8 @@ out: static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pmd_t *pmd; unsigned long next; @@ -591,7 +661,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = unuse_pte_range(vma, pmd, addr, next, entry, page, pb); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -600,7 +670,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pud_t *pud; unsigned long next; @@ -611,7 +682,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = unuse_pmd_range(vma, pud, addr, next, entry, page, pb); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -619,7 +690,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, } static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pgd_t *pgd; unsigned long addr, end, next; @@ -641,7 +713,7 @@ static int unuse_vma(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + ret = unuse_pud_range(vma, pgd, addr, next, entry, page, pb); if (ret) return ret; } while (pgd++, addr = next, addr != end); @@ -649,7 +721,8 @@ static int unuse_vma(struct vm_area_struct *vma, } static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct 
page_beancounter **pb) { struct vm_area_struct *vma; int ret = 0; @@ -665,7 +738,7 @@ static int unuse_mm(struct mm_struct *mm, lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) + if (vma->anon_vma && (ret = unuse_vma(vma, entry, page, pb))) break; } up_read(&mm->mmap_sem); @@ -727,6 +800,7 @@ static int try_to_unuse(unsigned int type) int retval = 0; int reset_overflow = 0; int shmem; + struct page_beancounter *pb; /* * When searching mms for an entry, a good strategy is to @@ -779,6 +853,13 @@ static int try_to_unuse(unsigned int type) break; } + pb = NULL; + if (pb_alloc_all(&pb)) { + page_cache_release(page); + retval = -ENOMEM; + break; + } + /* * Don't hold on to start_mm if it looks like exiting. */ @@ -801,6 +882,20 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); + /* If read failed we cannot map not-uptodate page to + * user space. Actually, we are in serious troubles, + * we do not even know what process to kill. So, the only + * variant remains: to stop swapoff() and allow someone + * to kill processes to zap invalid pages. + */ + if (unlikely(!PageUptodate(page))) { + pb_free_list(&pb); + unlock_page(page); + page_cache_release(page); + retval = -EIO; + break; + } + /* * Remove all references to entry. * Whenever we reach init_mm, there's no address space @@ -812,7 +907,7 @@ static int try_to_unuse(unsigned int type) if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else - retval = unuse_mm(start_mm, entry, page); + retval = unuse_mm(start_mm, entry, page, &pb); } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -842,7 +937,7 @@ static int try_to_unuse(unsigned int type) set_start_mm = 1; shmem = shmem_unuse(entry, page); } else - retval = unuse_mm(mm, entry, page); + retval = unuse_mm(mm, entry, page, &pb); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); @@ -863,6 +958,8 @@ static int try_to_unuse(unsigned int type) retval = shmem; break; } + + pb_free_list(&pb); if (retval) { unlock_page(page); page_cache_release(page); @@ -1215,6 +1312,10 @@ asmlinkage long sys_swapoff(const char __user * specialfile) int i, type, prev; int err; + /* VE admin check is just to be on the safe side, the admin may affect + * swaps only if he has access to special, i.e. if he has been granted + * access to the block device or if the swap file is in the area + * visible to him. 
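try_to_unuse() above calls pb_alloc_all(&pb) before walking any mm and pb_free_list(&pb) on every exit path; as far as this hunk shows, the point is to preallocate the page_beancounter references up front so that charging inside unuse_pte() cannot fail halfway through a page-table walk. The sketch below only models that prealloc-then-consume discipline with a plain linked list; every name in it is illustrative.

#include <stdio.h>
#include <stdlib.h>

struct pb_model { struct pb_model *next; };

/* Preallocate 'n' nodes; on failure release what was built and report it. */
static int pb_alloc_all_model(struct pb_model **list, int n)
{
    for (int i = 0; i < n; i++) {
        struct pb_model *pb = malloc(sizeof(*pb));
        if (!pb) {
            while (*list) {
                struct pb_model *t = *list;
                *list = t->next;
                free(t);
            }
            return -1;
        }
        pb->next = *list;
        *list = pb;
    }
    return 0;
}

/* Consume one preallocated node; cannot fail once prealloc succeeded. */
static struct pb_model *pb_take(struct pb_model **list)
{
    struct pb_model *pb = *list;
    if (pb)
        *list = pb->next;
    return pb;
}

static void pb_free_list_model(struct pb_model **list)
{
    while (*list)
        free(pb_take(list));
}

int main(void)
{
    struct pb_model *pb = NULL;
    if (pb_alloc_all_model(&pb, 4))
        return 1;
    free(pb_take(&pb));          /* the charging step that must not fail */
    pb_free_list_model(&pb);     /* leftover nodes released on the way out */
    puts("prealloc/consume/free completed");
    return 0;
}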
*/ if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1324,6 +1425,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + ub_swap_fini(p); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); @@ -1343,6 +1445,8 @@ out: return err; } +EXPORT_SYMBOL(sys_swapoff); + #ifdef CONFIG_PROC_FS /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) @@ -1437,7 +1541,7 @@ static const struct file_operations proc_swaps_operations = { static int __init procswaps_init(void) { - proc_create("swaps", 0, NULL, &proc_swaps_operations); + proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations); return 0; } __initcall(procswaps_init); @@ -1669,6 +1773,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (ub_swap_init(p, maxpages)) { + error = -ENOMEM; + goto bad_swap; + } + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); if (swap_flags & SWAP_FLAG_PREFER) @@ -1678,6 +1787,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->prio = --least_priority; p->swap_map = swap_map; p->flags = SWP_ACTIVE; + if (swap_flags & SWAP_FLAG_READONLY) + p->flags |= SWP_READONLY; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1733,6 +1844,8 @@ out: return error; } +EXPORT_SYMBOL(sys_swapon); + void si_swapinfo(struct sysinfo *val) { unsigned int i; @@ -1792,6 +1905,8 @@ bad_file: goto out; } +EXPORT_SYMBOL(swap_duplicate); + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/truncate.c b/mm/truncate.c index 6650c1d..1c2719b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -77,6 +77,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); + ub_io_release_context(page, account_size); } } } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 85b9a0d..78dba44 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -22,6 +22,9 @@ #include #include +#include +#include + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -334,6 +337,70 @@ static struct vm_struct *__find_vm_area(const void *addr) return tmp; } +struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) +{ + unsigned long addr, best_addr, delta, best_delta; + struct vm_struct **p, **best_p, *tmp, *area; + + area = kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + size += PAGE_SIZE; /* one-page gap at the end */ + addr = VMALLOC_START; + best_addr = 0UL; + best_p = NULL; + best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) && + (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END)); + p = &tmp->next) { + if ((unsigned long)tmp->addr < addr) + continue; + if ((size + addr) < addr) + break; + delta = (unsigned long) tmp->addr - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + addr = tmp->size + (unsigned long)tmp->addr; + if (addr > VMALLOC_END-size) + break; + } + + if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { + /* check free area after list end */ + delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + } + if (best_addr) { + area->flags = flags; + /* allocate at the end of this area */ + area->addr = (void *)(best_addr + best_delta); + 
area->size = size; + area->next = *best_p; + area->pages = NULL; + area->nr_pages = 0; + area->phys_addr = 0; + *best_p = area; + /* check like in __vunmap */ + WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); + } else { + kfree(area); + area = NULL; + } + write_unlock(&vmlist_lock); + + return area; +} + /* Caller must hold vmlist_lock */ static struct vm_struct *__remove_vm_area(const void *addr) { @@ -373,7 +440,7 @@ struct vm_struct *remove_vm_area(const void *addr) return v; } -static void __vunmap(const void *addr, int deallocate_pages) +static void __vunmap(const void *addr, int deallocate_pages, int uncharge) { struct vm_struct *area; @@ -398,6 +465,8 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i; + if (uncharge) + dec_vmalloc_charged(area); for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; @@ -428,7 +497,7 @@ static void __vunmap(const void *addr, int deallocate_pages) void vfree(const void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 1); + __vunmap(addr, 1, 1); } EXPORT_SYMBOL(vfree); @@ -444,7 +513,7 @@ EXPORT_SYMBOL(vfree); void vunmap(const void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 0); + __vunmap(addr, 0, 0); } EXPORT_SYMBOL(vunmap); @@ -526,10 +595,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (map_vm_area(area, prot, &pages)) goto fail; + + inc_vmalloc_charged(area, gfp_mask); return area->addr; fail: - vfree(area->addr); + __vunmap(area->addr, 1, 0); return NULL; } @@ -576,6 +647,22 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot, + void *caller) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + + area = get_vm_area_best(size, VM_ALLOC); + if (!area) + return NULL; + + return __vmalloc_area_node(area, mask, prot, -1, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -592,6 +679,28 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +void *ub_vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(ub_vmalloc); + +void *vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + __builtin_return_address(0)); +} + +EXPORT_SYMBOL(vmalloc_best); + +void *ub_vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, + __builtin_return_address(0)); +} + +EXPORT_SYMBOL(ub_vmalloc_best); + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -633,6 +742,13 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +void *ub_vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ub_vmalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -896,6 +1012,39 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +void vprintstat(void) +{ + struct vm_struct *p, *last_p = NULL; + unsigned long addr, size, free_size, max_free_size; + int num; + + addr = VMALLOC_START; + size = max_free_size = 0; + num = 0; + + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + free_size = (unsigned 
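get_vm_area_best() above walks the sorted vmlist and picks the free gap whose leftover space after the request (including the one-page guard) is smallest, then places the new area at the end of that gap. The self-contained snippet below reproduces the same best-fit selection over a sorted list of (start, size) ranges; it is a model of the algorithm, not the kernel code itself.

#include <stdio.h>

struct range { unsigned long start, size; };

/*
 * Best-fit search: among the gaps between consecutive used ranges inside
 * [lo, hi), return the start of the gap that fits 'want' with the least
 * slack, or 0 if nothing fits.  Ranges must be sorted by start address.
 */
static unsigned long best_fit(const struct range *used, int n,
                              unsigned long lo, unsigned long hi,
                              unsigned long want)
{
    unsigned long addr = lo, best_addr = 0, best_slack = hi - lo;

    for (int i = 0; i <= n; i++) {
        unsigned long gap_end = (i < n) ? used[i].start : hi;
        if (gap_end > addr && gap_end - addr >= want) {
            unsigned long slack = (gap_end - addr) - want;
            if (slack < best_slack) {
                best_slack = slack;
                best_addr = addr;
            }
        }
        if (i < n)
            addr = used[i].start + used[i].size;
    }
    return best_addr;
}

int main(void)
{
    struct range used[] = { { 0x1000, 0x2000 }, { 0x8000, 0x1000 } };

    /* the smaller of the two free gaps (0x3000..0x8000) wins */
    printf("best-fit gap starts at 0x%lx\n",
           best_fit(used, 2, 0x1000, 0x10000, 0x3000));
    return 0;
}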
long)p->addr - addr; + if (free_size > max_free_size) + max_free_size = free_size; + addr = (unsigned long)p->addr + p->size; + size += p->size; + ++num; + last_p = p; + } + if (last_p) { + free_size = VMALLOC_END - + ((unsigned long)last_p->addr + last_p->size); + if (free_size > max_free_size) + max_free_size = free_size; + } + read_unlock(&vmlist_lock); + + printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" + " Max_Free: %luKB Start: %lx End: %lx\n", + size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, + max_free_size/1024, VMALLOC_START, VMALLOC_END); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ff1a58..0182a19 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -40,10 +40,14 @@ #include #include +#include +#include + #include #include #include +#include #include "internal.h" @@ -178,6 +182,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, if (scanned == 0) scanned = SWAP_CLUSTER_MAX; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + return 1; + if (!down_read_trylock(&shrinker_rwsem)) return 1; /* Assume we'll be able to shrink next time */ @@ -212,6 +219,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; + nr_before = (*shrinker->shrink)(0, gfp_mask); shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); if (shrink_ret == -1) @@ -226,6 +236,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, shrinker->nr += total_scan; } +done: up_read(&shrinker_rwsem); return ret; } @@ -339,6 +350,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, */ if (PagePrivate(page)) { if (try_to_free_buffers(page)) { + ub_io_release_context(page, 0); ClearPageDirty(page); printk("%s: orphaned page\n", __func__); return PAGE_CLEAN; @@ -1108,6 +1120,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, if (sc->may_swap) reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + {KSTAT_PERF_ENTER(refill_inact) lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, @@ -1197,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); + KSTAT_PERF_LEAVE(refill_inact)} } /* @@ -1249,6 +1263,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; shrink_active_list(nr_to_scan, zone, sc, priority); } @@ -1256,12 +1272,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); nr_inactive -= nr_to_scan; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, sc); } } throttle_vm_writeout(sc->gfp_mask); +done: return nr_reclaimed; } @@ -1317,6 +1336,9 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, } nr_reclaimed += shrink_zone(priority, zone, sc); + + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + break; } return nr_reclaimed; @@ -1351,10 +1373,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + 
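The reclaim hunks above add test_tsk_thread_flag(current, TIF_MEMDIE) checks at loop boundaries so that a task already selected by the OOM killer stops scanning and leaves reclaim as quickly as possible instead of burning time on work it will never use. The tiny model below shows the same "check a kill flag between expensive batches" shape; the flag and the work function are stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

static bool oom_killed;                    /* stands in for TIF_MEMDIE */

static long scan_batch(int batch)
{
    return batch < 3 ? 8 : 0;              /* pretend to reclaim a few pages */
}

static long reclaim(int batches)
{
    long freed = 0;

    for (int i = 0; i < batches; i++) {
        if (oom_killed)                    /* bail out between batches */
            break;
        freed += scan_batch(i);
    }
    return freed;
}

int main(void)
{
    printf("freed %ld pages\n", reclaim(10));
    oom_killed = true;
    printf("freed %ld pages after OOM kill\n", reclaim(10));
    return 0;
}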
KSTAT_PERF_ENTER(ttfp); delayacct_freepages_start(); if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); + + ub_oom_start(); /* * mem_cgroup will not do shrink_slab. */ @@ -1404,6 +1429,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->may_writepage = 1; } + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) { + ret = 1; + goto out; + } + /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); @@ -1435,6 +1465,7 @@ out: delayacct_freepages_end(); + KSTAT_PERF_LEAVE(ttfp); return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index d7826af..f93d8df 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -15,6 +15,40 @@ #include #include #include +#include + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zone_page_state(&zones[i], NR_ACTIVE); + *inactive += zone_page_state(&zones[i], NR_INACTIVE); + *free += zone_page_state(&zones[i], NR_FREE_PAGES); + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_online_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -35,6 +69,20 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) } } +unsigned long vm_events(enum vm_event_item i) +{ + int cpu; + unsigned long sum; + struct vm_event_state *st; + + sum = 0; + for_each_online_cpu(cpu) { + st = &per_cpu(vm_event_states, cpu); + sum += st->event[i]; + } + + return (sum < 0 ? 0 : sum); +} /* * Accumulate the vm event counters across all CPUs. 
* The result is unavoidably approximate - it can change @@ -763,30 +811,40 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) unsigned long *v; #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long *e; +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \ + sizeof(struct vm_event_state)) +#else +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)) #endif int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; -#ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); -#endif + v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + + if (ve_is_super(get_exec_env())) { + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; #endif + } else + memset(v, 0, VMSTAT_BUFSIZE); + + if (virtinfo_notifier_call(VITYPE_GENERAL, + VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) { + kfree(v); + m->private = NULL; + return ERR_PTR(-ENOMSG); + } + return v + *pos; } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index b661f47..9150750 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -105,7 +105,7 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev) { struct vlan_group *grp; - grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); + grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC); if (!grp) return NULL; @@ -127,7 +127,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; - array = kzalloc(size, GFP_KERNEL); + array = kzalloc(size, GFP_KERNEL_UBC); if (array == NULL) return -ENOBUFS; @@ -146,6 +146,7 @@ void unregister_vlan_dev(struct net_device *dev) struct net_device *real_dev = vlan->real_dev; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; + struct ve_struct *env; ASSERT_RTNL(); @@ -163,7 +164,9 @@ void unregister_vlan_dev(struct net_device *dev) synchronize_net(); + env = set_exec_env(dev->owner_env); unregister_netdevice(dev); + set_exec_env(env); /* If the group is now empty, kill off the group. */ if (grp->nr_vlans == 0) { @@ -532,6 +535,17 @@ static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; +static inline int vlan_check_caps(void) +{ + if (capable(CAP_NET_ADMIN)) + return 1; +#ifdef CONFIG_VE + if (capable(CAP_VE_NET_ADMIN)) + return 1; +#endif + return 0; +} + /* * VLAN IOCTL handler. 
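vmstat_start() above now sizes a single buffer (VMSTAT_BUFSIZE) that holds the zone counters first and the event counters immediately behind them, fills it with global numbers only for the host (ve_is_super()), and otherwise zeroes it and lets the VIRTINFO_VMSTAT notifier supply per-VE values. The model below only shows the buffer-layout arithmetic; the item counts are invented for the demo.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ZONE_ITEMS  4          /* stands in for NR_VM_ZONE_STAT_ITEMS  */
#define EVENT_ITEMS 3          /* stands in for the vm_event payload   */
#define BUFSIZE     ((ZONE_ITEMS + EVENT_ITEMS) * sizeof(unsigned long))

int main(void)
{
    unsigned long *v = malloc(BUFSIZE);
    if (!v)
        return 1;

    int host = 1;              /* ve_is_super() analogue */
    if (host) {
        for (int i = 0; i < ZONE_ITEMS; i++)
            v[i] = 1000 + i;                  /* global_page_state(i) analogue */
        unsigned long *e = v + ZONE_ITEMS;    /* event counters live right after */
        for (int i = 0; i < EVENT_ITEMS; i++)
            e[i] = 10 * i;
    } else {
        memset(v, 0, BUFSIZE);                /* container view starts blank */
    }

    printf("zone[0]=%lu event[0]=%lu\n", v[0], v[ZONE_ITEMS]);
    free(v);
    return 0;
}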
* o execute requested action or pass command to the device driver @@ -573,7 +587,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, @@ -583,7 +597,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, @@ -592,7 +606,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case SET_VLAN_FLAG_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, @@ -601,7 +615,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; if ((args.u.name_type >= 0) && (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { @@ -617,14 +631,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case ADD_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; unregister_vlan_dev(dev); err = 0; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4bf014e..b3fe36b 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -292,6 +292,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = &dev->stats; struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -323,13 +324,17 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_bytes += skb->len; skb->dev = vlan_dev_info(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return NETDEV_TX_OK; } static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = &dev->stats; u16 vlan_tci; @@ -341,7 +346,10 @@ static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, stats->tx_bytes += skb->len; skb->dev = vlan_dev_info(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return NETDEV_TX_OK; } @@ -697,4 +705,6 @@ void vlan_setup(struct net_device *dev) dev->ethtool_ops = &vlan_ethtool_ops; memset(dev->broadcast, 0, ETH_ALEN); + if (!ve_is_super(get_exec_env())) + dev->features |= NETIF_F_VIRTUAL; } diff --git a/net/Kconfig b/net/Kconfig index 7612cc8..29e7e09 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -27,7 +27,7 @@ menu "Networking options" config NET_NS bool "Network namespace support" default n - depends on EXPERIMENTAL && !SYSFS && NAMESPACES + depends on EXPERIMENTAL && NAMESPACES help Allow user space to create what appear to be multiple instances of the network stack. 
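vlan_dev_hard_start_xmit() above brackets dev_queue_xmit() with set_exec_env(): it switches the calling context to the real device's owner VE and restores the previous context afterwards. The snippet below shows the same save/switch/restore discipline with a plain global; returning the old context from the switch function is what makes the pattern nest cleanly.

#include <stdio.h>

struct ve_model { const char *name; };

static struct ve_model host = { "host" }, container = { "ve101" };
static struct ve_model *exec_env = &host;

/* Switch the execution context and hand back the previous one. */
static struct ve_model *set_exec_env_model(struct ve_model *ve)
{
    struct ve_model *old = exec_env;
    exec_env = ve;
    return old;
}

static void transmit(void)
{
    printf("transmitting in context %s\n", exec_env->name);
}

int main(void)
{
    struct ve_model *old = set_exec_env_model(&container);  /* enter device's VE */
    transmit();
    set_exec_env_model(old);                                 /* restore caller's VE */
    printf("back in context %s\n", exec_env->name);
    return 0;
}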
diff --git a/net/bridge/br.c b/net/bridge/br.c index 573acdf..53e3e80 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -56,6 +56,7 @@ static int __init br_init(void) brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; + br_hard_xmit_hook = br_xmit; br_fdb_get_hook = br_fdb_get; br_fdb_put_hook = br_fdb_put; @@ -89,6 +90,7 @@ static void __exit br_deinit(void) br_fdb_put_hook = NULL; br_handle_frame_hook = NULL; + br_hard_xmit_hook = NULL; br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 4f52c3d..db00163 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -32,16 +32,47 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); + skb->brmark = BR_ALREADY_SEEN; + if (dest[0] & 1) br_flood_deliver(br, skb); else if ((dst = __br_fdb_get(br, dest)) != NULL) - br_deliver(dst->dst, skb); + br_deliver(dst->dst, skb, 1); else br_flood_deliver(br, skb); return 0; } +int br_xmit(struct sk_buff *skb, struct net_bridge_port *port) +{ + struct net_bridge *br = port->br; + const unsigned char *dest = skb->data; + struct net_bridge_fdb_entry *dst; + + if (!br->via_phys_dev) + return 0; + + br->dev->stats.tx_packets++; + br->dev->stats.tx_bytes += skb->len; + + skb_reset_mac_header(skb); + skb_pull(skb, ETH_HLEN); + + skb->brmark = BR_ALREADY_SEEN; + + if (dest[0] & 1) + br_xmit_deliver(br, port, skb); + else if ((dst = __br_fdb_get(br, dest)) != NULL) + br_deliver(dst->dst, skb, 0); + else + br_xmit_deliver(br, port, skb); + + skb_push(skb, ETH_HLEN); + + return 0; +} + static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index bdd9cce..0e2fb77 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -76,14 +76,24 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) } /* called with rcu_read_lock */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free) { if (should_deliver(to, skb)) { + if (!free) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + to->dev->stats.tx_dropped++; + return; + } + skb = skb2; + } __br_deliver(to, skb); return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ @@ -99,6 +109,7 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb) /* called under bridge lock */ static void br_flood(struct net_bridge *br, struct sk_buff *skb, + int free, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)) { @@ -130,18 +141,41 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_deliver); + br_flood(br, skb, 1, __br_deliver); +} + +/* called with rcu_read_lock */ +void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct net_bridge_port *p; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (p == port) + continue; + if (should_deliver(p, skb)) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + br->dev->stats.tx_dropped++; + return; + } + __br_deliver(p, skb2); + } + } } /* called under bridge lock */ void br_flood_forward(struct 
net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_forward); + skb->brmark = BR_ALREADY_SEEN; + br_flood(br, skb, 1, __br_forward); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 63c18aa..dbab3e8 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -158,6 +159,11 @@ static void del_br(struct net_bridge *br) { struct net_bridge_port *p, *n; + if (br->master_dev) { + dev_put(br->master_dev); + br->master_dev = NULL; + } + list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } @@ -300,7 +306,7 @@ int br_del_bridge(const char *name) int ret = 0; rtnl_lock(); - dev = __dev_get_by_name(&init_net, name); + dev = __dev_get_by_name(current->nsproxy->net_ns, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ @@ -405,6 +411,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); + if (!(dev->features & NETIF_F_VIRTUAL)) { + dev_hold(dev); + br->master_dev = dev; + } spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, p); @@ -440,6 +450,16 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); br_features_recompute(br); + if (br->master_dev == dev) { + br->master_dev = NULL; + dev_put(dev); + list_for_each_entry(p, &br->port_list, list) + if (!(p->dev->features & NETIF_F_VIRTUAL)) { + dev_hold(p->dev); + br->master_dev = p->dev; + break; + } + } spin_unlock_bh(&br->lock); return 0; @@ -451,7 +471,7 @@ void __exit br_cleanup_bridges(void) rtnl_lock(); restart: - for_each_netdev(&init_net, dev) { + for_each_netdev(current->nsproxy->net_ns, dev) { if (dev->priv_flags & IFF_EBRIDGE) { del_br(dev->priv); goto restart; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 30b8877..44fb444 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -28,7 +28,13 @@ static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb) brdev->stats.rx_bytes += skb->len; indev = skb->dev; - skb->dev = brdev; + if (!br->via_phys_dev) + skb->dev = brdev; + else { + skb->brmark = BR_ALREADY_SEEN; + if (br->master_dev) + skb->dev = br->master_dev; + } NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, netif_receive_skb); @@ -56,7 +62,7 @@ int br_handle_frame_finish(struct sk_buff *skb) /* The packet skb2 goes to the local host (NULL to skip). */ skb2 = NULL; - if (br->dev->flags & IFF_PROMISC) + if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev) skb2 = skb; dst = NULL; @@ -142,6 +148,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) } switch (p->state) { + struct net_device *out; + case BR_STATE_FORWARDING: rhook = rcu_dereference(br_should_route_hook); if (rhook != NULL) { @@ -151,7 +159,12 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) } /* fall through */ case BR_STATE_LEARNING: - if (!compare_ether_addr(p->br->dev->dev_addr, dest)) + if (skb->brmark == BR_ALREADY_SEEN) + return 0; + + out = p->br->via_phys_dev ? 
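br_del_if() above re-elects br->master_dev when the current master leaves the bridge: it walks the remaining ports and takes the first one that is not NETIF_F_VIRTUAL, i.e. the first real physical device. The loop below models that selection; the flag value and structures are illustrative only.

#include <stdio.h>
#include <stddef.h>

#define F_VIRTUAL 0x1                     /* stands in for NETIF_F_VIRTUAL */

struct port_model { const char *name; int features; };

/* First non-virtual port becomes the new master, if any. */
static const struct port_model *pick_master(const struct port_model *ports, int n)
{
    for (int i = 0; i < n; i++)
        if (!(ports[i].features & F_VIRTUAL))
            return &ports[i];
    return NULL;
}

int main(void)
{
    struct port_model ports[] = {
        { "veth101.0", F_VIRTUAL },
        { "eth0",      0         },
        { "eth1",      0         },
    };
    const struct port_model *m = pick_master(ports, 3);

    printf("new master: %s\n", m ? m->name : "(none)");
    return 0;
}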
p->br->master_dev : p->br->dev; + + if (out && !compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 5bbf073..ac612ab 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -26,7 +27,7 @@ static int get_bridge_ifindices(int *indices, int num) struct net_device *dev; int i = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(current->nsproxy->net_ns, dev) { if (i >= num) break; if (dev->priv_flags & IFF_EBRIDGE) @@ -89,7 +90,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, ifindex); if (dev == NULL) return -EINVAL; @@ -140,6 +141,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); + b.via_phys_dev = br->via_phys_dev; b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); @@ -262,6 +264,13 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) br_stp_set_enabled(br, args[1]); return 0; + case BRCTL_SET_VIA_ORIG_DEV: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->via_phys_dev = args[1] ? 1 : 0; + return 0; + case BRCTL_SET_BRIDGE_PRIORITY: if (!capable(CAP_NET_ADMIN)) return -EPERM; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f155e6c..e7a1b78 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -97,10 +98,11 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, dev_net(port->dev),0, RTNLGRP_LINK, + NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err); + rtnl_set_sk_err(dev_net(port->dev), RTNLGRP_LINK, err); } /* @@ -112,11 +114,8 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; int idx; - if (net != &init_net) - return 0; - idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { /* not a bridge port */ if (dev->br_port == NULL || idx < cb->args[0]) goto skip; @@ -165,7 +164,7 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (new_state > BR_STATE_BLOCKING) return -EINVAL; - dev = __dev_get_by_index(&init_net, ifm->ifi_index); + dev = __dev_get_by_index(current->nsproxy->net_ns, ifm->ifi_index); if (!dev) return -ENODEV; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index c3dc18d..c4153d3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -89,6 +89,8 @@ struct net_bridge spinlock_t lock; struct list_head port_list; struct net_device *dev; + struct net_device *master_dev; + unsigned char via_phys_dev; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; @@ -142,6 +144,7 @@ static inline int br_is_root_bridge(const struct net_bridge *br) /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev); +extern int 
br_xmit(struct sk_buff *skb, struct net_bridge_port *port); /* br_fdb.c */ extern int br_fdb_init(void); @@ -168,12 +171,13 @@ extern void br_fdb_update(struct net_bridge *br, /* br_forward.c */ extern void br_deliver(const struct net_bridge_port *to, - struct sk_buff *skb); + struct sk_buff *skb, int free); extern int br_dev_queue_push_xmit(struct sk_buff *skb); extern void br_forward(const struct net_bridge_port *to, struct sk_buff *skb); extern int br_forward_finish(struct sk_buff *skb); extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb); extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb); /* br_if.c */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 158dee8..5b6f301 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -181,6 +181,27 @@ static ssize_t store_stp_state(struct device *d, static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, store_stp_state); +static ssize_t show_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", br->via_phys_dev); +} + +static void set_via_phys_dev_state(struct net_bridge *br, unsigned long val) +{ + br->via_phys_dev = val; +} + +static ssize_t store_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, const char *buf, size_t len) +{ + return store_bridge_parm(cd, buf, len, set_via_phys_dev_state); +} + +static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state, + store_via_phys_dev_state); + static ssize_t show_priority(struct device *d, struct device_attribute *attr, char *buf) { @@ -350,6 +371,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, + &dev_attr_via_phys_dev.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, diff --git a/net/core/datagram.c b/net/core/datagram.c index 52f577a..a2e01c2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,6 +56,8 @@ #include #include +#include + /* * Is a socket 'connection oriented' ? */ @@ -609,6 +611,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; unsigned int mask; + int no_ubc_space; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -618,8 +621,14 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { + no_ubc_space = 0; mask |= POLLHUP; + } else { + no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ubc_space) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + } /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || @@ -636,7 +645,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, } /* writable? */ - if (sock_writeable(sk)) + if (!no_ubc_space && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/dev.c b/net/core/dev.c index 0ae08d3..7fd03a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -130,6 +130,9 @@ #include "net-sysfs.h" +#include +#include + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. 
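datagram_poll() above only reports POLLOUT when the socket is writable and the owning beancounter could reserve SOCK_MIN_UBCSPACE_CH of send space; otherwise the task is parked on the UB send queue so it is woken when space appears. The fragment below only shows the resulting mask logic; the POLL* constants come from <poll.h> and the beancounter check is a stub.

#include <poll.h>
#include <stdbool.h>
#include <stdio.h>

/* Stub: pretend the beancounter reservation failed (no send space). */
static bool ub_out_of_space(void)
{
    return true;
}

static short poll_mask(bool socket_writable, bool hup)
{
    short mask = 0;
    bool no_ubc_space = hup ? false : ub_out_of_space();

    if (hup)
        mask |= POLLHUP;

    /* writable only when both the socket and the beancounter have room */
    if (!no_ubc_space && socket_writable)
        mask |= POLLOUT;

    return mask;
}

int main(void)
{
    printf("mask=%#x\n", poll_mask(true, false));   /* UB full: no POLLOUT */
    return 0;
}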
@@ -207,20 +210,6 @@ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - -static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) -{ - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; -} - -static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -{ - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; -} - /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -1620,6 +1609,23 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); +static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net_bridge_port *port; + + if (((port = rcu_dereference(dev->br_port)) == NULL) || + (skb->brmark == BR_ALREADY_SEEN)) + return 0; + + return br_hard_xmit_hook(skb, port); +} +#else +#define bridge_hard_start_xmit(skb, dev) (0) +#endif + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -1634,6 +1640,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } + bridge_hard_start_xmit(skb, dev); + return dev->hard_start_xmit(skb, dev); } @@ -1644,6 +1652,9 @@ gso: skb->next = nskb->next; nskb->next = NULL; + + bridge_hard_start_xmit(skb, dev); + rc = dev->hard_start_xmit(nskb, dev); if (unlikely(rc)) { nskb->next = skb->next; @@ -2186,6 +2197,7 @@ int netif_receive_skb(struct sk_buff *skb) struct net_device *null_or_orig; int ret = NET_RX_DROP; __be16 type; + struct ve_struct *old_ve; /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) @@ -2212,6 +2224,16 @@ int netif_receive_skb(struct sk_buff *skb) skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; +#ifdef CONFIG_VE + /* + * Skb might be alloced in another VE context, than its device works. + * So, set the correct owner_env. + */ + skb->owner_env = skb->dev->owner_env; + BUG_ON(skb->owner_env == NULL); +#endif + old_ve = set_exec_env(skb->owner_env); + pt_prev = NULL; rcu_read_lock(); @@ -2274,6 +2296,7 @@ ncls: out: rcu_read_unlock(); + (void)set_exec_env(old_ve); return ret; } @@ -2947,8 +2970,13 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) return -EOVERFLOW; } } - if (dev->flags != old_flags) { - printk(KERN_INFO "device %s %s promiscuous mode\n", + /* + * Promiscous mode on LOOPBACK/POINTTOPOINT devices does + * not mean anything + */ + if ((dev->flags != old_flags) && + !(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) { + ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); if (audit_enabled) @@ -3731,11 +3759,20 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) * - require strict serialization. 
* - do not return a value */ + case SIOCSIFMTU: + case SIOCSIFHWADDR: case SIOCSIFFLAGS: + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCSIFMETRIC: - case SIOCSIFMTU: case SIOCSIFMAP: - case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: @@ -3802,12 +3839,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ static int dev_new_index(struct net *net) { - static int ifindex; for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return ifindex; + if (++net->ifindex <= 0) + net->ifindex = 1; + if (!__dev_get_by_index(net, net->ifindex)) + return net->ifindex; } } @@ -3922,6 +3958,10 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); + ret = -EPERM; + if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) + goto out; + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); netdev_init_queue_locks(dev); @@ -4021,6 +4061,10 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); + dev->owner_env = get_exec_env(); + netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); + netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); @@ -4156,12 +4200,14 @@ static void netdev_wait_allrefs(struct net_device *dev) void netdev_run_todo(void) { struct list_head list; + struct ve_struct *old_ve; /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); + old_ve = get_exec_env(); while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); @@ -4174,6 +4220,7 @@ void netdev_run_todo(void) continue; } + (void)set_exec_env(dev->owner_env); dev->reg_state = NETREG_UNREGISTERED; on_each_cpu(flush_backlog, dev, 1); @@ -4186,12 +4233,21 @@ void netdev_run_todo(void) WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); + put_beancounter(netdev_bc(dev)->exec_ub); + put_beancounter(netdev_bc(dev)->owner_ub); + netdev_bc(dev)->exec_ub = NULL; + netdev_bc(dev)->owner_ub = NULL; + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. + */ if (dev->destructor) dev->destructor(dev); /* Free network device */ kobject_put(&dev->dev.kobj); } + (void)set_exec_env(old_ve); } static struct net_device_stats *internal_stats(struct net_device *dev) @@ -4243,7 +4299,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN_CONST; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL_UBC); if (!p) { printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); return NULL; @@ -4372,11 +4428,18 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. 
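dev_new_index() above drops the file-static counter and keeps the cursor in struct net itself, so every network namespace hands out its own ifindex sequence; the loop still wraps back to 1 on overflow and skips indices that are already in use. A compact model of that allocator, with a deliberately tiny index space for the demo:

#include <stdbool.h>
#include <stdio.h>

struct net_model { int ifindex; bool used[8]; };   /* per-namespace cursor */

static bool index_taken(const struct net_model *net, int idx)
{
    return idx < 8 && net->used[idx];
}

static int new_index(struct net_model *net)
{
    for (;;) {
        if (++net->ifindex <= 0 || net->ifindex >= 8)   /* wrap (demo range) */
            net->ifindex = 1;
        if (!index_taken(net, net->ifindex)) {
            net->used[net->ifindex] = true;
            return net->ifindex;
        }
    }
}

int main(void)
{
    struct net_model host = { 0 }, container = { 0 };

    printf("host: %d\n", new_index(&host));
    printf("host: %d\n", new_index(&host));
    printf("container: %d\n", new_index(&container));  /* independent sequence */
    return 0;
}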
*/ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, + struct user_beancounter *exec_ub) { char buf[IFNAMSIZ]; const char *destname; int err; + struct user_beancounter *tmp_ub; +#ifdef CONFIG_VE + struct ve_struct *cur_ve = get_exec_env(); + struct ve_struct *src_ve = dev->owner_env; + struct ve_struct *dst_ve = net->owner_ve; +#endif ASSERT_RTNL(); @@ -4427,6 +4490,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char err = -ENODEV; unlist_netdevice(dev); + dev->owner_env = dst_ve; + tmp_ub = netdev_bc(dev)->exec_ub; + netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); + put_beancounter(tmp_ub); + synchronize_net(); /* Shutdown queueing discipline. */ @@ -4435,7 +4503,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that we are about to destroy this device. They should clean all the things. */ + set_exec_env(src_ve); call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + (void)set_exec_env(cur_ve); /* * Flush the unicast and multicast chains @@ -4458,15 +4528,20 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char } /* Fixup kobjects */ + set_exec_env(src_ve); netdev_unregister_kobject(dev); + set_exec_env(dst_ve); err = netdev_register_kobject(dev); + set_exec_env(cur_ve); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ + set_exec_env(dst_ve); call_netdevice_notifiers(NETDEV_REGISTER, dev); + (void)set_exec_env(cur_ve); synchronize_net(); err = 0; @@ -4474,6 +4549,13 @@ out: return err; } +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + struct user_beancounter *ub = get_exec_ub(); + + return __dev_change_net_namespace(dev, net, pat, ub); +} + static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -4679,7 +4761,7 @@ static struct hlist_head *netdev_create_hash(void) int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -4843,6 +4925,7 @@ EXPORT_SYMBOL(__dev_remove_pack); EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(__dev_change_net_namespace); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); @@ -4874,6 +4957,7 @@ EXPORT_SYMBOL(dev_get_flags); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_hard_xmit_hook); EXPORT_SYMBOL(br_fdb_get_hook); EXPORT_SYMBOL(br_fdb_put_hook); #endif diff --git a/net/core/dst.c b/net/core/dst.c index fe03266..ce92751 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -308,6 +308,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void switch (event) { case NETDEV_UNREGISTER: case NETDEV_DOWN: + dst_gc_task(NULL); mutex_lock(&dst_gc_mutex); for (dst = dst_busy_list; dst; dst = dst->next) { last = dst; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 14ada53..15db122 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -859,7 +859,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXFH: break; default: - if (!capable(CAP_NET_ADMIN)) 
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 79de3b1..ee4499f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (r == NULL) return -ENOMEM; @@ -238,7 +238,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) goto errout; - rule = kzalloc(ops->rule_size, GFP_KERNEL); + rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (rule == NULL) { err = -ENOMEM; goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index df37443..8a4933c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -478,7 +478,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9d92e41..5d08fab 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -35,6 +36,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -264,6 +266,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) int entries; entries = atomic_inc_return(&tbl->entries) - 1; + n = ERR_PTR(-ENOBUFS); if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -274,7 +277,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); if (!n) - goto out_entries; + goto out_nomem; skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); @@ -291,6 +294,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) out: return n; +out_nomem: + n = ERR_PTR(-ENOMEM); out_entries: atomic_dec(&tbl->entries); goto out; @@ -409,12 +414,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n; - if (!n) { - rc = ERR_PTR(-ENOBUFS); + rc = n = neigh_alloc(tbl); + if (IS_ERR(n)) goto out; - } memcpy(n->primary_key, pkey, key_len); n->dev = dev; @@ -736,10 +740,21 @@ static void neigh_periodic_timer(unsigned long arg) if (atomic_read(&n->refcnt) == 1 && (state == NUD_FAILED || time_after(now, n->used + n->parms->gc_staletime))) { + struct net_device *dev = n->dev; + struct ve_struct *ve; + struct user_beancounter *ub; + *np = n->next; n->dead = 1; write_unlock(&n->lock); + + ve = set_exec_env(dev->owner_env); + ub = set_exec_ub(netdev_bc(dev)->owner_ub); + neigh_cleanup_and_release(n); + + set_exec_ub(ub); + set_exec_env(ve); continue; } write_unlock(&n->lock); @@ -781,6 +796,11 @@ static void neigh_timer_handler(unsigned long arg) struct neighbour *neigh = (struct neighbour *)arg; unsigned state; int notify = 0; + struct ve_struct *env; + struct user_beancounter *ub; + + env = set_exec_env(neigh->dev->owner_env); + ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); write_lock(&neigh->lock); @@ -884,6 +904,8 @@ out: neigh_update_notify(neigh); neigh_release(neigh); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } int 
__neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) @@ -1273,9 +1295,16 @@ static void neigh_proxy_process(unsigned long arg) if (tdif <= 0) { struct net_device *dev = back->dev; __skb_unlink(back, &tbl->proxy_queue); - if (tbl->proxy_redo && netif_running(dev)) + if (tbl->proxy_redo && netif_running(dev)) { + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(dev->owner_env); + ub = set_exec_ub(netdev_bc(dev)->owner_ub); tbl->proxy_redo(back); - else + set_exec_ub(ub); + set_exec_env(ve); + } else kfree_skb(back); dev_put(dev); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c1f4e0d..cafed96 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -229,6 +229,27 @@ static struct device_attribute net_class_attributes[] = { {} }; +#ifdef CONFIG_VE +struct device_attribute ve_net_class_attributes[] = { + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(iflink, S_IRUGO, show_iflink, NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(features, S_IRUGO, show_features, NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(dormant, S_IRUGO, show_dormant, NULL), + __ATTR(operstate, S_IRUGO, show_operstate, NULL), + __ATTR(mtu, S_IRUGO, show_mtu, NULL), + __ATTR(flags, S_IRUGO, show_flags, NULL), + __ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL), + {} +}; +EXPORT_SYMBOL(ve_net_class_attributes); +#endif + /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, @@ -421,7 +442,7 @@ static void netdev_release(struct device *d) kfree((char *)dev - dev->padded); } -static struct class net_class = { +struct class net_class = { .name = "net", .dev_release = netdev_release, #ifdef CONFIG_SYSFS @@ -431,6 +452,13 @@ static struct class net_class = { .dev_uevent = netdev_uevent, #endif }; +EXPORT_SYMBOL(net_class); + +#ifndef CONFIG_VE +#define visible_net_class net_class +#else +#define visible_net_class (*get_exec_env()->net_class) +#endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. 
@@ -449,7 +477,7 @@ int netdev_register_kobject(struct net_device *net) struct device *dev = &(net->dev); struct attribute_group **groups = net->sysfs_groups; - dev->class = &net_class; + dev->class = &visible_net_class; dev->platform_data = net; dev->groups = groups; @@ -487,7 +515,15 @@ void netdev_initialize_kobject(struct net_device *net) device_initialize(device); } +void prepare_sysfs_netdev(void) +{ +#ifdef CONFIG_VE + get_ve0()->net_class = &net_class; +#endif +} + int netdev_kobject_init(void) { + prepare_sysfs_netdev(); return class_register(&net_class); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7c52fe2..4ccdf17 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,10 @@ static __net_init int setup_net(struct net *net) int error; struct net_generic *ng; +#ifdef CONFIG_VE + net->owner_ve = get_exec_env(); +#endif + atomic_set(&net->count, 1); #ifdef NETNS_REFCNT_DEBUG atomic_set(&net->use_count, 0); @@ -85,6 +90,8 @@ static struct net *net_alloc(void) static void net_free(struct net *net) { + struct completion *sysfs_completion; + if (!net) return; @@ -95,8 +102,11 @@ static void net_free(struct net *net) return; } #endif - + kfree(net->gen); + sysfs_completion = net->sysfs_completion; kmem_cache_free(net_cachep, net); + if (sysfs_completion) + complete(sysfs_completion); } struct net *copy_net_ns(unsigned long flags, struct net *old_net) @@ -139,6 +149,7 @@ static void cleanup_net(struct work_struct *work) { struct pernet_operations *ops; struct net *net; + struct ve_struct *old_ve; /* Be very certain incoming network packets will not find us */ rcu_barrier(); @@ -152,11 +163,13 @@ static void cleanup_net(struct work_struct *work) list_del(&net->list); rtnl_unlock(); + old_ve = set_exec_env(net->owner_ve); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); } + (void)set_exec_env(old_ve); mutex_unlock(&net_mutex); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6381c2..b16c5c1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1203,6 +1203,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_msg_handlers[idx] == NULL || rtnl_msg_handlers[idx][type].dumpit == NULL) continue; + if (vz_security_family_check(idx)) + continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) @@ -1263,13 +1265,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) + if (family >= NPROTO || vz_security_family_check(family)) return -EAFNOSUPPORT; sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff --git a/net/core/scm.c b/net/core/scm.c index 10f5c65..65e0983 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include #include +#include /* * Only allow a user to send credentials, that they could set with @@ -44,7 +45,9 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + creds->pid == current->tgid || + 
capable(CAP_VE_SYS_ADMIN)) && ((creds->uid == current->uid || creds->uid == current->euid || creds->uid == current->suid) || capable(CAP_SETUID)) && ((creds->gid == current->gid || creds->gid == current->egid || @@ -71,7 +74,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -282,7 +285,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (!fpl) return NULL; - new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL_UBC); if (new_fpl) { for (i=fpl->count-1; i>=0; i--) get_file(fpl->fp[i]); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ca1ccdf..4058ec2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,6 +65,8 @@ #include #include +#include + #include "kmap_skb.h" static struct kmem_cache *skbuff_head_cache __read_mostly; @@ -191,6 +193,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, if (!skb) goto out; + if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) + goto nobc; + + /* Get the DATA. Size must match skb_add_mtu(). */ size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); @@ -209,6 +215,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; + skb->owner_env = get_exec_env(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -231,6 +238,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, out: return skb; nodata: + ub_skb_free_bc(skb); +nobc: kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -337,6 +346,7 @@ static void kfree_skbmem(struct sk_buff *skb) struct sk_buff *other; atomic_t *fclone_ref; + ub_skb_free_bc(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); @@ -370,6 +380,7 @@ static void skb_release_all(struct sk_buff *skb) #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif + ub_skb_uncharge(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -461,6 +472,11 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->vlan_tci = old->vlan_tci; +#ifdef CONFIG_VE + new->accounted = old->accounted; + new->redirected = old->redirected; +#endif + skb_copy_brmark(new, old); skb_copy_secmark(new, old); } @@ -478,6 +494,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->hdr_len = skb->nohdr ? 
skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; + C(owner_env); +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + C(brmark); +#endif n->destructor = NULL; C(iif); C(tail); @@ -490,6 +510,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) #endif atomic_set(&n->users, 1); +#ifdef CONFIG_VE + C(accounted); + C(redirected); +#endif + atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -545,6 +570,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->fclone = SKB_FCLONE_UNAVAILABLE; } + if (ub_skb_alloc_bc(n, gfp_mask)) { + kmem_cache_free(skbuff_head_cache, n); + return NULL; + } return __skb_clone(n, skb); } diff --git a/net/core/sock.c b/net/core/sock.c index 91f8bbc..a908502 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -123,6 +123,9 @@ #include #include +#include +#include + #include #ifdef CONFIG_INET @@ -248,7 +251,7 @@ static void sock_warn_obsolete_bsdism(const char *name) static char warncomm[TASK_COMM_LEN]; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING "process `%s' is using obsolete " + ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " "%s SO_BSDCOMPAT\n", warncomm, name); warned++; } @@ -281,7 +284,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (err) goto out; - if (!sk_rmem_schedule(sk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb)) { err = -ENOBUFS; goto out; } @@ -919,6 +922,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) slab = prot->slab; security_sk_free(sk); + ub_sock_uncharge(sk); if (slab != NULL) kmem_cache_free(slab, sk); else @@ -947,6 +951,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); + sk->owner_env = get_exec_env(); sock_net_set(sk, get_net(net)); } @@ -1041,14 +1046,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) if (filter != NULL) sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - newsk = NULL; - goto out; - } + if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) + goto out_err; + + if (unlikely(xfrm_sk_clone_policy(newsk))) + goto out_err; newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1072,14 +1074,23 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) atomic_inc(newsk->sk_prot->sockets_allocated); } -out: return newsk; + +out_err: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sock_reset_flag(newsk, SOCK_TIMESTAMP); + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; } EXPORT_SYMBOL_GPL(sk_clone); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + extern int sysctl_tcp_use_sg; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1092,6 +1103,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_gso_max_size = dst->dev->gso_max_size; } } + if (!sysctl_tcp_use_sg) + sk->sk_route_caps &= ~NETIF_F_SG; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1252,11 +1265,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) /* * Generic send/receive buffer handlers */ - -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned 
long data_len, - int noblock, int *errcode) +struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, + unsigned long size2, int noblock, + int *errcode) { struct sk_buff *skb; gfp_t gfp_mask; @@ -1277,46 +1288,35 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int npages; - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - skb_frag_t *frag; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); - data_len -= PAGE_SIZE; - } + if (ub_sock_getwres_other(sk, skb_charge_size(size))) { + if (size2 < size) { + size = size2; + continue; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = ub_sock_wait_for_space(sk, timeo, + skb_charge_size(size)); + continue; + } + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(size, gfp_mask); + if (skb) /* Full success... */ break; - } + ub_sock_retwres_other(sk, skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); err = -ENOBUFS; goto failure; } + ub_sock_retwres_other(sk, + skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; @@ -1327,6 +1327,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, timeo = sock_wait_for_wmem(sk, timeo); } + ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); skb_set_owner_w(skb, sk); return skb; @@ -1336,11 +1337,12 @@ failure: *errcode = err; return NULL; } +EXPORT_SYMBOL(sock_alloc_send_skb2); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_skb2(sk, size, size, noblock, errcode); } static void __lock_sock(struct sock *sk) @@ -1750,10 +1752,12 @@ void lock_sock_nested(struct sock *sk, int subclass) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); +#endif local_bh_enable(); } @@ -1761,11 +1765,12 @@ EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - +#endif spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); @@ -2039,7 +2044,7 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", @@ -2057,7 +2062,7 @@ int proto_register(struct proto 
*prot, int alloc_slab) sprintf(request_sock_slab_name, mask, prot->name); prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", @@ -2078,7 +2083,7 @@ int proto_register(struct proto *prot, int alloc_slab) prot->twsk_prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, prot->twsk_prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; @@ -2235,10 +2240,26 @@ static const struct file_operations proto_seq_fops = { .release = seq_release, }; +static int proto_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + return -ENOBUFS; + return 0; +} + +static void proto_net_exit(struct net *net) +{ + proc_net_remove(net, "protocols"); +} + +static struct pernet_operations proto_net_ops = { + .init = proto_net_init, + .exit = proto_net_exit, +}; + static int __init proto_init(void) { - /* register /proc/net/protocols */ - return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; + return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); diff --git a/net/core/stream.c b/net/core/stream.c index a6b3437..100f16e 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -111,8 +111,10 @@ EXPORT_SYMBOL(sk_stream_wait_close); * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long + * @amount - amount of memory to wait for (in UB space!) */ -int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount) { int err = 0; long vm_wait = 0; @@ -134,8 +136,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - if (sk_stream_memory_free(sk) && !vm_wait) - break; + if (amount == 0) { + if (sk_stream_memory_free(sk) && !vm_wait) + break; + } else + ub_sock_sndqueueadd_tcp(sk, amount); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; @@ -144,6 +149,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; + if (amount > 0) + ub_sock_sndqueuedel(sk); if (vm_wait) { vm_wait -= current_timeo; @@ -170,6 +177,10 @@ do_interrupted: goto out; } +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + return __sk_stream_wait_memory(sk, timeo_p, 0); +} EXPORT_SYMBOL(sk_stream_wait_memory); int sk_stream_error(struct sock *sk, int flags, int err) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0d..5f57513 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -582,6 +582,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, __ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + if (!sysctl_tcp_use_sg) + newsk->sk_route_caps &= ~NETIF_F_SG; newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2..92dab28 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -19,6 +19,8 @@ #include #include +#include + #include "ackvec.h" #include "ccid.h" 
#include "dccp.h" @@ -56,7 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &dccp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 6d2bd32..45567e3 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8a3ac1f..cd4d09d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -114,6 +114,7 @@ #ifdef CONFIG_IP_MROUTE #include #endif +#include extern void ip_mc_drop_socket(struct sock *sk); @@ -324,6 +325,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -345,6 +350,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) @@ -402,6 +414,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -416,6 +431,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + struct ve_struct *saved_env; + + saved_env = set_exec_env(sk->owner_env); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -433,6 +451,8 @@ int inet_release(struct socket *sock) timeout = sk->sk_lingertime; sock->sk = NULL; sk->sk_prot->close(sk, timeout); + + (void)set_exec_env(saved_env); } return 0; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b043eda..31f84a0 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1136,7 +1136,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b12dae2..1ad68cf 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -110,9 +110,9 @@ static inline void devinet_sysctl_unregister(struct in_device *idev) /* Locks all the inet devices. 
*/ -static struct in_ifaddr *inet_alloc_ifa(void) +struct in_ifaddr *inet_alloc_ifa(void) { - struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); + struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_UBC); if (ifa) { INIT_RCU_HEAD(&ifa->rcu_head); @@ -120,6 +120,7 @@ static struct in_ifaddr *inet_alloc_ifa(void) return ifa; } +EXPORT_SYMBOL_GPL(inet_alloc_ifa); static void inet_rcu_free_ifa(struct rcu_head *head) { @@ -152,7 +153,7 @@ void in_dev_finish_destroy(struct in_device *idev) } } -static struct in_device *inetdev_init(struct net_device *dev) +struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; @@ -189,6 +190,7 @@ out_kfree: in_dev = NULL; goto out; } +EXPORT_SYMBOL_GPL(inetdev_init); static void in_dev_rcu_put(struct rcu_head *head) { @@ -382,7 +384,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, return 0; } -static int inet_insert_ifa(struct in_ifaddr *ifa) +int inet_insert_ifa(struct in_ifaddr *ifa) { return __inet_insert_ifa(ifa, NULL, 0); } @@ -433,6 +435,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, } endfor_ifa(in_dev); return NULL; } +EXPORT_SYMBOL_GPL(inet_insert_ifa); static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -633,7 +636,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFFLAGS: ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -641,7 +644,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 65c1503..1e87cfd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -258,7 +258,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, net = dev_net(dev); if (fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) + if (res.type != RTN_UNICAST && + (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL)) goto e_inval_res; *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); @@ -460,7 +461,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&rt, arg, sizeof(rt))) diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index c8cac6c..c21d89f 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -770,10 +770,10 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin void __init fib_hash_init(void) { fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_UBC, NULL); fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_UBC, NULL); } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f70fac6..fe43dd5 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2272,6 +2272,7 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { + struct 
seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; @@ -2282,9 +2283,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + struct net *net = seq_file_net(seq); state->in_dev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(net, state->dev) { struct in_device *in_dev; in_dev = in_dev_get(state->dev); if (!in_dev) @@ -2405,7 +2407,7 @@ static const struct seq_operations igmp_mc_seq_ops = { static int igmp_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &igmp_mc_seq_ops, + return seq_open_net(inode, file, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); } @@ -2414,10 +2416,11 @@ static const struct file_operations igmp_mc_seq_fops = { .open = igmp_mc_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; struct igmp_mcf_iter_state { + struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; @@ -2430,10 +2433,11 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(net, state->dev) { struct in_device *idev; idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) @@ -2564,7 +2568,7 @@ static const struct seq_operations igmp_mcf_seq_ops = { static int igmp_mcf_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &igmp_mcf_seq_ops, + return seq_open_net(inode, file, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); } @@ -2573,14 +2577,37 @@ static const struct file_operations igmp_mcf_seq_fops = { .open = igmp_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init igmp_mc_proc_init(void) +static int igmp_net_init(struct net *net) { - proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); - proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + if (!proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops)) + goto out_igmp; + if (!proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops)) + goto out_mcfilter; return 0; + +out_mcfilter: + proc_net_remove(net, "igmp"); +out_igmp: + return -ENOMEM; +} + +static void igmp_net_exit(struct net *net) +{ + proc_net_remove(net, "igmp"); + proc_net_remove(net, "mcfilter"); +} + +static struct pernet_operations igmp_net_ops = { + .init = igmp_net_init, + .exit = igmp_net_exit, +}; + +int __init igmp_mc_proc_init(void) +{ + return register_pernet_subsys(&igmp_net_ops); } #endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0c1ae68..f88f2dc 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,9 @@ #include #include +#include +#include + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -142,6 +145,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN) { goto success; @@ 
-154,7 +159,7 @@ tb_found: tb_not_found: ret = 1; if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, - net, head, snum)) == NULL) + net, head, snum, sk->owner_env)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -556,7 +561,7 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - atomic_dec(sk->sk_prot->orphan_count); + ub_dec_orphan_count(sk); sock_put(sk); } @@ -636,7 +641,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_orphan(child); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); inet_csk_destroy_sock(child); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c10036e..758114b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -706,6 +706,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + struct ve_struct *ve = get_exec_env(); handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); if (IS_ERR(handler)) @@ -729,6 +730,8 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) sk_for_each(sk, node, &hashinfo->listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) { num++; continue; @@ -790,6 +793,8 @@ skip_listen_ht: sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -814,6 +819,8 @@ next_normal: inet_twsk_for_each(tw, node, &head->twchain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (num < s_num) goto next_dying; if (r->id.idiag_sport != tw->tw_sport && diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6c52e08..2039811 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -249,6 +249,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); q->net = nf; +#ifdef CONFIG_VE + q->owner_ve = get_exec_env(); +#endif return q; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4498190..e71191c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -30,7 +30,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + struct ve_struct *ve) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); @@ -39,6 +40,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; INIT_HLIST_HEAD(&tb->owners); + tb->owner_env = ve; hlist_add_head(&tb->node, &head->chain); } return tb; @@ -461,7 +463,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, sk->owner_env); if (!tb) { spin_unlock(&head->lock); break; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 743f011..4534cdf 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -13,6 +13,8 @@ #include #include +#include + /* Must be called with locally disabled BHs. 
 */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
			     struct inet_hashinfo *hashinfo)
@@ -107,9 +109,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
 {
-	struct inet_timewait_sock *tw =
-		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
-				 GFP_ATOMIC);
+	struct user_beancounter *ub;
+	struct inet_timewait_sock *tw;
+
+	ub = set_exec_ub(sock_bc(sk)->ub);
+	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+			GFP_ATOMIC);
+	(void)set_exec_ub(ub);
+
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
@@ -158,6 +165,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
 rescan:
 	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
 		__inet_twsk_del_dead_node(tw);
+		ub_timewait_dec(tw, twdr);
 		spin_unlock(&twdr->death_lock);
 		__inet_twsk_kill(tw, twdr->hashinfo);
 #ifdef CONFIG_NET_NS
@@ -258,6 +266,7 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
 {
 	spin_lock(&twdr->death_lock);
 	if (inet_twsk_del_dead_node(tw)) {
+		ub_timewait_dec(tw, twdr);
 		inet_twsk_put(tw);
 		if (--twdr->tw_count == 0)
 			del_timer(&twdr->tw_timer);
@@ -304,9 +313,10 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
 	spin_lock(&twdr->death_lock);
 
 	/* Unlink it, if it was scheduled */
-	if (inet_twsk_del_dead_node(tw))
+	if (inet_twsk_del_dead_node(tw)) {
+		ub_timewait_dec(tw, twdr);
 		twdr->tw_count--;
-	else
+	} else
 		atomic_inc(&tw->tw_refcnt);
 
 	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
@@ -342,6 +352,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
 
 		hlist_add_head(&tw->tw_death_node, list);
 
+		ub_timewait_inc(tw, twdr);
 		if (twdr->tw_count++ == 0)
 			mod_timer(&twdr->tw_timer, jiffies + twdr->period);
 	spin_unlock(&twdr->death_lock);
@@ -376,6 +387,7 @@ void inet_twdr_twcal_tick(unsigned long data)
 				       &twdr->twcal_row[slot]) {
 			__inet_twsk_del_dead_node(tw);
 			__inet_twsk_kill(tw, twdr->hashinfo);
+			ub_timewait_dec(tw, twdr);
 #ifdef CONFIG_NET_NS
 			NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
 #endif
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 450016b..962de13 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -94,6 +94,24 @@ int ip_forward(struct sk_buff *skb)
 		goto drop;
 	}
 
+	/*
+	 * We try to optimize forwarding of VE packets:
+	 * do not decrement TTL (and so save skb_cow)
+	 * during forwarding of outgoing pkts from VE.
+	 * For incoming pkts we still do ttl decr,
+	 * since such skb is not cloned and does not require
+	 * actual cow. So, there is at least one place
+	 * in pkts path with mandatory ttl decr, that is
+	 * sufficient to prevent routing loops.
+	 */
+	iph = ip_hdr(skb);
+	if (
+#ifdef CONFIG_IP_ROUTE_NAT
+	    (rt->rt_flags & RTCF_NAT) == 0 &&	/* no NAT mangling expected */
+#endif					/* and */
+	    (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */
+		goto no_ttl_decr;
+
 	/* We are about to mangle packet. Copy it! */
 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
 		goto drop;
@@ -102,6 +120,8 @@ int ip_forward(struct sk_buff *skb)
 	/* Decrease ttl after skb cow done */
 	ip_decrease_ttl(iph);
 
+no_ttl_decr:
+
 	/*
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
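The hunks above (inet_twsk_alloc) and below (ip_expire, ipq_rcv_skb, neigh_timer_handler) all apply the same save/restore discipline around deferred work: install the owning VE and its beancounter before the handler body runs, restore the previous context afterwards. A minimal sketch of that pattern follows, assuming the set_exec_env()/set_exec_ub() accessors used throughout this patch; ve_timer_handler() itself is a hypothetical callback, not part of the patch.

/*
 * Sketch only. Timers and notifiers may fire in an arbitrary context,
 * so work done on behalf of a device is charged to the VE and
 * beancounter that own it, then the previous context is restored.
 */
static void ve_timer_handler(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;
	struct ve_struct *old_ve;
	struct user_beancounter *old_ub;

	old_ve = set_exec_env(dev->owner_env);		/* switch to the owner VE */
	old_ub = set_exec_ub(netdev_bc(dev)->exec_ub);	/* and to its beancounter */

	/* ... actual work for @dev goes here ... */

	(void)set_exec_ub(old_ub);			/* restore previous context */
	(void)set_exec_env(old_ve);
}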
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2152d22..31abfb9 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -186,10 +186,13 @@ static void ip_evictor(struct net *net)
  */
 static void ip_expire(unsigned long arg)
 {
+	struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
 	struct ipq *qp;
 	struct net *net;
+	struct ve_struct *old_ve;
 
-	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+	qp = container_of(q, struct ipq, q);
+	old_ve = set_exec_env(q->owner_ve);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
 
 	spin_lock(&qp->q.lock);
@@ -214,6 +217,8 @@ static void ip_expire(unsigned long arg)
 out:
 	spin_unlock(&qp->q.lock);
 	ipq_put(qp);
+
+	(void)set_exec_env(old_ve);
 }
 
 /* Find the correct entry in the "incomplete datagrams" queue for
@@ -524,6 +529,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		atomic_add(clone->truesize, &qp->q.net->mem);
+		clone->owner_env = head->owner_env;
 	}
 
 	skb_shinfo(head)->frag_list = head->next;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e0bed56..04035d2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -193,6 +193,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 
+	if (skb->destructor)
+		skb_orphan(skb);
 	__skb_pull(skb, ip_hdrlen(skb));
 
 	/* Point into the IP datagram, just past the header. */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d533a89..76eee76 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1345,12 +1345,13 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 		char			data[40];
 	} replyopts;
 	struct ipcm_cookie ipc;
-	__be32 daddr;
+	__be32 saddr, daddr;
 	struct rtable *rt = skb->rtable;
 
 	if (ip_options_echo(&replyopts.opt, skb))
 		return;
 
+	saddr = ip_hdr(skb)->daddr;
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
 
@@ -1365,7 +1366,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 		struct flowi fl = { .oif = arg->bound_dev_if,
 				    .nl_u = { .ip4_u =
 					      { .daddr = daddr,
-						.saddr = rt->rt_spec_dst,
+						.saddr = saddr,
 						.tos = RT_TOS(ip_hdr(skb)->tos) } },
 				    /* Not quite clean, but right. */
 				    .uli_u = { .ports =
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 42065ff..160fe1f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -187,19 +187,20 @@ static int __init ic_open_devs(void)
 	struct ic_device *d, **last;
 	struct net_device *dev;
 	unsigned short oflags;
+	struct net *net = get_exec_env()->ve_netns;
 
 	last = &ic_first_dev;
 	rtnl_lock();
 
 	/* bring loopback device up first */
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (!(dev->flags & IFF_LOOPBACK))
 			continue;
 		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
 			printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
 	}
 
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (dev->flags & IFF_LOOPBACK)
 			continue;
 		if (user_dev_name[0] ?
!strcmp(dev->name, user_dev_name) : @@ -432,9 +433,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -852,9 +850,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - /* Perform verifications before taking the lock. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4c6d2ca..572a117 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,9 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); + if (ipn == NULL) + return NULL; + for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) @@ -771,6 +775,9 @@ static int ipip_init_net(struct net *net) int err; struct ipip_net *ipn; + if (!(get_exec_env()->features & VE_FEATURE_IPIP)) + return 0; + err = -ENOMEM; ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); if (ipn == NULL) @@ -816,6 +823,9 @@ static void ipip_exit_net(struct net *net) struct ipip_net *ipn; ipn = net_generic(net, ipip_net_id); + if (ipn == NULL) /* no VE_FEATURE_IPIP */ + return; + rtnl_lock(); ipip_destroy_tunnels(ipn); unregister_netdevice(ipn->fb_tunnel_dev); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c519b8d..e8abf43 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -146,9 +147,10 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) static struct net_device *ipmr_new_tunnel(struct vifctl *v) { + struct net *net = get_exec_env()->ve_netns; struct net_device *dev; - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { int err; @@ -172,7 +174,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { + if (err == 0 && (dev = __dev_get_by_name(net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); @@ -1124,9 +1126,6 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct vif_device *v; int ct; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 44a6872..dec8193 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -979,7 +979,7 @@ int __init ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index a652da2..2b4c316 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -475,7 +476,8 @@ static int set_mcast_if(struct sock *sk, char 
*ifname) struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -496,11 +498,12 @@ static int set_mcast_if(struct sock *sk, char *ifname) */ static int set_sync_mesg_maxlen(int sync_state) { + struct net *net = get_exec_env()->ve_netns; struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - @@ -511,7 +514,7 @@ static int set_sync_mesg_maxlen(int sync_state) IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL) return -ENODEV; sync_recv_mesg_maxlen = dev->mtu - @@ -539,7 +542,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -560,7 +564,8 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 432ce9d..ac3b375 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -436,7 +436,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -466,8 +466,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -477,9 +481,6 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -499,7 +500,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4e7c719..18e2717 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -337,6 +337,9 @@ ipt_do_table(struct sk_buff *skb, struct ipt_entry *e, *back; struct xt_table_info *private; + if (!table) /* VE is not allowed to have this xtable */ + return NF_ACCEPT; + /* Initialization */ ip = 
ip_hdr(skb); datalen = skb->len - ip->ihl * 4; @@ -488,8 +491,8 @@ mark_source_chains(struct xt_table_info *newinfo, int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + ve_printk(VE_LOG, "iptables: loop hook %u pos " + "%u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); @@ -932,7 +935,7 @@ static struct xt_counters * alloc_counters(struct xt_table *table) (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = ub_vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1202,7 +1205,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, void *loc_cpu_old_entry; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1374,7 +1377,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = ub_vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1841,13 +1844,15 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int); + static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1860,8 +1865,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); - ret = -EINVAL; + ret = do_ipt_set_ctl(sk, cmd, user, len); } return ret; @@ -1958,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1980,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2005,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2057,7 +2061,7 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, int ret; struct xt_table_info *newinfo; struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; struct xt_table *new_table; @@ -2216,11 +2220,22 @@ static struct xt_match icmp_matchstruct __read_mostly = { static int __net_init ip_tables_net_init(struct net *net) { - return xt_proto_init(net, AF_INET); + int res; + + if (!net_ipt_module_permitted(net, VE_IP_IPTABLES)) + return 0; + + res = xt_proto_init(net, AF_INET); + if (!res) + net_ipt_module_set(net, VE_IP_IPTABLES); + return res; } static void __net_exit ip_tables_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_IPTABLES)) 
+ return; + xt_proto_fini(net, AF_INET); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index fafe8eb..1563e5c 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -388,7 +389,8 @@ clusterip_tg_check(const char *tablename, const void *e_void, return false; } - dev = dev_get_by_name(&init_net, e->ip.iniface); + dev = dev_get_by_name(get_exec_env()->ve_netns, + e->ip.iniface); if (!dev) { printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); return false; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 0af1413..08a4bcd 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info, ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + ve_printk(VE_LOG, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + ve_printk(VE_LOG, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + ve_printk(VE_LOG, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info, op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } switch (ih->protocol) { @@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + ve_printk(VE_LOG, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info, th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + ve_printk(VE_LOG, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", 
ntohs(th->window)); + ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + ve_printk(VE_LOG, "CWR "); if (th->ece) - printk("ECE "); + ve_printk(VE_LOG, "ECE "); if (th->urg) - printk("URG "); + ve_printk(VE_LOG, "URG "); if (th->ack) - printk("ACK "); + ve_printk(VE_LOG, "ACK "); if (th->psh) - printk("PSH "); + ve_printk(VE_LOG, "PSH "); if (th->rst) - printk("RST "); + ve_printk(VE_LOG, "RST "); if (th->syn) - printk("SYN "); + ve_printk(VE_LOG, "SYN "); if (th->fin) - printk("FIN "); + ve_printk(VE_LOG, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info, iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } break; } @@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info, if (ih->protocol == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + ve_printk(VE_LOG, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + ve_printk(VE_LOG, "PROTO=UDPLITE "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info, uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info, [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + ve_printk(VE_LOG, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info, ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -250,19 +250,19 @@ static void dump_packet(const struct nf_loginfo *info, case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + ve_printk(VE_LOG, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), 
ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + ve_printk(VE_LOG, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%u.%u.%u.%u ", + ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ", NIPQUAD(ich->un.gateway)); /* Fall through */ case ICMP_DEST_UNREACH: @@ -270,16 +270,16 @@ static void dump_packet(const struct nf_loginfo *info, case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ - printk("["); + ve_printk(VE_LOG, "["); dump_packet(info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + ve_printk(VE_LOG, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -292,19 +292,19 @@ static void dump_packet(const struct nf_loginfo *info, break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + ve_printk(VE_LOG, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { @@ -312,7 +312,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct ip_esp_hdr *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + ve_printk(VE_LOG, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -321,25 +321,25 @@ static void dump_packet(const struct nf_loginfo *info, eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + ve_printk(VE_LOG, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_uid, skb->sk->sk_socket->file->f_gid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -387,7 +387,7 @@ ipt_log_packet(unsigned int pf, loginfo = &default_loginfo; spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -398,30 +398,30 @@ ipt_log_packet(unsigned int pf, physindev = skb->nf_bridge->physindev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); physoutdev = skb->nf_bridge->physoutdev; if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); } #endif if (in && !out) { /* MAC logging for input chain only. 
*/ - printk("MAC="); + ve_printk(VE_LOG, "MAC="); if (skb->dev && skb->dev->hard_header_len && skb->mac_header != skb->network_header) { int i; const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, + ve_printk(VE_LOG, "%02x%c", *p, i==skb->dev->hard_header_len - 1 ? ' ':':'); } else - printk(" "); + ve_printk(VE_LOG, " "); } dump_packet(loginfo, skb, 0); - printk("\n"); + ve_printk(VE_LOG, "\n"); spin_unlock_bh(&log_lock); } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 0841aef..85e4a69 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -98,6 +98,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); } +#if 0 static int device_cmp(struct nf_conn *i, void *ifindex) { @@ -120,9 +121,6 @@ static int masq_device_event(struct notifier_block *this, { const struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for conntracks which were associated with that device, @@ -150,6 +148,7 @@ static struct notifier_block masq_dev_notifier = { static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; +#endif static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", @@ -168,12 +167,16 @@ static int __init masquerade_tg_init(void) ret = xt_register_target(&masquerade_tg_reg); +#if 0 +/* These notifiers are unnecessary and may + lead to oops in virtual environments */ if (ret == 0) { /* Register for device down reports */ register_netdevice_notifier(&masq_dev_notifier); /* Register IP address change reports */ register_inetaddr_notifier(&masq_inet_notifier); } +#endif return ret; } @@ -181,8 +184,8 @@ static int __init masquerade_tg_init(void) static void __exit masquerade_tg_exit(void) { xt_unregister_target(&masquerade_tg_reg); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); +/* unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier);*/ } module_init(masquerade_tg_init); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 5c62924..99dfc92 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -72,8 +72,13 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in, rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev && (ifa = indev->ifa_list)) + if (indev && (ifa = indev->ifa_list)) { + /* because of venet device specific, we should use + * second ifa in the list */ + if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next) + ifa = ifa->ifa_next; newdst = ifa->ifa_local; + } rcu_read_unlock(); if (!newdst) diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 2639872..6b1fcf8 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -186,13 +186,13 @@ reject_tg_check(const char *tablename, const void *e_void, const struct ipt_entry *e = e_void; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); + ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n"); return false; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & 
XT_INV_PROTO)) { - printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); + ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n"); return false; } } diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 3974d7c..11ae8fd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,19 @@ MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); +#include + +#if defined(CONFIG_VE_IPTABLES) +#define tables (get_exec_env()->_ipt_recent->tables) +#define proc_dir (get_exec_env()->_ipt_recent->proc_dir) +#else +static LIST_HEAD(tables); +static struct proc_dir_entry *proc_dir; +#endif /* CONFIG_VE_IPTABLES */ + +static int init_ipt_recent(struct ve_struct *ve); +static void fini_ipt_recent(struct ve_struct *ve); + struct recent_entry { struct list_head list; struct list_head lru_list; @@ -74,12 +88,10 @@ struct recent_table { struct list_head iphash[0]; }; -static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; static const struct file_operations recent_fops; #endif @@ -258,6 +270,9 @@ recent_mt_check(const char *tablename, const void *ip, strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return false; + if (init_ipt_recent(get_exec_env())) + return 0; + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { @@ -298,6 +313,13 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; + struct ve_struct *ve; + + ve = get_exec_env(); +#ifdef CONFIG_VE_IPTABLES + if (!ve->_ipt_recent) + return; +#endif mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); @@ -312,6 +334,8 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) kfree(t); } mutex_unlock(&recent_mutex); + if (!ve_is_super(ve) && list_empty(&tables)) + fini_ipt_recent(ve); } #ifdef CONFIG_PROC_FS @@ -467,6 +491,49 @@ static struct xt_match recent_mt_reg __read_mostly = { .me = THIS_MODULE, }; +static int init_ipt_recent(struct ve_struct *ve) +{ + int err = 0; + +#ifdef CONFIG_VE_IPTABLES + if (ve->_ipt_recent) + return 0; + + ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL); + if (!ve->_ipt_recent) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&tables); +#endif +#ifdef CONFIG_PROC_FS + if (err) + return err; + proc_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net); + if (proc_dir == NULL) { + err = -ENOMEM; + goto out_mem; + } +#endif +out: + return err; +out_mem: +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); +#endif + goto out; +} + +static void fini_ipt_recent(struct ve_struct *ve) +{ + remove_proc_entry("ipt_recent", ve->ve_netns->proc_net); +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); + ve->_ipt_recent = NULL; +#endif +} + static int __init recent_mt_init(void) { int err; @@ -476,25 +543,24 @@ static int __init recent_mt_init(void) ip_list_hash_size = 1 << fls(ip_list_tot); err = xt_register_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { + + err = init_ipt_recent(&ve0); + if (err) { 
xt_unregister_match(&recent_mt_reg); - err = -ENOMEM; + return err; } -#endif - return err; + + return 0; } static void __exit recent_mt_exit(void) { BUG_ON(!list_empty(&tables)); + + fini_ipt_recent(&ve0); xt_unregister_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", init_net.proc_net); -#endif } module_init(recent_mt_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 1ea677d..12c4c2b 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -134,16 +134,24 @@ module_param(forward, bool, 0000); static int __net_init iptable_filter_net_init(struct net *net) { + if (!net_ipt_module_permitted(net, VE_IP_FILTER)) + return 0; + /* Register table */ net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, &initial_table.repl); if (IS_ERR(net->ipv4.iptable_filter)) return PTR_ERR(net->ipv4.iptable_filter); + + net_ipt_module_set(net, VE_IP_FILTER); return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_FILTER)) + return; + ipt_unregister_table(net->ipv4.iptable_filter); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index da59182..f6343d8 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -203,16 +203,24 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { static int __net_init iptable_mangle_net_init(struct net *net) { + if (!net_ipt_module_permitted(net, VE_IP_MANGLE)) + return 0; + /* Register table */ net->ipv4.iptable_mangle = ipt_register_table(net, &packet_mangler, &initial_table.repl); if (IS_ERR(net->ipv4.iptable_mangle)) return PTR_ERR(net->ipv4.iptable_mangle); + + net_ipt_module_set(net, VE_IP_MANGLE); return 0; } static void __net_exit iptable_mangle_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_MANGLE)) + return; + ipt_unregister_table(net->ipv4.iptable_mangle); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a955c4..b4bb436 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -417,66 +418,214 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); -static int __init nf_conntrack_l3proto_ipv4_init(void) +#ifdef CONFIG_VE_IPTABLES +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +static int nf_ct_proto_ipv4_sysctl_init(void) { - int ret = 0; + struct nf_conntrack_l3proto *ipv4 = ve_nf_conntrack_l3proto_ipv4; + struct ctl_table *ct_table; + struct net *net = get_exec_env()->ve_netns; - need_conntrack(); + ct_table = ip_ct_sysctl_table; - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); - return ret; + if (net != &init_net) { + ct_table = kmemdup(ct_table, sizeof(ip_ct_sysctl_table), + GFP_KERNEL); + if (!ct_table) + return -ENOMEM; + } + + ipv4->ctl_table_header = NULL; + ipv4->ctl_table_path = nf_net_ipv4_netfilter_sysctl_path; + ipv4->ctl_table = ct_table; + + ipv4->ctl_table[0].data = &ve_nf_conntrack_max; + ipv4->ctl_table[1].data = &ve_nf_conntrack_count; + ipv4->ctl_table[3].data = &ve_nf_conntrack_checksum; + ipv4->ctl_table[4].data = &ve_nf_ct_log_invalid; + + return 0; +} + +static void nf_ct_proto_ipv4_sysctl_cleanup(void) +{ 
+ struct net *net = get_exec_env()->ve_netns; + + if (net != &init_net) { + kfree(ve_nf_conntrack_l3proto_ipv4->ctl_table); + } +} +#else +static inline int nf_ct_proto_ipv4_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_ipv4_sysctl_cleanup(void) +{ +} +#endif /* SYSCTL && NF_CONNTRACK_PROC_COMPAT */ + +/* + * Functions init/fini_nf_ct_l3proto_ipv4 glue distributed nf_conntrack + * virtualization efforts. They are to be called from 2 places: + * + * 1) on loading/unloading module nf_conntrack_ipv4 from + * nf_conntrack_l3proto_ipv4_init/fini + * 2) on start/stop ve - from do_ve_iptables + */ +static int nf_ct_proto_ipv4_init(void) +{ + struct nf_conntrack_l3proto *ipv4; + + if (ve_is_super(get_exec_env())) { + ipv4 = &nf_conntrack_l3proto_ipv4; + goto out; } + ipv4 = kmemdup(&nf_conntrack_l3proto_ipv4, + sizeof(struct nf_conntrack_l3proto), GFP_KERNEL); + if (!ipv4) + return -ENOMEM; +out: + ve_nf_conntrack_l3proto_ipv4 = ipv4; + return 0; +} + +static void nf_ct_proto_ipv4_fini(void) +{ + if (!ve_is_super(get_exec_env())) + kfree(ve_nf_conntrack_l3proto_ipv4); +} +#endif + +int init_nf_ct_l3proto_ipv4(void) +{ + int ret = -ENOMEM; + int do_hooks = ve_is_super(get_exec_env()); + +#ifdef CONFIG_VE_IPTABLES + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); + ret = nf_ct_proto_ipv4_init(); + if (ret < 0) + goto err_out; + ret = nf_ct_proto_ipv4_sysctl_init(); + if (ret < 0) + goto no_mem_ipv4; + ret = nf_ct_proto_icmp_sysctl_init(); + if (ret < 0) + goto no_mem_icmp; +#endif /* CONFIG_VE_IPTABLES */ + + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register tcp.\n"); - goto cleanup_sockopt; + goto cleanup_sys; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register udp.\n"); - goto cleanup_tcp; + goto unreg_tcp; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmp); if (ret < 0) { printk("nf_conntrack_ipv4: can't register icmp.\n"); - goto cleanup_udp; + goto unreg_udp; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register ipv4\n"); - goto cleanup_icmp; + goto unreg_icmp; } - ret = nf_register_hooks(ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - if (ret < 0) { - printk("nf_conntrack_ipv4: can't register hooks.\n"); - goto cleanup_ipv4; + if (do_hooks) { + ret = nf_register_hooks(ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register hooks.\n"); + goto unreg_ipv4; + } } -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ret = nf_conntrack_ipv4_compat_init(); if (ret < 0) - goto cleanup_hooks; -#endif + goto unreg_hooks; + return 0; + +unreg_hooks: + if (do_hooks) + nf_unregister_hooks(ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); +unreg_ipv4: + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); +unreg_icmp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); +unreg_udp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); +unreg_tcp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); +cleanup_sys: +#ifdef 
CONFIG_VE_IPTABLES +no_mem_icmp: + nf_ct_proto_ipv4_sysctl_cleanup(); +no_mem_ipv4: + nf_ct_proto_ipv4_fini(); +err_out: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ return ret; -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - cleanup_hooks: - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -#endif - cleanup_ipv4: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - cleanup_icmp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); +} +EXPORT_SYMBOL(init_nf_ct_l3proto_ipv4); + +void fini_nf_ct_l3proto_ipv4(void) +{ + int do_hooks = ve_is_super(get_exec_env()); + + nf_conntrack_ipv4_compat_fini(); + if (do_hooks) + nf_unregister_hooks(ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); + +#ifdef CONFIG_VE_IPTABLES + nf_ct_proto_icmp_sysctl_cleanup(); + nf_ct_proto_ipv4_sysctl_cleanup(); + nf_ct_proto_ipv4_fini(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ +} +EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv4); + +static int __init nf_conntrack_l3proto_ipv4_init(void) +{ + int ret = 0; + + need_conntrack(); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + ret = init_nf_ct_l3proto_ipv4(); + if (ret < 0) { + printk(KERN_ERR "Unable to initialize netfilter protocols\n"); + goto cleanup_sockopt; + } + KSYMRESOLVE(init_nf_ct_l3proto_ipv4); + KSYMRESOLVE(fini_nf_ct_l3proto_ipv4); + KSYMMODRESOLVE(nf_conntrack_ipv4); + return ret; + cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst); return ret; @@ -485,14 +634,12 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) static void __exit nf_conntrack_l3proto_ipv4_fini(void) { synchronize_net(); -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - nf_conntrack_ipv4_compat_fini(); -#endif - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + + KSYMMODUNRESOLVE(nf_conntrack_ipv4); + KSYMUNRESOLVE(init_nf_ct_l3proto_ipv4); + KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv4); + + fini_nf_ct_l3proto_ipv4(); nf_unregister_sockopt(&so_getorigdst); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 3a02072..7a3129b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -9,7 +9,9 @@ */ #include #include +#include #include +#include #include #include @@ -32,7 +34,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); if (n) return n; } @@ 
-48,7 +50,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); } return head; } @@ -181,7 +183,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); if (n) return n; } @@ -197,7 +199,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); } return head; } @@ -314,7 +316,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -365,36 +367,91 @@ static const struct file_operations ct_cpu_seq_fops = { .release = seq_release, }; -int __init nf_conntrack_ipv4_compat_init(void) +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_ct_netfilter_table (get_exec_env()->_nf_conntrack->_ip_ct_netfilter_table) +#define ve_ip_ct_sysctl_header (get_exec_env()->_nf_conntrack->_ip_ct_sysctl_header) +#else +#define ve_ip_ct_netfilter_table ip_ct_netfilter_table +#define ve_ip_ct_sysctl_header ip_ct_sysctl_header +#endif + +static ctl_table ip_ct_netfilter_table[] = { + { + .procname = "ip_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_path ip_ct_net_table_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + {}, +}; + +int nf_conntrack_ipv4_compat_init(void) { + struct net *net = get_exec_env()->ve_netns; struct proc_dir_entry *proc, *proc_exp, *proc_stat; + static ctl_table *table; - proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; proc_stat = proc_create("ip_conntrack", S_IRUGO, - init_net.proc_net_stat, &ct_cpu_seq_fops); + net->proc_net_stat, &ct_cpu_seq_fops); if (!proc_stat) goto err3; + + table = ip_ct_netfilter_table; + if (net != &init_net) { + table = kmemdup(table, + sizeof(ip_ct_netfilter_table), + GFP_KERNEL); + if (!table) + goto err4; + } + + table[0].data = &ve_nf_conntrack_max; + ve_ip_ct_sysctl_header = register_net_sysctl_table(net, + ip_ct_net_table_path, + table); + if (!ve_ip_ct_sysctl_header) + goto err5; + return 0; +err5: + if (net != &init_net) + kfree(table); +err4: + remove_proc_entry("ip_conntrack", net->proc_net_stat); err3: - proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack_expect"); err2: - proc_net_remove(&init_net, "ip_conntrack"); + proc_net_remove(net, "ip_conntrack"); err1: return -ENOMEM; } -void __exit nf_conntrack_ipv4_compat_fini(void) +void 
nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "ip_conntrack_expect"); - proc_net_remove(&init_net, "ip_conntrack"); + struct net *net = get_exec_env()->ve_netns; + struct ctl_table *table = ve_ip_ct_sysctl_header->ctl_table_arg; + + unregister_net_sysctl_table(ve_ip_ct_sysctl_header); + if (net != &init_net) + kfree(table); + remove_proc_entry("ip_conntrack", net->proc_net_stat); + proc_net_remove(net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack"); } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 9779104..df39929 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -20,7 +21,7 @@ #include #include -static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; +unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) @@ -92,7 +93,7 @@ static int icmp_packet(struct nf_conn *ct, } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmp_timeout); } return NF_ACCEPT; @@ -148,7 +149,7 @@ icmp_error_message(struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { + ve_nf_conntrack_l3proto_ipv4, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } @@ -320,3 +321,64 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = #endif #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_icmp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *icmp; + + if (ve_is_super(get_exec_env())) { + icmp = &nf_conntrack_l4proto_icmp; + goto out; + } + + icmp = kmemdup(&nf_conntrack_l4proto_icmp, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!icmp) + goto no_mem_ct; + + icmp->ctl_table_header = &ve_icmp_sysctl_header; + icmp->ctl_table = kmemdup(icmp_sysctl_table, + sizeof(icmp_sysctl_table), GFP_KERNEL); + if (icmp->ctl_table == NULL) + goto no_mem_sys; + icmp->ctl_table[0].data = &ve_nf_ct_icmp_timeout; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + icmp->ctl_compat_table_header = ve_icmp_compat_sysctl_header; + icmp->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, + sizeof(icmp_compat_sysctl_table), + GFP_KERNEL); + if (icmp->ctl_compat_table == NULL) + goto no_mem_compat; + icmp->ctl_compat_table[0].data = &ve_nf_ct_icmp_timeout; +#endif +out: + ve_nf_ct_icmp_timeout = nf_ct_icmp_timeout; + + ve_nf_conntrack_l4proto_icmp = icmp; + return 0; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +no_mem_compat: + kfree(icmp->ctl_table); +#endif +no_mem_sys: + kfree(icmp); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_init); + +void nf_ct_proto_icmp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(ve_nf_conntrack_l4proto_icmp->ctl_compat_table); +#endif + kfree(ve_nf_conntrack_l4proto_icmp->ctl_table); + kfree(ve_nf_conntrack_l4proto_icmp); + } +} +EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff --git 
a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 6c6a3cb..94c86f5 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -33,22 +35,34 @@ static DEFINE_SPINLOCK(nf_nat_lock); -static struct nf_conntrack_l3proto *l3proto __read_mostly; /* Calculated at init based on memory size */ static unsigned int nf_nat_htable_size __read_mostly; -static int nf_nat_vmalloced; +#define MAX_IP_NAT_PROTO 256 + +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_nat_protos (get_exec_env()->_nf_conntrack->_nf_nat_protos) +#define ve_nf_nat_l3proto (get_exec_env()->_nf_conntrack->_nf_nat_l3proto) +#define ve_bysource (get_exec_env()->_nf_conntrack->_bysource) +#define ve_nf_nat_vmalloced (get_exec_env()->_nf_conntrack->_nf_nat_vmalloced) +#else +static struct nf_conntrack_l3proto *l3proto __read_mostly; +static int nf_nat_vmalloced; static struct hlist_head *bysource __read_mostly; -#define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; +#define ve_nf_nat_protos nf_nat_protos +#define ve_nf_nat_l3proto l3proto +#define ve_bysource bysource +#define ve_nf_nat_vmalloced nf_nat_vmalloced +#endif static inline const struct nf_nat_protocol * __nf_nat_proto_find(u_int8_t protonum) { - return rcu_dereference(nf_nat_protos[protonum]); + return rcu_dereference(ve_nf_nat_protos[protonum]); } const struct nf_nat_protocol * @@ -155,7 +169,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, const struct hlist_node *n; rcu_read_lock(); - hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, n, &ve_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple)) { /* Copy source part from reply tuple. */ @@ -278,6 +292,22 @@ out: rcu_read_unlock(); } +void nf_nat_hash_conntrack(struct nf_conn *ct) +{ + struct nf_conn_nat *nat; + unsigned int srchash; + + srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate exntension aera */ + nat = nfct_nat(ct); + nat->ct = ct; + hlist_add_head_rcu(&nat->bysource, &ve_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); + +} +EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack); + unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -326,17 +356,8 @@ nf_nat_setup_info(struct nf_conn *ct, } /* Place in source hash if this is the first time. */ - if (have_to_hash) { - unsigned int srchash; - - srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate exntension aera */ - nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); - } + if (have_to_hash) + nf_nat_hash_conntrack(ct); /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) @@ -426,7 +447,6 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, struct icmphdr icmp; struct iphdr ip; } *inside; - const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple inner, target; int hdrlen = ip_hdrlen(skb); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -463,16 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? 
"ORIG" : "REPLY"); - /* rcu_read_lock()ed by nf_hook_slow */ - l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - if (!nf_ct_get_tuple(skb, ip_hdrlen(skb) + sizeof(struct icmphdr), (ip_hdrlen(skb) + sizeof(struct icmphdr) + inside->ip.ihl * 4), (u_int16_t)AF_INET, inside->ip.protocol, - &inner, l3proto, l4proto)) + &inner, ve_nf_nat_l3proto, + __nf_ct_l4proto_find(PF_INET, inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -522,11 +540,11 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (ve_nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } - rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); + rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], proto); out: spin_unlock_bh(&nf_nat_lock); return ret; @@ -537,7 +555,7 @@ EXPORT_SYMBOL(nf_nat_protocol_register); void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) { spin_lock_bh(&nf_nat_lock); - rcu_assign_pointer(nf_nat_protos[proto->protonum], + rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], &nf_nat_unknown_protocol); spin_unlock_bh(&nf_nat_lock); synchronize_rcu(); @@ -583,47 +601,62 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; -static int __init nf_nat_init(void) +int nf_nat_init(void) { size_t i; int ret; need_ipv4_conntrack(); - ret = nf_ct_extend_register(&nat_extend); - if (ret < 0) { - printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); - return ret; + if (ve_is_super(get_exec_env())) { + ret = nf_ct_extend_register(&nat_extend); + if (ret < 0) { + printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + return ret; + } } /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &nf_nat_vmalloced); - if (!bysource) { + ve_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, + &ve_nf_nat_vmalloced); + if (!ve_bysource) { ret = -ENOMEM; goto cleanup_extend; } +#ifdef CONFIG_VE_IPTABLES + ve_nf_nat_protos = kcalloc(MAX_IP_NAT_PROTO, sizeof(void *), GFP_KERNEL); + if (!ve_nf_nat_protos) { + ret = -ENOMEM; + goto cleanup_hash; + } +#endif /* Sew in builtin protocols. 
*/ spin_lock_bh(&nf_nat_lock); for (i = 0; i < MAX_IP_NAT_PROTO; i++) - rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); - rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); + rcu_assign_pointer(ve_nf_nat_protos[i], &nf_nat_unknown_protocol); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); spin_unlock_bh(&nf_nat_lock); - /* Initialize fake conntrack so that NAT will skip it */ - nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + if (ve_is_super(get_exec_env())) { + /* Initialize fake conntrack so that NAT will skip it */ + nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + BUG_ON(nf_nat_seq_adjust_hook != NULL); + rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + } - l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); + ve_nf_nat_l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); - BUG_ON(nf_nat_seq_adjust_hook != NULL); - rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); return 0; +#ifdef CONFIG_VE_IPTABLES +cleanup_hash: +#endif + nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); cleanup_extend: nf_ct_extend_unregister(&nat_extend); return ret; @@ -641,18 +674,46 @@ static int clean_nat(struct nf_conn *i, void *data) return 0; } -static void __exit nf_nat_cleanup(void) +void nf_nat_cleanup(void) { nf_ct_iterate_cleanup(&clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); - nf_ct_l3proto_put(l3proto); - nf_ct_extend_unregister(&nat_extend); - rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); + nf_ct_l3proto_put(ve_nf_nat_l3proto); +#ifdef CONFIG_VE_IPTABLES + kfree(ve_nf_nat_protos); +#endif + if (ve_is_super(get_exec_env())) { + nf_ct_extend_unregister(&nat_extend); + rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + } synchronize_net(); } +static int __init init(void) +{ + int rv; + + rv = nf_nat_init(); + if (rv < 0) + return rv; + + KSYMRESOLVE(nf_nat_init); + KSYMRESOLVE(nf_nat_cleanup); + KSYMMODRESOLVE(nf_nat); + return 0; +} + +static void __exit fini(void) +{ + KSYMMODUNRESOLVE(nf_nat); + KSYMUNRESOLVE(nf_nat_cleanup); + KSYMUNRESOLVE(nf_nat_init); + + nf_nat_cleanup(); +} + MODULE_LICENSE("GPL"); -module_init(nf_nat_init); -module_exit(nf_nat_cleanup); +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index e8b4d0d..f301178 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[3]; struct ipt_error term; -} nat_initial_table __initdata = { +} nat_initial_table = { .repl = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, @@ -65,7 +66,12 @@ static struct xt_table __nat_table = { .me = THIS_MODULE, .af = AF_INET, }; +#ifdef CONFIG_VE_IPTABLES +#define nat_table \ + (get_exec_env()->_nf_conntrack->_nf_nat_table) +#else static struct xt_table *nat_table; +#endif /* Source NAT */ static unsigned int ipt_snat_target(struct sk_buff *skb, @@ -226,14 +232,20 @@ static struct 
xt_target ipt_dnat_reg __read_mostly = { .family = AF_INET, }; -int __init nf_nat_rule_init(void) +int nf_nat_rule_init(void) { int ret; + struct net *net = get_exec_env()->ve_netns; - nat_table = ipt_register_table(&init_net, &__nat_table, + nat_table = ipt_register_table(net, &__nat_table, &nat_initial_table.repl); if (IS_ERR(nat_table)) return PTR_ERR(nat_table); + + ret = 0; + if (!ve_is_super(get_exec_env())) + goto done; + ret = xt_register_target(&ipt_snat_reg); if (ret != 0) goto unregister_table; @@ -242,19 +254,26 @@ int __init nf_nat_rule_init(void) if (ret != 0) goto unregister_snat; +done: return ret; unregister_snat: xt_unregister_target(&ipt_snat_reg); unregister_table: ipt_unregister_table(nat_table); + nat_table = NULL; return ret; } void nf_nat_rule_cleanup(void) { + if (!ve_is_super(get_exec_env())) + goto skip; + xt_unregister_target(&ipt_dnat_reg); xt_unregister_target(&ipt_snat_reg); +skip: ipt_unregister_table(nat_table); + nat_table = NULL; } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index b7dd695..72f45db 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -282,6 +283,45 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { }, }; +int init_nftable_nat(void) +{ + int ret; + + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); + + ret = nf_nat_rule_init(); + if (ret < 0) { + printk("nf_nat_init: can't setup rules.\n"); + goto out_modput; + } + + if (ve_is_super(get_exec_env())) { + ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + if (ret < 0) { + printk("nf_nat_init: can't register hooks.\n"); + goto cleanup_rule_init; + } + } + return 0; + +cleanup_rule_init: + nf_nat_rule_cleanup(); +out_modput: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); + return ret; +} + +void fini_nftable_nat(void) +{ + if (ve_is_super(get_exec_env())) + nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + nf_nat_rule_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +} + static int __init nf_nat_standalone_init(void) { int ret = 0; @@ -292,20 +332,19 @@ static int __init nf_nat_standalone_init(void) BUG_ON(ip_nat_decode_session != NULL); rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); #endif - ret = nf_nat_rule_init(); - if (ret < 0) { - printk("nf_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; - } - ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); - if (ret < 0) { - printk("nf_nat_init: can't register hooks.\n"); - goto cleanup_rule_init; + + if (!ip_conntrack_disable_ve0) { + ret = init_nftable_nat(); + if (ret < 0) + goto cleanup_decode_session; } + + KSYMRESOLVE(init_nftable_nat); + KSYMRESOLVE(fini_nftable_nat); + KSYMMODRESOLVE(iptable_nat); + return ret; - cleanup_rule_init: - nf_nat_rule_cleanup(); cleanup_decode_session: #ifdef CONFIG_XFRM rcu_assign_pointer(ip_nat_decode_session, NULL); @@ -316,8 +355,12 @@ static int __init nf_nat_standalone_init(void) static void __exit nf_nat_standalone_fini(void) { - nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); - nf_nat_rule_cleanup(); + KSYMMODUNRESOLVE(iptable_nat); + KSYMUNRESOLVE(init_nftable_nat); + KSYMUNRESOLVE(fini_nftable_nat); + + if (!ip_conntrack_disable_ve0) + fini_nftable_nat(); #ifdef CONFIG_XFRM rcu_assign_pointer(ip_nat_decode_session, NULL); synchronize_net(); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 
8f5a403..c42d9a5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -51,6 +51,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + if (!ve_is_super(get_exec_env())) + return 0; + socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", sock_prot_inuse_get(net, &tcp_prot), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6ee5354..3bbe823 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,7 @@ #define RT_GC_TIMEOUT (300*HZ) +int ip_rt_src_check = 1; static int ip_rt_max_size; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; @@ -1272,6 +1274,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; rt->rt_genid = rt_genid(net); +#ifdef CONFIG_VE + rt->fl.owner_env = get_exec_env(); +#endif rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... */ @@ -1729,9 +1734,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = get_exec_env()->ve_netns->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1868,6 +1876,9 @@ static int __mkroute_input(struct sk_buff *skb, rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; rth->u.dst.dev = (out_dev)->dev; @@ -2062,6 +2073,9 @@ local_input: rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { @@ -2251,6 +2265,9 @@ static int __mkroute_output(struct rtable **result, rth->fl.mark = oldflp->mark; rth->rt_dst = fl->fl4_dst; rth->rt_src = fl->fl4_src; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ @@ -2356,10 +2373,13 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, ipv4_is_zeronet(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) - goto out; + if (ip_rt_src_check) { + /* It is equivalent to + inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: @@ -2387,6 +2407,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. 
*/ + if (dev_out == NULL) { + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } + fl.oif = dev_out->ifindex; goto make_route; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0689fd..f6c0adf 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,9 @@ static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +int sysctl_tcp_use_sg = 1; +EXPORT_SYMBOL(sysctl_tcp_use_sg); + extern seqlock_t sysctl_port_range_lock; extern int sysctl_local_port_range[2]; @@ -411,6 +414,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .procname = "tcp_use_sg", + .data = &sysctl_tcp_use_sg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { @@ -578,6 +588,20 @@ static struct ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { + .procname = "tcp_max_tw_kmem_fraction", + .data = &sysctl_tcp_max_tw_kmem_fraction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_max_tw_buckets_ub", + .data = &sysctl_tcp_max_tw_buckets_ub, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ab341e..8387637 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,6 +272,10 @@ #include #include +#include +#include +#include + #include #include @@ -336,6 +340,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); + int check_send_space; poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) @@ -350,6 +355,21 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_err) mask = POLLERR; + check_send_space = 1; +#ifdef CONFIG_BEANCOUNTERS + if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { + unsigned long size; + size = MAX_TCP_HEADER + tp->mss_cache; + if (size > SOCK_MIN_UBCSPACE) + size = SOCK_MIN_UBCSPACE; + size = skb_charge_size(size); + if (ub_sock_makewres_tcp(sk, size)) { + check_send_space = 0; + ub_sock_sndqueueadd_tcp(sk, size); + } + } +#endif + /* * POLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -393,7 +413,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ @@ -637,7 +657,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (sk_wmem_schedule(sk, skb->truesize, skb)) { /* * Make sure that we have exactly size bytes * available to the caller, no more, no less. 
@@ -683,15 +703,22 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); + unsigned long chargesize = 0; if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + chargesize = skb_charge_size(MAX_TCP_HEADER + + tp->mss_cache); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); skb_entail(sk, skb); copy = size_goal; @@ -706,7 +733,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy, skb)) goto wait_for_memory; if (can_coalesce) { @@ -747,10 +774,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -787,12 +819,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, return res; } -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) - -static inline int select_size(struct sock *sk) +static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -851,6 +879,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, while (--iovlen >= 0) { int seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; + unsigned long chargesize = 0; iov++; @@ -861,18 +890,27 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { + unsigned long size; new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, select_size(sk), + size = select_size(sk, tp); + chargesize = skb_charge_size(MAX_TCP_HEADER + + size); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; + skb = sk_stream_alloc_skb(sk, size, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, + UB_TCPSNDBUF); /* * Check whether we can use HW checksum. @@ -918,6 +956,7 @@ new_segment: } else if (page) { if (off == PAGE_SIZE) { put_page(page); + ub_sock_tcp_detachpage(sk); TCP_PAGE(sk) = page = NULL; off = 0; } @@ -927,10 +966,13 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy, skb)) goto wait_for_memory; if (!page) { + chargesize = PAGE_SIZE; + if (ub_sock_tcp_chargepage(sk) < 0) + goto wait_for_ubspace; /* Allocate new cache page. 
*/ if (!(page = sk_stream_alloc_page(sk))) goto wait_for_memory; @@ -962,7 +1004,8 @@ new_segment: } else if (off + copy < PAGE_SIZE) { get_page(page); TCP_PAGE(sk) = page; - } + } else + ub_sock_tcp_detachpage(sk); } TCP_OFF(sk) = off + copy; @@ -993,10 +1036,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -1096,7 +1144,18 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + if (WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { + printk("KERNEL: assertion: skb==NULL || " + "before(tp->copied_seq, skb->end_seq)\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, + tp->copied_seq, tp->rcv_nxt); + printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", + skb->len, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + } #endif if (inet_csk_ack_scheduled(sk)) { @@ -1358,7 +1417,23 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - WARN_ON(!(flags & MSG_PEEK)); + if (WARN_ON(!(flags & MSG_PEEK))) { + printk("KERNEL: assertion: flags&MSG_PEEK\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + printk("skb->len=%d, *seq=%d, skb->seq=%d, " + "skb->end_seq=%d, offset=%d\n", + skb->len, *seq, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + offset); + } skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -1421,8 +1496,19 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tp->ucopy.len = len; - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); + if (WARN_ON(tp->copied_seq != tp->rcv_nxt && + !(flags & (MSG_PEEK | MSG_TRUNC)))) { + printk("KERNEL: assertion: tp->copied_seq == " + "tp->rcv_nxt || ...\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + } /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1833,7 +1919,7 @@ adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); /* It is the last release_sock in its life. It will remove backlog. 
*/ release_sock(sk); @@ -1884,12 +1970,19 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { + int orphans = ub_get_orphan_count(sk); + sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, - atomic_read(sk->sk_prot->orphan_count))) { - if (net_ratelimit()) + if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) { + int ubid = 0; +#ifdef CONFIG_USER_RESOURCE + ubid = sock_has_ubc(sk) ? + top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; +#endif printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + "sockets (%d in CT%d)\n", orphans, ubid); + } tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), @@ -1966,6 +2059,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->advmss = 65535; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2687,7 +2781,7 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -2756,6 +2850,11 @@ void __init tcp_init(void) sysctl_tcp_mem[1] = limit; sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096) + sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096; + if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096) + sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7abc6b8..84c400c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -71,6 +71,8 @@ #include #include +#include + int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; @@ -306,7 +308,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_memory_pressure) { + ub_tcp_rmem_allows_expand(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -376,6 +378,8 @@ static void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; + + ub_tcp_update_maxadvmss(sk); } /* 5. Recalculate window clamp after socket hit its memory bounds. 
*/ @@ -388,7 +392,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && + !ub_tcp_memory_pressure(sk) && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -3936,19 +3940,19 @@ static void tcp_ofo_queue(struct sock *sk) static int tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb)) return -1; } } @@ -4003,8 +4007,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) - goto drop; + tcp_try_rmem_schedule(sk, skb)) + goto drop_part; skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -4048,6 +4052,12 @@ out_of_window: drop: __kfree_skb(skb); return; + +drop_part: + if (after(tp->copied_seq, tp->rcv_nxt)) + tp->rcv_nxt = tp->copied_seq; + __kfree_skb(skb); + return; } /* Out of window. F.e. zero window probe. */ @@ -4074,7 +4084,7 @@ drop: TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb)) goto drop; /* Disable header prediction. */ @@ -4218,6 +4228,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy + header, GFP_ATOMIC); if (!nskb) return; + if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { + kfree_skb(nskb); + return; + } skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); skb_set_network_header(nskb, (skb_network_header(skb) - @@ -4345,7 +4359,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (tcp_memory_pressure) + else if (ub_tcp_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4410,7 +4424,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -4859,6 +4873,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; + /* This is OK not to try to free memory here. + * Do this below on slow path. 
Den */ + if (ub_tcprcvbuf_charge(sk, skb) < 0) + goto step5; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 011478e..153901c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -71,6 +71,8 @@ #include #include +#include + #include #include #include @@ -678,7 +680,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct tcp_timewait_sock *tcptw = tcp_twsk(sk); tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_rcv_wnd >> + (tw->tw_rcv_wscale & TW_WSCALE_MASK), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw) @@ -1155,6 +1158,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, }; +EXPORT_SYMBOL_GPL(tcp_request_sock_ops); #ifdef CONFIG_TCP_MD5SIG static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -1460,6 +1464,10 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; + struct user_beancounter *ub; + + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG /* * We really want to reject the packet as early as possible @@ -1478,7 +1486,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto reset; } TCP_CHECK_TIMER(sk); - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1494,7 +1502,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) rsk = nsk; goto reset; } - return 0; + goto restore_context; } } @@ -1504,6 +1512,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto reset; } TCP_CHECK_TIMER(sk); + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1515,7 +1526,7 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. */ - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); @@ -1778,6 +1789,8 @@ static int tcp_v4_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; + tp->advmss = 65535; /* max value */ + tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1839,6 +1852,8 @@ void tcp_v4_destroy_sock(struct sock *sk) * If sendmsg cached page exists, toss it. */ if (sk->sk_sndmsg_page) { + /* queue is empty, uncharge */ + ub_sock_tcp_detachpage(sk); __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } @@ -2390,6 +2405,87 @@ void __init tcp_v4_init(void) panic("Failed to create the TCP control socket.\n"); } +#ifdef CONFIG_VE +static void tcp_kill_ve_onesk(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check the assumed state of the socket. */ + if (!sock_flag(sk, SOCK_DEAD)) { + static int printed; +invalid: + if (!printed) + printk(KERN_DEBUG "Killing sk: dead %d, state %d, " + "wrseq %u unseq %u, wrqu %d.\n", + sock_flag(sk, SOCK_DEAD), sk->sk_state, + tp->write_seq, tp->snd_una, + !skb_queue_empty(&sk->sk_write_queue)); + printed = 1; + return; + } + + tcp_send_active_reset(sk, GFP_ATOMIC); + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSING: + /* In these 2 states the peer may want us to retransmit + * some data and/or FIN. Entering "resetting mode" + * instead. + */ + tcp_time_wait(sk, TCP_CLOSE, 0); + break; + case TCP_FIN_WAIT2: + /* By some reason the socket may stay in this state + * without turning into a TW bucket. Fix it. 
+ */ + tcp_time_wait(sk, TCP_FIN_WAIT2, 0); + break; + case TCP_LAST_ACK: + /* Just jump into CLOSED state. */ + tcp_done(sk); + break; + default: + /* The socket must be already close()d. */ + goto invalid; + } +} + +void tcp_v4_kill_ve_sockets(struct ve_struct *envid) +{ + struct inet_ehash_bucket *head; + int i; + + /* alive */ + local_bh_disable(); + head = tcp_hashinfo.ehash; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_node *node; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +more_work: + write_lock(lock); + sk_for_each(sk, node, &head[i].chain) { + if (ve_accessible_strict(sk->owner_env, envid)) { + sock_hold(sk); + write_unlock(lock); + + bh_lock_sock(sk); + /* sk might have disappeared from the hash before + * we got the lock */ + if (sk->sk_state != TCP_CLOSE) + tcp_kill_ve_onesk(sk); + bh_unlock_sock(sk); + sock_put(sk); + goto more_work; + } + } + write_unlock(lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); +#endif + EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_prot); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f976fc5..5ce52dd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -26,6 +26,9 @@ #include #include +#include +#include + #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else @@ -36,6 +39,11 @@ int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; EXPORT_SYMBOL(sysctl_tcp_syncookies); int sysctl_tcp_abort_on_overflow __read_mostly; +int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384; +int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536; + +EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction); +EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub); struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, @@ -51,6 +59,7 @@ struct inet_timewait_death_row tcp_death_row = { .twcal_hand = -1, .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), + .ub_managed = 1, }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -279,7 +288,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &tcp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -292,6 +302,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + if (sk->sk_user_data != NULL) + tw->tw_rcv_wscale |= TW_WSCALE_SPEC; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -326,6 +338,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } } while (0); #endif + tw->tw_owner_env = VEID(sk->owner_env); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); @@ -346,11 +359,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) TCP_TIMEWAIT_LEN); inet_twsk_put(tw); } else { + int ubid = 0; /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid; +#endif + LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid); } tcp_update_metrics(sk); @@ -391,6 +409,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *newtp; /* Now setup tcp_sock */ + newsk->owner_env = sk->owner_env; + newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8165f5a..b46e764 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,9 @@ #include #include +#include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -565,6 +568,13 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, return size; } +static int skb_header_size(struct sock *sk, int tcp_hlen) +{ + struct ip_options *opt = inet_sk(sk)->opt; + return tcp_hlen + sizeof(struct iphdr) + + (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */; +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -589,6 +599,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, __u8 *md5_hash_location; struct tcphdr *th; int err; + int header_size; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -619,6 +630,20 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); + /* Unfortunately, we can have skb from outside world here + * with size insufficient for header. It is impossible to make + * guess when we queue skb, so the decision should be made + * here. Den + */ + header_size = skb_header_size(sk, tcp_header_size); + if (skb->data - header_size < skb->head) { + int delta = header_size - skb_headroom(skb); + err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta), + 0, GFP_ATOMIC); + if (err) + return err; + } + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -755,15 +780,21 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; + if (skb_cloned(skb) && skb_is_nonlinear(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + ub_skb_uncharge(skb); + ub_tcpsndbuf_charge_forced(sk, skb); + } /* Get a new skb... force flag on. */ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. 
*/ + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -1270,6 +1301,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(buff == NULL)) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } + sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; @@ -1705,7 +1741,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (ub_tcp_shrink_rcvbuf(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2153,6 +2189,7 @@ void tcp_send_fin(struct sock *sk) break; yield(); } + ub_tcpsndbuf_charge_forced(sk, skb); /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); @@ -2211,6 +2248,10 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, skb) < 0) { + kfree_skb(nskb); + return -ENOMEM; + } tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); @@ -2320,6 +2361,7 @@ static void tcp_connect_init(struct sock *sk) struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + static int once = 0; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -2339,9 +2381,23 @@ static void tcp_connect_init(struct sock *sk) tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); + if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) { + once = 1; + + printk("Oops in connect_init! dst->advmss=%d\n", + dst_metric(dst, RTAX_ADVMSS)); + printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU)); + printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, " + "advmss=%d, user_mss=%d\n", + sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss, + tp->mss_cache, tp->advmss, tp->rx_opt.user_mss); + } + if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->advmss == 0) + tp->advmss = 1460; tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2382,6 +2438,10 @@ int tcp_connect(struct sock *sk) buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOBUFS; + } /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5ab6ba1..25ed21e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -20,6 +20,8 @@ #include #include +#include +#include int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; @@ -65,7 +67,8 @@ static void tcp_write_err(struct sock *sk) static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = atomic_read(&tcp_orphan_count); + int orphans = ub_get_orphan_count(sk); + int orph = orphans; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. 
*/ @@ -76,10 +79,16 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) orphans <<= 1; - if (tcp_too_many_orphans(sk, orphans)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) { + int ubid = 0; +#ifdef CONFIG_USER_RESOURCE + ubid = sock_has_ubc(sk) ? + top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; +#endif + printk(KERN_INFO "Orphaned socket dropped " + "(%d,%d in CT%d)\n", orph, orphans, ubid); + } /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || @@ -172,9 +181,12 @@ static int tcp_write_timeout(struct sock *sk) static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ @@ -223,11 +235,12 @@ static void tcp_delack_timer(unsigned long data) TCP_CHECK_TIMER(sk); out: - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } static void tcp_probe_timer(struct sock *sk) @@ -282,8 +295,11 @@ static void tcp_probe_timer(struct sock *sk) static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + if (!tp->packets_out) goto out; @@ -391,15 +407,19 @@ out_reset_timer: if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); -out:; +out: + (void)set_exec_env(env); } static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); int event; + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ @@ -433,6 +453,7 @@ out: out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } /* @@ -460,10 +481,13 @@ void tcp_set_keepalive(struct sock *sk, int val) static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; + env = set_exec_env(sk->owner_env); + /* Only process if socket is not in use. 
*/ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -535,4 +559,5 @@ death: out: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7b6a584..03be399 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -404,9 +404,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_TUNNEL6 || dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { - printk(KERN_INFO - "%s: Disabled Privacy Extensions\n", - dev->name); + ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n", + dev->name)); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); @@ -612,7 +611,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out; } - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC); if (ifa == NULL) { ADBG(("ipv6_add_addr: malloc failed\n")); @@ -2070,7 +2069,7 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, +int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) { @@ -2142,6 +2141,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, return PTR_ERR(ifp); } +EXPORT_SYMBOL_GPL(inet6_addr_add); static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, unsigned int plen) @@ -2187,7 +2187,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2206,7 +2206,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2709,6 +2709,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct ve_struct *old_env; + + old_env = set_exec_env(ifp->idev->dev->owner_env); if (ifp->idev->cnf.forwarding) goto out; @@ -2743,6 +2746,7 @@ static void addrconf_rs_timer(unsigned long data) out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } /* @@ -2819,7 +2823,9 @@ static void addrconf_dad_timer(unsigned long data) struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; + struct ve_struct *old_env; + old_env = set_exec_env(ifp->idev->dev->owner_env); read_lock_bh(&idev->lock); if (idev->dead) { read_unlock_bh(&idev->lock); @@ -2855,6 +2861,7 @@ static void addrconf_dad_timer(unsigned long data) ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) @@ -3077,6 +3084,7 @@ static void addrconf_verify(unsigned long foo) struct inet6_ifaddr *ifp; unsigned long now, next; int i; + struct ve_struct *old_env; spin_lock_bh(&addrconf_verify_lock); now = jiffies; @@ -3097,6 +3105,8 @@ restart: if (ifp->flags & IFA_F_PERMANENT) continue; + old_env = set_exec_env(ifp->idev->dev->owner_env); + spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; @@ -3112,9 +3122,11 @@ restart: in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); 
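/* A note on the pattern added in this addrconf_verify() hunk: the function
 * fires from a timer outside any particular VE, so each address is handled
 * under set_exec_env(ifp->idev->dev->owner_env), and old_env is put back on
 * every exit from the iteration (each "goto restart" path, the "continue",
 * and the normal loop tail). Restoring on every early exit is what keeps the
 * VE context from leaking past the ifp currently being processed. */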
ipv6_del_addr(ifp); + (void)set_exec_env(old_env); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); + set_exec_env(old_env); continue; } else if (age >= ifp->prefered_lft) { /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ @@ -3136,6 +3148,7 @@ restart: ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } #ifdef CONFIG_IPV6_PRIVACY @@ -3157,6 +3170,7 @@ restart: ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -3169,6 +3183,7 @@ restart: next = ifp->tstamp + ifp->prefered_lft * HZ; spin_unlock(&ifp->lock); } + (void)set_exec_env(old_env); } read_unlock(&addrconf_hash_lock); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 95055f8..58036a8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -56,6 +56,10 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#ifdef CONFIG_IPV6_MIP6 +#include +#endif +#include #include #include @@ -140,6 +144,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -157,6 +165,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET6, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + sock_init_data(sock, sk); err = 0; @@ -231,6 +246,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -794,45 +812,48 @@ static void ipv6_packet_cleanup(void) dev_remove_pack(&ipv6_packet_type); } -static int __init init_ipv6_mibs(void) +int init_ipv6_mibs(void) { - if (snmp_mib_init((void **)ipv6_statistics, + if (snmp_mib_init((void **)ve_ipv6_statistics, sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)icmpv6_statistics, + if (snmp_mib_init((void **)ve_icmpv6_statistics, sizeof(struct icmpv6_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void **)icmpv6msg_statistics, + if (snmp_mib_init((void **)ve_icmpv6msg_statistics, sizeof(struct icmpv6msg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0) + if (snmp_mib_init((void **)ve_udp_stats_in6, + sizeof (struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)udplite_stats_in6, + if (snmp_mib_init((void **)ve_udplite_stats_in6, sizeof (struct udp_mib)) < 0) goto err_udplite_mib; return 0; err_udplite_mib: - snmp_mib_free((void **)udp_stats_in6); + snmp_mib_free((void **)ve_udp_stats_in6); err_udp_mib: - snmp_mib_free((void **)icmpv6msg_statistics); + snmp_mib_free((void **)ve_icmpv6msg_statistics); err_icmpmsg_mib: - snmp_mib_free((void **)icmpv6_statistics); + snmp_mib_free((void **)ve_icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void **)ipv6_statistics); + snmp_mib_free((void **)ve_ipv6_statistics); err_ip_mib: return -ENOMEM; } +EXPORT_SYMBOL(init_ipv6_mibs); -static void cleanup_ipv6_mibs(void) +void cleanup_ipv6_mibs(void) { - snmp_mib_free((void **)ipv6_statistics); - snmp_mib_free((void **)icmpv6_statistics); - snmp_mib_free((void **)icmpv6msg_statistics); - snmp_mib_free((void **)udp_stats_in6); - snmp_mib_free((void **)udplite_stats_in6); + snmp_mib_free((void 
**)ve_ipv6_statistics); + snmp_mib_free((void **)ve_icmpv6_statistics); + snmp_mib_free((void **)ve_icmpv6msg_statistics); + snmp_mib_free((void **)ve_udp_stats_in6); + snmp_mib_free((void **)ve_udplite_stats_in6); } +EXPORT_SYMBOL(cleanup_ipv6_mibs); static int inet6_net_init(struct net *net) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 29c7c99..2990d66 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -182,11 +182,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1); - /* - * No protection necessary, this is the only list mutatation - * operation, tables never disappear once they exist. - */ + write_lock_bh(&tb->tb6_lock); hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); + write_unlock_bh(&tb->tb6_lock); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -1370,10 +1368,14 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + struct ve_struct *old_env; + + old_env = set_exec_env(table->owner_env); write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, prune, arg); write_unlock_bh(&table->tb6_lock); + (void)set_exec_env(old_env); } } rcu_read_unlock(); @@ -1493,6 +1495,9 @@ static int fib6_net_init(struct net *net) if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; +#ifdef CONFIG_VE + net->ipv6.fib6_main_tbl->owner_env = get_exec_env(); +#endif net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_main_tbl->tb6_root.fn_flags = @@ -1503,6 +1508,10 @@ static int fib6_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; + +#ifdef CONFIG_VE + net->ipv6.fib6_local_tbl->owner_env = get_exec_env(); +#endif net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_local_tbl->tb6_root.fn_flags = @@ -1548,7 +1557,7 @@ int __init fib6_init(void) fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (!fib6_node_kmem) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3df2c44..19dcecc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -529,6 +529,20 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. 
+ */ + hdr = ipv6_hdr(skb); + if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ + goto no_ttl_decr; + if (skb_cow(skb, dst->dev->hard_header_len)) { IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -540,6 +554,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; +no_ttl_decr: IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index e7c03bc..614a5b4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -243,6 +243,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } +EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); /* * socket leave on multicast group @@ -2195,15 +2196,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma) static void mld_gq_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); idev->mc_gq_running = 0; mld_send_report(idev, NULL); __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2212,6 +2216,7 @@ static void mld_ifc_timer_expire(unsigned long data) mld_ifc_start_timer(idev, idev->mc_maxdelay); } __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_event(struct inet6_dev *idev) @@ -2226,6 +2231,7 @@ static void mld_ifc_event(struct inet6_dev *idev) static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); if (MLD_V1_SEEN(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); @@ -2237,6 +2243,7 @@ static void igmp6_timer_handler(unsigned long data) ma->mca_flags &= ~MAF_TIMER_RUNNING; spin_unlock(&ma->mca_lock); ma_put(ma); + set_exec_env(old_env); } /* Device going down */ diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 5859c04..5bbe792 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -439,7 +439,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -469,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -480,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -502,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 0b4557e..8244d02 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ 
b/net/ipv6/netfilter/ip6_tables.c @@ -365,6 +365,9 @@ ip6t_do_table(struct sk_buff *skb, struct ip6t_entry *e, *back; struct xt_table_info *private; + if (!table) /* VE is not allowed to have this xtable */ + return NF_ACCEPT; + /* Initialization */ indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; @@ -1874,7 +1877,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1985,7 +1988,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2084,7 +2087,7 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *table, int ret; struct xt_table_info *newinfo; struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; struct xt_table *new_table; @@ -2241,11 +2244,22 @@ static struct xt_match icmp6_matchstruct __read_mostly = { static int __net_init ip6_tables_net_init(struct net *net) { - return xt_proto_init(net, AF_INET6); + int res; + + if (!net_ipt_module_permitted(net, VE_IP_IPTABLES6)) + return 0; + + res = xt_proto_init(net, AF_INET6); + if (!res) + net_ipt_module_set(net, VE_IP_IPTABLES6); + return res; } static void __net_exit ip6_tables_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6)) + return; + xt_proto_fini(net, AF_INET6); } diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 55a2c29..91bcf27 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -133,16 +133,24 @@ module_param(forward, bool, 0000); static int __net_init ip6table_filter_net_init(struct net *net) { + if (!net_ipt_module_permitted(net, VE_IP_FILTER6)) + return 0; + /* Register table */ net->ipv6.ip6table_filter = ip6t_register_table(net, &packet_filter, &initial_table.repl); if (IS_ERR(net->ipv6.ip6table_filter)) return PTR_ERR(net->ipv6.ip6table_filter); + + net_ipt_module_set(net, VE_IP_FILTER6); return 0; } static void __net_exit ip6table_filter_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_FILTER6)) + return; + ip6t_unregister_table(net->ipv6.ip6table_filter); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index f405cea..a4727b3 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -160,16 +160,24 @@ static struct nf_hook_ops ip6t_ops[] __read_mostly = { static int __net_init ip6table_mangle_net_init(struct net *net) { + if (!net_ipt_module_permitted(net, VE_IP_MANGLE6)) + return 0; + /* Register table */ net->ipv6.ip6table_mangle = ip6t_register_table(net, &packet_mangler, &initial_table.repl); if (IS_ERR(net->ipv6.ip6table_mangle)) return PTR_ERR(net->ipv6.ip6table_mangle); + + net_ipt_module_set(net, VE_IP_MANGLE6); return 0; } static void __net_exit ip6table_mangle_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_MANGLE6)) + return; + ip6t_unregister_table(net->ipv6.ip6table_mangle); } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 85050c0..a782710 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include 
#include @@ -359,72 +360,157 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); -static int __init nf_conntrack_l3proto_ipv6_init(void) +static int nf_ct_proto_ipv6_init_net(struct net *net) { - int ret = 0; + struct nf_conntrack_l3proto *ipv6; + + ipv6 = &nf_conntrack_l3proto_ipv6; + if (net != &init_net) { + ipv6 = kmemdup(ipv6, + sizeof(struct nf_conntrack_l3proto), GFP_KERNEL); + if (!ipv6) + return -ENOMEM; + } - need_conntrack(); + net->ipv6.nf_conntrack_l3proto_ipv6 = ipv6; + return 0; +} - ret = nf_ct_frag6_init(); - if (ret < 0) { - printk("nf_conntrack_ipv6: can't initialize frag6.\n"); - return ret; - } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); +static void nf_ct_proto_ipv6_exit_net(struct net *net) +{ + if (net != &init_net) + kfree(net->ipv6.nf_conntrack_l3proto_ipv6); +} + +static struct pernet_operations nf_ct_ipv6_ops = { + .init = nf_ct_proto_ipv6_init_net, + .exit = nf_ct_proto_ipv6_exit_net, +}; + +int init_nf_ct_l3proto_ipv6(void) +{ + struct net *net = get_exec_env()->ve_netns; + + int ret = -ENOMEM; + +#ifdef CONFIG_VE_IPTABLES + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); + + ret = nf_ct_proto_icmpv6_sysctl_init(); + if (ret < 0) + goto no_mem_icmp; +#endif /* CONFIG_VE_IPTABLES */ + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register tcp.\n"); - goto cleanup_frag6; + goto cleanup_sys; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register udp.\n"); - goto cleanup_tcp; + goto unreg_tcp; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmpv6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register icmpv6.\n"); - goto cleanup_udp; + goto unreg_udp; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + ret = nf_conntrack_l3proto_register(net->ipv6.nf_conntrack_l3proto_ipv6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register ipv6\n"); - goto cleanup_icmpv6; + goto unreg_icmpv6; + } + + return 0; + +unreg_icmpv6: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); +unreg_udp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); +unreg_tcp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); +cleanup_sys: +#ifdef CONFIG_VE_IPTABLES +no_mem_icmp: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ + return ret; +} +EXPORT_SYMBOL(init_nf_ct_l3proto_ipv6); + +void fini_nf_ct_l3proto_ipv6(void) +{ + struct net *net = get_exec_env()->ve_netns; + + nf_conntrack_l3proto_unregister(net->ipv6.nf_conntrack_l3proto_ipv6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); + +#ifdef CONFIG_VE_IPTABLES + nf_ct_proto_icmpv6_sysctl_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ +} +EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv6); + +static int __init nf_conntrack_l3proto_ipv6_init(void) +{ + int ret = 0; + + need_conntrack(); + + register_pernet_subsys(&nf_ct_ipv6_ops); + + ret = nf_ct_frag6_init(); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + goto 
unreg_subsys; + } + + ret = init_nf_ct_l3proto_ipv6(); + if (ret < 0) { + printk(KERN_ERR "Unable to initialize netfilter protocols\n"); + goto cleanup_frag6; } ret = nf_register_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register pre-routing defrag " - "hook.\n"); - goto cleanup_ipv6; + printk(KERN_ERR "nf_conntrack_ipv6: can't register pre-routing " + "defrag hook.\n"); + goto cleanup_l3proto; } - return ret; + KSYMRESOLVE(init_nf_ct_l3proto_ipv6); + KSYMRESOLVE(fini_nf_ct_l3proto_ipv6); + KSYMMODRESOLVE(nf_conntrack_ipv6); + return 0; - cleanup_ipv6: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - cleanup_icmpv6: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - cleanup_frag6: +cleanup_l3proto: + fini_nf_ct_l3proto_ipv6(); +cleanup_frag6: nf_ct_frag6_cleanup(); +unreg_subsys: + unregister_pernet_subsys(&nf_ct_ipv6_ops); return ret; } static void __exit nf_conntrack_l3proto_ipv6_fini(void) { synchronize_net(); + KSYMMODUNRESOLVE(nf_conntrack_ipv6); + KSYMUNRESOLVE(init_nf_ct_l3proto_ipv6); + KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv6); nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + fini_nf_ct_l3proto_ipv6(); nf_ct_frag6_cleanup(); + unregister_pernet_subsys(&nf_ct_ipv6_ops); } module_init(nf_conntrack_l3proto_ipv6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 14d47d8..438b543 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -94,7 +95,7 @@ static int icmpv6_packet(struct nf_conn *ct, } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmpv6_timeout); } return NF_ACCEPT; @@ -130,6 +131,7 @@ icmpv6_error_message(struct sk_buff *skb, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + struct net *net = get_exec_env()->ve_netns; NF_CT_ASSERT(skb->nfct == NULL); @@ -149,7 +151,7 @@ icmpv6_error_message(struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ if (!nf_ct_invert_tuple(&intuple, &origtuple, - &nf_conntrack_l3proto_ipv6, inproto)) { + net->ipv6.nf_conntrack_l3proto_ipv6, inproto)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } @@ -281,3 +283,48 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = .ctl_table = icmpv6_sysctl_table, #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_icmpv6_sysctl_init(void) +{ + struct nf_conntrack_l4proto *icmp6; + + if (ve_is_super(get_exec_env())) { + icmp6 = &nf_conntrack_l4proto_icmpv6; + goto out; + } + + icmp6 = kmemdup(&nf_conntrack_l4proto_icmpv6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!icmp6) + goto no_mem_ct; + + icmp6->ctl_table_header = &ve_icmpv6_sysctl_header; + icmp6->ctl_table = kmemdup(icmpv6_sysctl_table, + sizeof(icmpv6_sysctl_table), GFP_KERNEL); + if (!icmp6->ctl_table) + goto no_mem_sys; + + icmp6->ctl_table[0].data = &ve_nf_ct_icmpv6_timeout; +out: + ve_nf_ct_icmpv6_timeout = nf_ct_icmpv6_timeout; + + ve_nf_conntrack_l4proto_icmpv6 = icmp6; + return 0; + +no_mem_sys: + kfree(icmp6); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_init); + +void nf_ct_proto_icmpv6_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { + kfree(ve_nf_conntrack_l4proto_icmpv6->ctl_table); + kfree(ve_nf_conntrack_l4proto_icmpv6); + } +} +EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 52d06dd..fa67851 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -145,11 +146,12 @@ static void nf_skb_free(struct sk_buff *skb) } /* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +static inline void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, unsigned int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_init_frags.mem); + atomic_sub(skb->truesize, &nf->mem); nf_skb_free(skb); kfree_skb(skb); } @@ -169,10 +171,10 @@ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) inet_frag_kill(&fq->q, &nf_frags); } -static void nf_ct_frag6_evictor(void) +static void nf_ct_frag6_evictor(struct netns_frags *nf) { local_bh_disable(); - inet_frag_evictor(&nf_init_frags, &nf_frags); + inet_frag_evictor(nf, &nf_frags); local_bh_enable(); } @@ -198,7 +200,7 @@ out: /* Creation primitives. 
*/ static __inline__ struct nf_ct_frag6_queue * -fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) +fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -211,7 +213,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) read_lock_bh(&nf_frags.lock); hash = ip6qhashfn(id, src, dst); - q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.ct_frags, &nf_frags, &arg, hash); local_bh_enable(); if (q == NULL) goto oom; @@ -224,7 +226,8 @@ oom: } -static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, +static int nf_ct_frag6_queue(struct net *net, struct nf_ct_frag6_queue *fq, + struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { struct sk_buff *prev, *next; @@ -365,7 +368,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->q.fragments = next; fq->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(fq->q.net, free_it, NULL); } } @@ -381,7 +384,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, skb->dev = NULL; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_init_frags.mem); + atomic_add(skb->truesize, &net->ipv6.ct_frags.mem); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -391,7 +394,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->q.last_in |= INET_FRAG_FIRST_IN; } write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); + list_move_tail(&fq->q.lru_list, &net->ipv6.ct_frags.lru_list); write_unlock(&nf_frags.lock); return 0; @@ -409,7 +412,8 @@ err: * the last and the first frames arrived and all the bits are here. */ static struct sk_buff * -nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +nf_ct_frag6_reasm(struct net *net, struct nf_ct_frag6_queue *fq, + struct net_device *dev) { struct sk_buff *fp, *op, *head = fq->q.fragments; int payload_len; @@ -458,7 +462,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_init_frags.mem); + atomic_add(clone->truesize, &net->ipv6.ct_frags.mem); } /* We have to remove fragment header from datagram and to relocate @@ -472,7 +476,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_init_frags.mem); + atomic_sub(head->truesize, &net->ipv6.ct_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -482,7 +486,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_init_frags.mem); + atomic_sub(fp->truesize, &net->ipv6.ct_frags.mem); } head->next = NULL; @@ -599,6 +603,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) int fhoff, nhoff; u8 prevhdr; struct sk_buff *ret_skb = NULL; + struct net *net = dev_net(dev); /* Jumbo payload inhibits frag. 
header */ if (ipv6_hdr(skb)->payload_len == 0) { @@ -632,10 +637,11 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) goto ret_orig; } - if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) - nf_ct_frag6_evictor(); + if (atomic_read(&net->ipv6.ct_frags.mem) > + net->ipv6.ct_frags.high_thresh) + nf_ct_frag6_evictor(&net->ipv6.ct_frags); - fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; @@ -643,7 +649,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + if (nf_ct_frag6_queue(net, fq, clone, fhdr, nhoff) < 0) { spin_unlock_bh(&fq->q.lock); pr_debug("Can't insert skb to queue\n"); fq_put(fq); @@ -652,7 +658,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { - ret_skb = nf_ct_frag6_reasm(fq, dev); + ret_skb = nf_ct_frag6_reasm(net, fq, dev); if (ret_skb == NULL) pr_debug("Can't reassemble fragmented packets\n"); } @@ -687,8 +693,54 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, nf_conntrack_put_reasm(skb); } +static int nf_ct_frag6_init_net(struct net *net) +{ + struct netns_frags *frags = &net->ipv6.ct_frags; + +#ifdef CONFIG_SYSCTL + if (net != &init_net) { + struct nf_conntrack_l3proto *ipv6 = + net->ipv6.nf_conntrack_l3proto_ipv6; + + ipv6->ctl_table = kmemdup(nf_ct_ipv6_sysctl_table, + sizeof(nf_ct_ipv6_sysctl_table), + GFP_KERNEL); + if (!ipv6->ctl_table) + return -ENOMEM; + + ipv6->ctl_table_header = NULL; + ipv6->ctl_table_path = nf_net_netfilter_sysctl_path; + + ipv6->ctl_table[0].data = &frags->timeout; + ipv6->ctl_table[1].data = &frags->low_thresh; + ipv6->ctl_table[2].data = &frags->high_thresh; + } +#endif + frags->timeout = IPV6_FRAG_TIMEOUT; + frags->high_thresh = 256 * 1024; + frags->low_thresh = 192 * 1024; + inet_frags_init_net(frags); + + return 0; +} + +static void nf_ct_frag6_exit_net(struct net *net) +{ + inet_frags_exit_net(&net->ipv6.ct_frags, &nf_frags); + if (net != &init_net) + kfree(net->ipv6.nf_conntrack_l3proto_ipv6->ctl_table); + +} + +static struct pernet_operations nf_ct_frag6_ops = { + .init = nf_ct_frag6_init_net, + .exit = nf_ct_frag6_exit_net, +}; + int nf_ct_frag6_init(void) { + register_pernet_subsys(&nf_ct_frag6_ops); + nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; @@ -697,10 +749,6 @@ int nf_ct_frag6_init(void) nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.secret_interval = 10 * 60 * HZ; - nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; - nf_init_frags.high_thresh = 256 * 1024; - nf_init_frags.low_thresh = 192 * 1024; - inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); return 0; @@ -709,7 +757,5 @@ int nf_ct_frag6_init(void) void nf_ct_frag6_cleanup(void) { inet_frags_fini(&nf_frags); - - nf_init_frags.low_thresh = 0; - nf_ct_frag6_evictor(); + unregister_pernet_subsys(&nf_ct_frag6_ops); } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 0179b66..652b92b 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -29,8 +29,6 @@ #include #include -static struct proc_dir_entry *proc_net_devsnmp6; - static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; @@ -172,11 +170,11 @@ static int snmp6_seq_show(struct seq_file *seq, 
void *v) snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg); } else { - snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics); - snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); - snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list); + snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list); + snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, (void **)ve_icmpv6msg_statistics); + snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list); + snmp6_seq_show_item(seq, (void **)ve_udplite_stats_in6, snmp6_udplite6_list); } return 0; } @@ -210,18 +208,17 @@ static const struct file_operations snmp6_seq_fops = { int snmp6_register_dev(struct inet6_dev *idev) { struct proc_dir_entry *p; + struct net *net; if (!idev || !idev->dev) return -EINVAL; - if (!net_eq(dev_net(idev->dev), &init_net)) - return 0; - - if (!proc_net_devsnmp6) + net = dev_net(idev->dev); + if (!net->ipv6.proc_dev_snmp) return -ENOENT; p = proc_create_data(idev->dev->name, S_IRUGO, - proc_net_devsnmp6, &snmp6_seq_fops, idev); + net->ipv6.proc_dev_snmp, &snmp6_seq_fops, idev); if (!p) return -ENOMEM; @@ -231,12 +228,14 @@ int snmp6_register_dev(struct inet6_dev *idev) int snmp6_unregister_dev(struct inet6_dev *idev) { - if (!proc_net_devsnmp6) + struct net *net = dev_net(idev->dev); + + if (!net->ipv6.proc_dev_snmp) return -ENOENT; if (!idev || !idev->stats.proc_dir_entry) return -EINVAL; remove_proc_entry(idev->stats.proc_dir_entry->name, - proc_net_devsnmp6); + net->ipv6.proc_dev_snmp); idev->stats.proc_dir_entry = NULL; return 0; } @@ -245,12 +244,24 @@ static int ipv6_proc_init_net(struct net *net) { if (!proc_net_fops_create(net, "sockstat6", S_IRUGO, &sockstat6_seq_fops)) - return -ENOMEM; + goto err_sockstat; + + net->ipv6.proc_dev_snmp = proc_net_mkdir(net, + "dev_snmp6", net->proc_net); + if (!net->ipv6.proc_dev_snmp) + goto err_dev_snmp; + return 0; + +err_dev_snmp: + proc_net_remove(net, "sockstat6"); +err_sockstat: + return -ENOMEM; } static void ipv6_proc_exit_net(struct net *net) { + proc_net_remove(net, "dev_snmp6"); proc_net_remove(net, "sockstat6"); } @@ -269,14 +280,9 @@ int __init ipv6_misc_proc_init(void) if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops)) goto proc_snmp6_fail; - proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net); - if (!proc_net_devsnmp6) - goto proc_dev_snmp6_fail; out: return rc; -proc_dev_snmp6_fail: - proc_net_remove(&init_net, "snmp6"); proc_snmp6_fail: unregister_pernet_subsys(&ipv6_proc_ops); proc_net_fail: @@ -286,7 +292,6 @@ proc_net_fail: void ipv6_misc_proc_exit(void) { - proc_net_remove(&init_net, "dev_snmp6"); proc_net_remove(&init_net, "snmp6"); unregister_pernet_subsys(&ipv6_proc_ops); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 89184b5..e01bbb1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -196,8 +196,10 @@ static void ip6_frag_expire(unsigned long data) struct frag_queue *fq; struct net_device *dev = NULL; struct net *net; + struct ve_struct *old_ve; fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + old_ve = set_exec_env(fq->q.owner_ve); spin_lock(&fq->q.lock); @@ -232,6 +234,8 @@ out: dev_put(dev); 
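/* Same timer-context discipline as in the tcp_timer.c hunks above:
 * ip6_frag_expire() runs from a timer, so the owning VE is entered with
 * set_exec_env(fq->q.owner_ve) when the handler starts, and old_ve is
 * restored once on the common exit path just below, after the queue
 * reference has been dropped. */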
spin_unlock(&fq->q.lock); fq_put(fq); + + (void)set_exec_env(old_ve); } static __inline__ struct frag_queue * @@ -508,6 +512,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, clone->csum = 0; clone->ip_summed = head->ip_summed; atomic_add(clone->truesize, &fq->q.net->mem); + clone->owner_env = head->owner_env; } /* We have to remove fragment header from datagram and to relocate diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 63442a1..803129d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1889,10 +1889,12 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { - dst_free(&rt->u.dst); - return ERR_PTR(-ENOMEM); + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); + if (IS_ERR(rt->rt6i_nexthop)) { + void *err = rt->rt6i_nexthop; + rt->rt6i_nexthop = NULL; + dst_free((struct dst_entry *) rt); + return err; } ipv6_addr_copy(&rt->rt6i_dst.addr, addr); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b7a50e9..9a50bc6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,9 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); + if (sitn == NULL) + return NULL; + for (t = sitn->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) @@ -1005,6 +1009,9 @@ static int sit_init_net(struct net *net) int err; struct sit_net *sitn; + if (!(get_exec_env()->features & VE_FEATURE_SIT)) + return 0; + err = -ENOMEM; sitn = kzalloc(sizeof(struct sit_net), GFP_KERNEL); if (sitn == NULL) @@ -1049,6 +1056,9 @@ static void sit_exit_net(struct net *net) struct sit_net *sitn; sitn = net_generic(net, sit_net_id); + if (sitn == NULL) /* no VE_FEATURE_SIT */ + return; + rtnl_lock(); sit_destroy_tunnels(sitn); unregister_netdevice(sitn->fb_tunnel_dev); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 10e22fd..f259d80 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -60,6 +60,8 @@ #include #include +#include + #include #include @@ -74,7 +76,7 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static struct inet_connection_sock_af_ops ipv6_mapped; +struct inet_connection_sock_af_ops ipv6_mapped; static struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv6_specific; @@ -1521,6 +1523,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; struct sk_buff *opt_skb = NULL; + struct user_beancounter *ub; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. 
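The tcp_v6_do_rcv() hunks around this point mirror the IPv4 receive path: the socket's beancounter becomes the current charging context via set_exec_ub(sock_bc(sk)->ub) on entry, and every former "return 0" is redirected through a single restore_context label so the previous context is always put back. The following user-space sketch only illustrates that save-and-restore discipline; current_ub, enter_ub() and do_rcv() are illustrative names standing in for the kernel's set_exec_ub() machinery, not the real API.

#include <stdio.h>

/* Illustrative stand-in for the per-task charging context. */
struct ub { const char *name; };

static struct ub *current_ub;                  /* "exec" beancounter of this task */

static struct ub *enter_ub(struct ub *new_ub)  /* swap contexts, return the old one */
{
        struct ub *old = current_ub;
        current_ub = new_ub;
        return old;
}

static int do_rcv(struct ub *sock_ub, int bad_packet)
{
        struct ub *old = enter_ub(sock_ub);    /* bill allocations to the socket owner */
        int ret = 0;

        if (bad_packet) {
                ret = -1;
                goto restore_context;          /* no bare "return" past this point */
        }

        /* ... process the segment; memory is charged to sock_ub here ... */

restore_context:
        enter_ub(old);                         /* single exit restores the old context */
        return ret;
}

int main(void)
{
        struct ub ve0 = { "ve0" }, ct101 = { "ct101" };

        current_ub = &ve0;
        do_rcv(&ct101, 1);
        printf("back in context %s\n", current_ub->name);
        return 0;
}

Funnelling every exit through one label means later patches can add new error paths without risking an unbalanced context switch.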
@@ -1533,6 +1536,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash (sk, skb)) goto discard; @@ -1569,7 +1574,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1590,7 +1595,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto reset; if (opt_skb) __kfree_skb(opt_skb); - return 0; + goto restore_context; } } @@ -1600,6 +1605,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1608,7 +1616,7 @@ discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb(skb); - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1640,7 +1648,7 @@ ipv6_pktoptions: if (opt_skb) kfree_skb(opt_skb); - return 0; + goto restore_context; } static int tcp_v6_rcv(struct sk_buff *skb) @@ -1823,7 +1831,7 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct inet_connection_sock_af_ops ipv6_mapped = { +struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1842,6 +1850,8 @@ static struct inet_connection_sock_af_ops ipv6_mapped = { #endif }; +EXPORT_SYMBOL_GPL(ipv6_mapped); + #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .md5_lookup = tcp_v4_md5_lookup, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 08e4cbb..77e5248 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr, if (saddr) memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src)); - dst = ip6_route_output(&init_net, NULL, &fl); + dst = ip6_route_output(get_exec_env()->ve_netns, NULL, &fl); err = dst->error; if (dst->error) { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 292fa28..6bf46b5 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops *reg) struct nf_hook_ops *elem; int err; + BUG_ON(!ve_is_super(get_exec_env())); + err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0) return err; @@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook); void nf_unregister_hook(struct nf_hook_ops *reg) { + BUG_ON(!ve_is_super(get_exec_env())); + mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); @@ -169,8 +173,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, struct net *net; net = indev == NULL ? 
dev_net(outdev) : dev_net(indev); - if (net != &init_net) - return 1; #endif /* We may already have this, but read-locks nest anyway */ diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 59bd8b9..25aca04 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -68,6 +68,9 @@ int nf_conntrack_acct_init(void) { int ret; + if (!ve_is_super(get_exec_env())) + return 0; + #ifdef CONFIG_NF_CT_ACCT printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n"); printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n"); @@ -97,6 +100,8 @@ int nf_conntrack_acct_init(void) void nf_conntrack_acct_fini(void) { + if (!ve_is_super(get_exec_env())) + return; #ifdef CONFIG_SYSCTL unregister_sysctl_table(acct_sysctl_header); #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9d1830d..ee7a3a4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -30,6 +30,8 @@ #include #include +#include + #include #include #include @@ -54,8 +56,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); +#ifndef CONFIG_VE_IPTABLES struct hlist_head *nf_conntrack_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash); +#endif struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); @@ -180,7 +184,14 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *old_ve; + + old_ve = set_exec_env(ct->ct_owner_env); +#endif pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -189,10 +200,17 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_conntrack_event(IPCT_DESTROY, ct); set_bit(IPS_DYING_BIT, &ct->status); + if (help && help->helper && help->helper->destroy) + help->helper->destroy(ct); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + if (l3proto && l3proto->destroy) + l3proto->destroy(ct); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) l4proto->destroy(ct); @@ -220,6 +238,9 @@ destroy_conntrack(struct nf_conntrack *nfct) pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); nf_conntrack_free(ct); +#ifdef CONFIG_VE_IPTABLES + (void)set_exec_env(old_ve); +#endif } static void death_by_timeout(unsigned long ul_conntrack) @@ -256,7 +277,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple) * at least once for the stats anyway.
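Note: the nf_conntrack_acct_init()/fini() hunks above, like several init paths later in this patch, share one gating pattern: global registrations happen only when the caller runs in the host environment (ve_is_super(get_exec_env())), while containers return early and reuse the host's state. A small userspace model of that gating; struct ve, exec_env and acct_registered are toy stand-ins, not names from the patch.

#include <stdbool.h>
#include <stdio.h>

struct ve { int id; };
static struct ve ve0 = { 0 };
static struct ve *exec_env = &ve0;              /* models the current VE */

static bool ve_is_super(const struct ve *ve) { return ve->id == 0; }
static struct ve *get_exec_env(void) { return exec_env; }

static int acct_registered;

/* host-only setup: containers skip it and share the host-wide state */
static int acct_init(void)
{
        if (!ve_is_super(get_exec_env()))
                return 0;               /* nothing to do inside a container */
        acct_registered = 1;            /* e.g. register a sysctl once      */
        return 0;
}

int main(void)
{
        struct ve ve200 = { 200 };

        acct_init();                    /* host: registers          */
        exec_env = &ve200;
        acct_init();                    /* container: silent no-op  */
        printf("registered=%d\n", acct_registered);
        return 0;
}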
*/ local_bh_disable(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) { if (nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); local_bh_enable(); @@ -295,9 +316,9 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int repl_hash) { hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, - &nf_conntrack_hash[hash]); + &ve_nf_conntrack_hash[hash]); hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, - &nf_conntrack_hash[repl_hash]); + &ve_nf_conntrack_hash[repl_hash]); } void nf_conntrack_hash_insert(struct nf_conn *ct) @@ -351,11 +372,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple)) goto out; - hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[repl_hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple)) goto out; @@ -406,7 +427,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * least once for the stats anyway. */ rcu_read_lock_bh(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); @@ -436,7 +457,7 @@ static noinline int early_drop(unsigned int hash) rcu_read_lock(); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) @@ -466,9 +487,11 @@ static noinline int early_drop(unsigned int hash) struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, + struct user_beancounter *ub, gfp_t gfp) { struct nf_conn *ct = NULL; + struct user_beancounter *old_ub; if (unlikely(!nf_conntrack_hash_rnd_initted)) { get_random_bytes(&nf_conntrack_hash_rnd, 4); @@ -476,25 +499,28 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, } /* We don't want any race condition at early drop stage */ - atomic_inc(&nf_conntrack_count); + atomic_inc(&ve_nf_conntrack_count); - if (nf_conntrack_max && - unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) { + if (ve_nf_conntrack_max && + unlikely(atomic_read(&ve_nf_conntrack_count) > + ve_nf_conntrack_max)) { unsigned int hash = hash_conntrack(orig); if (!early_drop(hash)) { - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); if (net_ratelimit()) - printk(KERN_WARNING - "nf_conntrack: table full, dropping" - " packet.\n"); + ve_printk(VE_LOG_BOTH, KERN_WARNING + "nf_conntrack: CT %d: table full, dropping" + " packet.\n", VEID(get_exec_env())); return ERR_PTR(-ENOMEM); } } + old_ub = set_exec_ub(ub); ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp); + (void)set_exec_ub(old_ub); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); return ERR_PTR(-ENOMEM); } 
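Note: the systematic nf_conntrack_hash -> ve_nf_conntrack_hash substitutions above (and the matching ve_nf_conntrack_count / ve_nf_conntrack_max uses) rely on macros, defined elsewhere in the patch, that expand either to a field of the current VE's private conntrack state or to the old global, depending on CONFIG_VE_IPTABLES. A compilable toy version of that indirection; struct per_ve_state, ve_conntrack_count and friends are illustrative names only.

#include <stdio.h>

#define CONFIG_VE_IPTABLES 1

struct per_ve_state { int conntrack_count; };
struct ve { struct per_ve_state *nf; };

static struct per_ve_state host_state;
static struct ve ve0 = { &host_state };
static struct ve *exec_env = &ve0;
static struct ve *get_exec_env(void) { return exec_env; }

#ifdef CONFIG_VE_IPTABLES
/* every reference resolves against the currently executing VE */
#define ve_conntrack_count (get_exec_env()->nf->conntrack_count)
#else
static int conntrack_count;
#define ve_conntrack_count conntrack_count
#endif

int main(void)
{
        struct per_ve_state cts = { 0 };
        struct ve ve200 = { &cts };

        ve_conntrack_count++;           /* accounted to the host   */
        exec_env = &ve200;
        ve_conntrack_count += 5;        /* accounted to VE 200     */
        printf("host=%d ve200=%d\n",
               host_state.conntrack_count, cts.conntrack_count);
        return 0;
}

The payoff is that hot-path code such as __nf_conntrack_find() stays textually identical to upstream except for the macro name, while each container transparently gets its own hash table and counters.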
@@ -504,6 +530,9 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); INIT_RCU_HEAD(&ct->rcu); +#ifdef CONFIG_VE_IPTABLES + ct->ct_owner_env = get_exec_env(); +#endif return ct; } @@ -512,10 +541,16 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); static void nf_conntrack_free_rcu(struct rcu_head *head) { struct nf_conn *ct = container_of(head, struct nf_conn, rcu); +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ve = set_exec_env(ct->ct_owner_env); +#endif nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); +#ifdef CONFIG_VE_IPTABLES + set_exec_env(ve); +#endif } void nf_conntrack_free(struct nf_conn *ct) @@ -538,13 +573,20 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_expect *exp; + struct user_beancounter *ub = NULL; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } - ct = nf_conntrack_alloc(tuple, &repl_tuple, GFP_ATOMIC); +#ifdef CONFIG_BEANCOUNTERS + if (skb->dev != NULL) /* received skb */ + ub = netdev_bc(skb->dev)->exec_ub; + else if (skb->sk != NULL) /* sent skb */ + ub = sock_bc(skb->sk)->ub; +#endif + ct = nf_conntrack_alloc(tuple, &repl_tuple, ub, GFP_ATOMIC); if (ct == NULL || IS_ERR(ct)) { pr_debug("Can't allocate conntrack.\n"); return (struct nf_conntrack_tuple_hash *)ct; @@ -593,7 +635,8 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, } /* Overload tuple linked list to put us in unconfirmed list. */ - hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed); + hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, + &ve_unconfirmed); spin_unlock_bh(&nf_conntrack_lock); @@ -947,13 +990,13 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spin_lock_bh(&nf_conntrack_lock); for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[*bucket], hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) goto found; } } - hlist_for_each_entry(h, n, &unconfirmed, hnode) { + hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) set_bit(IPS_DYING_BIT, &ct->status); @@ -1008,7 +1051,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush); supposed to kill the mall. */ void nf_conntrack_cleanup(void) { - rcu_assign_pointer(ip_ct_attach, NULL); + struct ve_struct *ve = get_exec_env(); + + if (ve_is_super(ve)) + rcu_assign_pointer(ip_ct_attach, NULL); /* This makes sure all current packets have passed through netfilter framework. 
Roll on, two-stage module @@ -1018,10 +1064,12 @@ void nf_conntrack_cleanup(void) nf_ct_event_cache_flush(); i_see_dead_people: nf_conntrack_flush(); - if (atomic_read(&nf_conntrack_count) != 0) { + if (atomic_read(&ve_nf_conntrack_count) != 0) { schedule(); goto i_see_dead_people; } + if (!ve_is_super(ve)) + goto skip_ct_cache; /* wait until all references to nf_conntrack_untracked are dropped */ while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) schedule(); @@ -1029,13 +1077,19 @@ void nf_conntrack_cleanup(void) rcu_assign_pointer(nf_ct_destroy, NULL); kmem_cache_destroy(nf_conntrack_cachep); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, - nf_conntrack_htable_size); +skip_ct_cache: + nf_ct_free_hashtable(ve_nf_conntrack_hash, ve_nf_conntrack_vmalloc, + nf_conntrack_htable_size); nf_conntrack_acct_fini(); nf_conntrack_expect_fini(); nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); + + nf_ct_proto_generic_sysctl_cleanup(); +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_nf_conntrack); +#endif } struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) @@ -1046,13 +1100,13 @@ struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) *vmalloced = 0; size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head)); - hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN, + hash = (void*)__get_free_pages(GFP_KERNEL_UBC|__GFP_NOWARN, get_order(sizeof(struct hlist_head) * size)); if (!hash) { *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct hlist_head) * size); + hash = ub_vmalloc(sizeof(struct hlist_head) * size); } if (hash) @@ -1094,8 +1148,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) */ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_conntrack_htable_size; i++) { - while (!hlist_empty(&nf_conntrack_hash[i])) { - h = hlist_entry(nf_conntrack_hash[i].first, + while (!hlist_empty(&ve_nf_conntrack_hash[i])) { + h = hlist_entry(ve_nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnode); hlist_del_rcu(&h->hnode); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); @@ -1103,12 +1157,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) } } old_size = nf_conntrack_htable_size; - old_vmalloced = nf_conntrack_vmalloc; - old_hash = nf_conntrack_hash; + old_vmalloced = ve_nf_conntrack_vmalloc; + old_hash = ve_nf_conntrack_hash; nf_conntrack_htable_size = hashsize; - nf_conntrack_vmalloc = vmalloced; - nf_conntrack_hash = hash; + ve_nf_conntrack_vmalloc = vmalloced; + ve_nf_conntrack_hash = hash; nf_conntrack_hash_rnd = rnd; spin_unlock_bh(&nf_conntrack_lock); @@ -1120,53 +1174,82 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -int __init nf_conntrack_init(void) +int nf_conntrack_init(void) { + struct ve_struct *ve = get_exec_env(); int max_factor = 8; - int ret; + int ret = 0, i; + + if (ve_is_super(ve)) { + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 16384; + if (nf_conntrack_htable_size < 32) + nf_conntrack_htable_size = 32; + + /* Use a max. 
factor of four by default to get the same + * max as with the old struct list_heads. When a table + * size is given we use the old value of 8 to avoid + * reducing the max. entries. */ + max_factor = 4; + } + nf_conntrack_max = max_factor * nf_conntrack_htable_size; - /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB - * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ - if (!nf_conntrack_htable_size) { - nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct hlist_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; - - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + printk("nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); } - nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, - &nf_conntrack_vmalloc); - if (!nf_conntrack_hash) { + +#ifdef CONFIG_VE_IPTABLES + ve->_nf_conntrack = kzalloc(sizeof(struct ve_nf_conntrack), GFP_KERNEL); + if (!ve->_nf_conntrack) { + ret = -ENOMEM; + goto out; + } + + ve_nf_conntrack_max = nf_conntrack_max; + ve_nf_conntrack_checksum = nf_conntrack_checksum; + ve_nf_ct_expect_max = nf_ct_expect_max; + atomic_set(&ve_nf_conntrack_count, 0); + INIT_HLIST_HEAD(&ve_unconfirmed); +#endif + ve_nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + &ve_nf_conntrack_vmalloc); + if (!ve_nf_conntrack_hash) { printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_out; } - nf_conntrack_max = max_factor * nf_conntrack_htable_size; - - printk("nf_conntrack version %s (%u buckets, %d max)\n", - NF_CONNTRACK_VERSION, nf_conntrack_htable_size, - nf_conntrack_max); - - nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + if (ve_is_super(ve)) { + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), - 0, 0, NULL); - if (!nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_free_hash; + 0, SLAB_UBC, NULL); + if (!nf_conntrack_cachep) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_free_hash; + } } - ret = nf_conntrack_proto_init(); + ret = nf_ct_proto_generic_sysctl_init(); if (ret < 0) goto err_free_conntrack_slab; + ret = nf_conntrack_proto_init(); + if (ret < 0) + goto err_generic_proto; + + /* Don't NEED lock here, but good form anyway. 
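Note: the reworked nf_conntrack_init() above keeps the upstream sizing heuristic but runs it only in the host VE: roughly 1/16384 of RAM is spent on the hash table, the bucket count is capped at 16384 on large machines and floored at 32, and containers reuse the resulting nf_conntrack_htable_size while allocating their own per-VE hash. A small standalone calculation of that heuristic; htable_size() is an illustrative helper and the exact cap condition here is approximate.

#include <stdio.h>

/* spend about 1/16384 of RAM on the table, then cap/floor the bucket count */
static unsigned int htable_size(unsigned long ram_bytes,
                                unsigned long bucket_bytes)
{
        unsigned int n = ram_bytes / 16384 / bucket_bytes;

        if (ram_bytes >= 1024UL * 1024 * 1024)  /* ">= 1GB" per the comment */
                n = 16384;
        if (n < 32)
                n = 32;
        return n;
}

int main(void)
{
        /* 512MB of RAM, 8-byte hlist_head: 512MB/16384/8 = 4096 buckets */
        printf("%u\n", htable_size(512UL << 20, 8));
        /* a 2GB machine hits the 16384-bucket cap */
        printf("%u\n", htable_size(2UL << 30, 8));
        return 0;
}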
*/ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < AF_MAX; i++) + ve_nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic; + spin_unlock_bh(&nf_conntrack_lock); + ret = nf_conntrack_expect_init(); if (ret < 0) goto out_fini_proto; @@ -1179,17 +1262,19 @@ int __init nf_conntrack_init(void) if (ret < 0) goto out_fini_helper; - /* For use by REJECT target */ - rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); - rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + if (ve_is_super(ve)) { + /* For use by REJECT target */ + rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); + rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + } - return ret; + return 0; out_fini_helper: nf_conntrack_helper_fini(); @@ -1197,11 +1282,18 @@ out_fini_expect: nf_conntrack_expect_fini(); out_fini_proto: nf_conntrack_proto_fini(); +err_generic_proto: + nf_ct_proto_generic_sysctl_cleanup(); err_free_conntrack_slab: - kmem_cache_destroy(nf_conntrack_cachep); + if (ve_is_super(ve)) + kmem_cache_destroy(nf_conntrack_cachep); err_free_hash: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(ve_nf_conntrack_hash, nf_conntrack_vmalloc, nf_conntrack_htable_size); err_out: - return -ENOMEM; +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_nf_conntrack); +out: +#endif + return ret; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 83c41ac..d0ddfb6 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -53,6 +53,9 @@ void nf_ct_deliver_cached_events(const struct nf_conn *ct) { struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(nf_conntrack_ecache); if (ecache->ct == ct) @@ -66,6 +69,9 @@ void __nf_ct_event_cache_init(struct nf_conn *ct) { struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + /* take care of delivering potentially old events */ ecache = &__get_cpu_var(nf_conntrack_ecache); BUG_ON(ecache->ct == ct); @@ -84,6 +90,9 @@ void nf_ct_event_cache_flush(void) struct nf_conntrack_ecache *ecache; int cpu; + if (!ve_is_super(get_exec_env())) + return; + for_each_possible_cpu(cpu) { ecache = &per_cpu(nf_conntrack_ecache, cpu); if (ecache->ct) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e8f0dea..88f3fa8 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -28,17 +28,26 @@ #include #include +#ifndef CONFIG_VE_IPTABLES struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); +#endif unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); static unsigned int nf_ct_expect_hash_rnd __read_mostly; -static unsigned int nf_ct_expect_count; unsigned int nf_ct_expect_max __read_mostly; static int nf_ct_expect_hash_rnd_initted __read_mostly; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_ct_expect_count (get_exec_env()->_nf_conntrack->_nf_ct_expect_count) +#define 
ve_nf_ct_expect_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_expect_vmalloc) +#else +static unsigned int nf_ct_expect_count; static int nf_ct_expect_vmalloc; +#define ve_nf_ct_expect_count nf_ct_expect_count +#define ve_nf_ct_expect_vmalloc nf_ct_expect_vmalloc +#endif static struct kmem_cache *nf_ct_expect_cachep __read_mostly; @@ -51,7 +60,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - nf_ct_expect_count--; + ve_nf_ct_expect_count--; hlist_del(&exp->lnode); master_help->expecting[exp->class]--; @@ -93,11 +102,11 @@ __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple) struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!ve_nf_ct_expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, n, &ve_nf_ct_expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) return i; } @@ -130,11 +139,11 @@ nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple) struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!ve_nf_ct_expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { exp = i; @@ -308,7 +317,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp) } EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) +void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); const struct nf_conntrack_expect_policy *p; @@ -319,8 +328,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - nf_ct_expect_count++; + hlist_add_head_rcu(&exp->hnode, &ve_nf_ct_expect_hash[h]); + ve_nf_ct_expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); @@ -331,6 +340,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) atomic_inc(&exp->use); NF_CT_STAT_INC(expect_create); } +EXPORT_SYMBOL_GPL(nf_ct_expect_insert); /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, @@ -383,7 +393,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. 
*/ if (refresh_timer(i)) { @@ -406,7 +416,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) } } - if (nf_ct_expect_count >= nf_ct_expect_max) { + if (ve_nf_ct_expect_count >= ve_nf_ct_expect_max) { if (net_ratelimit()) printk(KERN_WARNING "nf_conntrack: expectation table full\n"); @@ -434,7 +444,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); if (n) return n; } @@ -450,7 +460,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); } return head; } @@ -537,12 +547,13 @@ static const struct file_operations exp_file_ops = { }; #endif /* CONFIG_PROC_FS */ -static int __init exp_proc_init(void) +static int exp_proc_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; - proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops); + proc = proc_net_fops_create(get_exec_env()->ve_netns, + "nf_conntrack_expect", 0440, &exp_file_ops); if (!proc) return -ENOMEM; #endif /* CONFIG_PROC_FS */ @@ -552,13 +563,13 @@ static int __init exp_proc_init(void) static void exp_proc_remove(void) { #ifdef CONFIG_PROC_FS - proc_net_remove(&init_net, "nf_conntrack_expect"); + proc_net_remove(get_exec_env()->ve_netns, "nf_conntrack_expect"); #endif /* CONFIG_PROC_FS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); -int __init nf_conntrack_expect_init(void) +int nf_conntrack_expect_init(void) { int err = -ENOMEM; @@ -569,16 +580,20 @@ int __init nf_conntrack_expect_init(void) } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &nf_ct_expect_vmalloc); - if (nf_ct_expect_hash == NULL) + ve_nf_ct_expect_count = 0; + ve_nf_ct_expect_max = nf_ct_expect_max; + ve_nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, + &ve_nf_ct_expect_vmalloc); + if (ve_nf_ct_expect_hash == NULL) goto err1; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + if (ve_is_super(get_exec_env())) { + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), - 0, 0, NULL); - if (!nf_ct_expect_cachep) - goto err2; + 0, SLAB_UBC, NULL); + if (!nf_ct_expect_cachep) + goto err2; + } err = exp_proc_init(); if (err < 0) @@ -587,9 +602,10 @@ int __init nf_conntrack_expect_init(void) return 0; err3: - kmem_cache_destroy(nf_ct_expect_cachep); + if (ve_is_super(get_exec_env())) + kmem_cache_destroy(nf_ct_expect_cachep); err2: - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, nf_ct_expect_hsize); err1: return err; @@ -598,7 +614,8 @@ err1: void nf_conntrack_expect_fini(void) { exp_proc_remove(); - kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + if (ve_is_super(get_exec_env())) + kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8e0b4c8..a483342 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ 
b/net/netfilter/nf_conntrack_helper.c @@ -30,10 +30,17 @@ #include static DEFINE_MUTEX(nf_ct_helper_mutex); -static struct hlist_head *nf_ct_helper_hash __read_mostly; static unsigned int nf_ct_helper_hsize __read_mostly; static unsigned int nf_ct_helper_count __read_mostly; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_ct_helper_hash (get_exec_env()->_nf_conntrack->_nf_ct_helper_hash) +#define ve_nf_ct_helper_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_helper_vmalloc) +#else +static struct hlist_head *nf_ct_helper_hash __read_mostly; static int nf_ct_helper_vmalloc; +#define ve_nf_ct_helper_hash nf_ct_helper_hash +#define ve_nf_ct_helper_vmalloc nf_ct_helper_vmalloc +#endif /* Stupid hash, but collision free for the default registrations of the @@ -56,7 +63,7 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) return NULL; h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { + hlist_for_each_entry_rcu(helper, n, &ve_nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) return helper; } @@ -72,7 +79,7 @@ __nf_conntrack_helper_find_byname(const char *name) unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry_rcu(h, n, &ve_nf_ct_helper_hash[i], hnode) { if (!strcmp(h->name, name)) return h; } @@ -115,7 +122,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); mutex_lock(&nf_ct_helper_mutex); - hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); + hlist_add_head_rcu(&me->hnode, &ve_nf_ct_helper_hash[h]); nf_ct_helper_count++; mutex_unlock(&nf_ct_helper_mutex); @@ -145,7 +152,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], hnode) { + &ve_nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((help->helper == me || exp->helper == me) && del_timer(&exp->timeout)) { @@ -156,10 +163,10 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) } /* Get rid of expecteds, set helpers to NULL. 
*/ - hlist_for_each_entry(h, n, &unconfirmed, hnode) + hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[i], hnode) unhelp(h, me); } spin_unlock_bh(&nf_conntrack_lock); @@ -177,26 +184,29 @@ int nf_conntrack_helper_init(void) int err; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, - &nf_ct_helper_vmalloc); - if (!nf_ct_helper_hash) + ve_nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, + &ve_nf_ct_helper_vmalloc); + if (!ve_nf_ct_helper_hash) return -ENOMEM; - err = nf_ct_extend_register(&helper_extend); - if (err < 0) - goto err1; + if (ve_is_super(get_exec_env())) { + err = nf_ct_extend_register(&helper_extend); + if (err < 0) + goto err1; + } return 0; err1: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, + nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, nf_ct_helper_hsize); return err; } void nf_conntrack_helper_fini(void) { - nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, + if (ve_is_super(get_exec_env())) + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, nf_ct_helper_hsize); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a875203..a09d0e1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,8 @@ #include #include +#include +#include MODULE_LICENSE("GPL"); @@ -549,7 +552,8 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]], + hlist_for_each_entry_rcu(h, n, + &ve_nf_conntrack_hash[cb->args[0]], hnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -1118,14 +1122,15 @@ static int ctnetlink_create_conntrack(struct nlattr *cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, - struct nf_conn *master_ct) + struct nf_conn *master_ct, + struct user_beancounter *ub) { struct nf_conn *ct; int err = -EINVAL; struct nf_conn_help *help; struct nf_conntrack_helper *helper; - ct = nf_conntrack_alloc(otuple, rtuple, GFP_KERNEL); + ct = nf_conntrack_alloc(otuple, rtuple, ub, GFP_KERNEL); if (ct == NULL || IS_ERR(ct)) return -ENOMEM; @@ -1241,11 +1246,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_unlock_bh(&nf_conntrack_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) + if (nlh->nlmsg_flags & NLM_F_CREATE) { + struct user_beancounter *ub = NULL; + +#ifdef CONFIG_BEANCOUNTERS + if (skb->sk) + ub = sock_bc(skb->sk)->ub; +#endif err = ctnetlink_create_conntrack(cda, &otuple, &rtuple, - master_ct); + master_ct, + ub); + } if (err < 0 && master_ct) nf_ct_put(master_ct); @@ -1467,7 +1480,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]], + hlist_for_each_entry(exp, n, &ve_nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && 
exp->tuple.src.l3num != l3proto) continue; @@ -1613,7 +1626,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &ve_nf_ct_expect_hash[i], hnode) { m_help = nfct_help(exp->master); if (m_help->helper == h @@ -1629,7 +1642,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &ve_nf_ct_expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a49fc93..67c53a7 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -28,7 +28,7 @@ #include #include -static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); @@ -40,7 +40,8 @@ nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, struct ctl_table *table, unsigned int *users) { if (*header == NULL) { - *header = register_sysctl_paths(path, table); + *header = register_net_sysctl_table(get_exec_env()->ve_netns, + path, table); if (*header == NULL) return -ENOMEM; } @@ -56,7 +57,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, if (users != NULL && --*users > 0) return; - unregister_sysctl_table(*header); + unregister_net_sysctl_table(*header); *header = NULL; } #endif @@ -64,10 +65,10 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { - if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) - return &nf_conntrack_l4proto_generic; + if (unlikely(l3proto >= AF_MAX || ve_nf_ct_protos[l3proto] == NULL)) + return ve_nf_conntrack_l4proto_generic; - return rcu_dereference(nf_ct_protos[l3proto][l4proto]); + return rcu_dereference(ve_nf_ct_protos[l3proto][l4proto]); } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); @@ -81,7 +82,7 @@ nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto) rcu_read_lock(); p = __nf_ct_l4proto_find(l3proto, l4proto); if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; + p = ve_nf_conntrack_l4proto_generic; rcu_read_unlock(); return p; @@ -188,7 +189,8 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) return -EBUSY; mutex_lock(&nf_ct_proto_mutex); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + if (ve_nf_ct_l3protos[proto->l3proto] != + &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -197,7 +199,7 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) if (ret < 0) goto out_unlock; - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); + rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], proto); out_unlock: mutex_unlock(&nf_ct_proto_mutex); @@ -210,8 +212,8 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], + BUG_ON(ve_nf_ct_l3protos[proto->l3proto] != proto); + rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); 
nf_ct_l3proto_unregister_sysctl(proto); mutex_unlock(&nf_ct_proto_mutex); @@ -279,7 +281,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) return -EBUSY; mutex_lock(&nf_ct_proto_mutex); - if (!nf_ct_protos[l4proto->l3proto]) { + if (!ve_nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded latter. */ struct nf_conntrack_l4proto **proto_array; int i; @@ -293,10 +295,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; - nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != - &nf_conntrack_l4proto_generic) { + proto_array[i] = ve_nf_conntrack_l4proto_generic; + ve_nf_ct_protos[l4proto->l3proto] = proto_array; + } else if (ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != + ve_nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -305,7 +307,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) if (ret < 0) goto out_unlock; - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); out_unlock: @@ -319,9 +321,9 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], - &nf_conntrack_l4proto_generic); + BUG_ON(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); + rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + ve_nf_conntrack_l4proto_generic); nf_ct_l4proto_unregister_sysctl(l4proto); mutex_unlock(&nf_ct_proto_mutex); @@ -337,12 +339,12 @@ int nf_conntrack_proto_init(void) unsigned int i; int err; - err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + err = nf_ct_l4proto_register_sysctl(ve_nf_conntrack_l4proto_generic); if (err < 0) return err; for (i = 0; i < AF_MAX; i++) - rcu_assign_pointer(nf_ct_l3protos[i], + rcu_assign_pointer(ve_nf_ct_l3protos[i], &nf_conntrack_l3proto_generic); return 0; } @@ -351,9 +353,9 @@ void nf_conntrack_proto_fini(void) { unsigned int i; - nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(ve_nf_conntrack_l4proto_generic); /* free l3proto protocol tables */ for (i = 0; i < PF_MAX; i++) - kfree(nf_ct_protos[i]); + kfree(ve_nf_ct_protos[i]); } diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e31b0e7..24b0e29 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -48,7 +49,7 @@ static int packet(struct nf_conn *ct, int pf, unsigned int hooknum) { - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_generic_timeout); return NF_ACCEPT; } @@ -107,3 +108,64 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = #endif #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_generic_sysctl_init(void) +{ + struct nf_conntrack_l4proto *generic; + + if (ve_is_super(get_exec_env())) { + generic = &nf_conntrack_l4proto_generic; + goto out; + } + + generic = kmemdup(&nf_conntrack_l4proto_generic, + sizeof(struct 
nf_conntrack_l4proto), GFP_KERNEL); + if (generic == NULL) + goto no_mem_ct; + + generic->ctl_table_header = &ve_generic_sysctl_header; + generic->ctl_table = kmemdup(generic_sysctl_table, + sizeof(generic_sysctl_table), GFP_KERNEL); + if (generic->ctl_table == NULL) + goto no_mem_sys; + + generic->ctl_table[0].data = &ve_nf_ct_generic_timeout; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + generic->ctl_compat_table_header = ve_generic_compat_sysctl_header; + generic->ctl_compat_table = kmemdup(generic_compat_sysctl_table, + sizeof(generic_compat_sysctl_table), GFP_KERNEL); + if (generic->ctl_compat_table == NULL) + goto no_mem_compat; + generic->ctl_compat_table[0].data = &ve_nf_ct_generic_timeout; +#endif +out: + ve_nf_ct_generic_timeout = nf_ct_generic_timeout; + + ve_nf_conntrack_l4proto_generic = generic; + return 0; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +no_mem_compat: + kfree(generic->ctl_table); +#endif +no_mem_sys: + kfree(generic); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_init); + +void nf_ct_proto_generic_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(ve_nf_conntrack_l4proto_generic->ctl_compat_table); +#endif + kfree(ve_nf_conntrack_l4proto_generic->ctl_table); + + kfree(ve_nf_conntrack_l4proto_generic); + } +} +EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6f61261..844ae68 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -666,7 +667,7 @@ static bool tcp_in_window(const struct nf_conn *ct, } else { res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - nf_ct_tcp_be_liberal) + ve_nf_ct_tcp_be_liberal) res = true; if (!res && LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, @@ -957,15 +958,15 @@ static int tcp_packet(struct nf_conn *ct, && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && - tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) - timeout = nf_ct_tcp_timeout_max_retrans; + if (ct->proto.tcp.retrans >= ve_nf_ct_tcp_max_retrans && + tcp_timeouts[new_state] > ve_nf_ct_tcp_timeout_max_retrans) + timeout = ve_nf_ct_tcp_timeout_max_retrans; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && - tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) - timeout = nf_ct_tcp_timeout_unacknowledged; + tcp_timeouts[new_state] > ve_nf_ct_tcp_timeout_unacknowledged) + timeout = ve_nf_ct_tcp_timeout_unacknowledged; else - timeout = tcp_timeouts[new_state]; + timeout = ve_nf_ct_tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); @@ -1033,7 +1034,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); ct->proto.tcp.seen[1].flags = 0; - } else if (nf_ct_tcp_loose == 0) { + } else if (ve_nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. 
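Note: nf_ct_proto_generic_sysctl_init() above and the TCP/UDP variants that follow all use one recipe: kmemdup() the template l4proto and its ctl_table, then repoint each entry's .data at the per-VE variables so every container gets its own tunables seeded from the current global values. A minimal userspace sketch of that duplicate-and-repoint pattern; struct ctl_entry, instance_sysctl_init() and the field names are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl_entry { const char *name; int *data; };

static int global_timeout = 60;
static struct ctl_entry template_tbl[] = {
        { "timeout", &global_timeout },
        { NULL, NULL },
};

struct instance { int timeout; struct ctl_entry *tbl; };

/* duplicate the template and point it at this instance's storage,
 * mirroring the kmemdup() + ctl_table[i].data = &ve_... idiom */
static int instance_sysctl_init(struct instance *inst)
{
        inst->tbl = malloc(sizeof(template_tbl));
        if (inst->tbl == NULL)
                return -1;
        memcpy(inst->tbl, template_tbl, sizeof(template_tbl));

        inst->timeout = global_timeout;         /* inherit the current default */
        inst->tbl[0].data = &inst->timeout;     /* repoint at per-instance data */
        return 0;
}

int main(void)
{
        struct instance ve = { 0, NULL };

        if (instance_sysctl_init(&ve))
                return 1;
        *ve.tbl[0].data = 30;                   /* a write via the dup'd table */
        printf("ve=%d global=%d\n", ve.timeout, global_timeout);
        free(ve.tbl);
        return 0;
}

The host VE keeps using the static template directly, which is why the corresponding cleanup functions free the duplicated tables only when !ve_is_super(get_exec_env()).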
*/ return false; } else { @@ -1435,3 +1436,117 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = #endif }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_tcp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *tcp4, *tcp6; + + if (ve_is_super(get_exec_env())) { + tcp4 = &nf_conntrack_l4proto_tcp4; + tcp6 = &nf_conntrack_l4proto_tcp6; + goto out; + } + + tcp4 = kmemdup(&nf_conntrack_l4proto_tcp4, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (tcp4 == NULL) + goto no_mem_ct4; + + tcp4->ctl_table_users = &ve_tcp_sysctl_table_users; + tcp4->ctl_table_header = &ve_tcp_sysctl_header; + tcp4->ctl_table = kmemdup(tcp_sysctl_table, + sizeof(tcp_sysctl_table), GFP_KERNEL); + if (tcp4->ctl_table == NULL) + goto no_mem_sys; + + tcp4->ctl_table[0].data = &ve_nf_ct_tcp_timeouts[1]; + tcp4->ctl_table[1].data = &ve_nf_ct_tcp_timeouts[2]; + tcp4->ctl_table[2].data = &ve_nf_ct_tcp_timeouts[3]; + tcp4->ctl_table[3].data = &ve_nf_ct_tcp_timeouts[4]; + tcp4->ctl_table[4].data = &ve_nf_ct_tcp_timeouts[5]; + tcp4->ctl_table[5].data = &ve_nf_ct_tcp_timeouts[6]; + tcp4->ctl_table[6].data = &ve_nf_ct_tcp_timeouts[7]; + tcp4->ctl_table[7].data = &ve_nf_ct_tcp_timeouts[8]; + tcp4->ctl_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; + tcp4->ctl_table[9].data = &ve_nf_ct_tcp_timeout_unacknowledged; + tcp4->ctl_table[10].data = &ve_nf_ct_tcp_loose; + tcp4->ctl_table[11].data = &ve_nf_ct_tcp_be_liberal; + tcp4->ctl_table[12].data = &ve_nf_ct_tcp_max_retrans; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + tcp4->ctl_compat_table_header = ve_tcp_compat_sysctl_header; + tcp4->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, + sizeof(tcp_compat_sysctl_table), GFP_KERNEL); + if (tcp4->ctl_compat_table == NULL) + goto no_mem_compat; + + tcp4->ctl_compat_table[0].data = &ve_nf_ct_tcp_timeouts[1]; + tcp4->ctl_compat_table[1].data = &ve_nf_ct_tcp_timeouts[2]; + tcp4->ctl_compat_table[2].data = &ve_nf_ct_tcp_timeouts[3]; + tcp4->ctl_compat_table[3].data = &ve_nf_ct_tcp_timeouts[4]; + tcp4->ctl_compat_table[4].data = &ve_nf_ct_tcp_timeouts[5]; + tcp4->ctl_compat_table[5].data = &ve_nf_ct_tcp_timeouts[6]; + tcp4->ctl_compat_table[6].data = &ve_nf_ct_tcp_timeouts[7]; + tcp4->ctl_compat_table[7].data = &ve_nf_ct_tcp_timeouts[8]; + tcp4->ctl_compat_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; + tcp4->ctl_compat_table[9].data = &ve_nf_ct_tcp_loose; + tcp4->ctl_compat_table[10].data = &ve_nf_ct_tcp_be_liberal; + tcp4->ctl_compat_table[11].data = &ve_nf_ct_tcp_max_retrans; +#endif + + tcp6 = kmemdup(&nf_conntrack_l4proto_tcp6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!tcp6) + goto no_mem_ct6; + + tcp6->ctl_table_users = &ve_tcp_sysctl_table_users; + tcp6->ctl_table_header = &ve_tcp_sysctl_header; + tcp6->ctl_table = tcp4->ctl_table; +out: + ve_nf_ct_tcp_timeouts[1] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; + ve_nf_ct_tcp_timeouts[2] = tcp_timeouts[TCP_CONNTRACK_SYN_RECV]; + ve_nf_ct_tcp_timeouts[3] = tcp_timeouts[TCP_CONNTRACK_ESTABLISHED]; + ve_nf_ct_tcp_timeouts[4] = tcp_timeouts[TCP_CONNTRACK_FIN_WAIT]; + ve_nf_ct_tcp_timeouts[5] = tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT]; + ve_nf_ct_tcp_timeouts[6] = tcp_timeouts[TCP_CONNTRACK_LAST_ACK]; + ve_nf_ct_tcp_timeouts[7] = tcp_timeouts[TCP_CONNTRACK_TIME_WAIT]; + ve_nf_ct_tcp_timeouts[8] = tcp_timeouts[TCP_CONNTRACK_CLOSE]; + ve_nf_ct_tcp_timeout_max_retrans = nf_ct_tcp_timeout_max_retrans; + ve_nf_ct_tcp_timeout_unacknowledged = 
nf_ct_tcp_timeout_unacknowledged; + ve_nf_ct_tcp_loose = nf_ct_tcp_loose; + ve_nf_ct_tcp_be_liberal = nf_ct_tcp_be_liberal; + ve_nf_ct_tcp_max_retrans = nf_ct_tcp_max_retrans; + + ve_nf_conntrack_l4proto_tcp4 = tcp4; + ve_nf_conntrack_l4proto_tcp6 = tcp6; + return 0; + +no_mem_ct6: +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(tcp4->ctl_compat_table); +no_mem_compat: +#endif + kfree(tcp4->ctl_table); +no_mem_sys: + kfree(tcp4); +no_mem_ct4: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_init); + +void nf_ct_proto_tcp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(ve_nf_conntrack_l4proto_tcp4->ctl_compat_table); +#endif + kfree(ve_nf_conntrack_l4proto_tcp4->ctl_table); + kfree(ve_nf_conntrack_l4proto_tcp4); + + kfree(ve_nf_conntrack_l4proto_tcp6); + } +} +EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8b21762..b01823e 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -72,12 +73,13 @@ static int udp_packet(struct nf_conn *ct, /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); + nf_ct_refresh_acct(ct, ctinfo, skb, + ve_nf_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_STATUS, skb); } else - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_udp_timeout); return NF_ACCEPT; } @@ -229,3 +231,85 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = #endif }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_udp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *udp4, *udp6; + + if (ve_is_super(get_exec_env())) { + udp4 = &nf_conntrack_l4proto_udp4; + udp6 = &nf_conntrack_l4proto_udp6; + goto out; + } + + udp4 = kmemdup(&nf_conntrack_l4proto_udp4, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (udp4 == NULL) + goto no_mem_ct4; + + udp4->ctl_table_users = &ve_udp_sysctl_table_users; + udp4->ctl_table_header = &ve_udp_sysctl_header; + udp4->ctl_table = kmemdup(udp_sysctl_table, + sizeof(udp_sysctl_table), GFP_KERNEL); + if (udp4->ctl_table == NULL) + goto no_mem_sys; + udp4->ctl_table[0].data = &ve_nf_ct_udp_timeout; + udp4->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + udp4->ctl_compat_table_header = ve_udp_compat_sysctl_header; + udp4->ctl_compat_table = kmemdup(udp_compat_sysctl_table, + sizeof(udp_compat_sysctl_table), GFP_KERNEL); + if (udp4->ctl_compat_table == NULL) + goto no_mem_compat; + udp4->ctl_compat_table[0].data = &ve_nf_ct_udp_timeout; + udp4->ctl_compat_table[1].data = &ve_nf_ct_udp_timeout_stream; +#endif + + udp6 = kmemdup(&nf_conntrack_l4proto_udp6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!udp6) + goto no_mem_ct6; + + udp6->ctl_table_users = &ve_udp_sysctl_table_users; + udp6->ctl_table_header = &ve_udp_sysctl_header; + udp6->ctl_table = udp4->ctl_table; + + udp6->ctl_table[0].data = &ve_nf_ct_udp_timeout; + udp6->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; 
+out: + ve_nf_ct_udp_timeout = nf_ct_udp_timeout; + ve_nf_ct_udp_timeout_stream = nf_ct_udp_timeout_stream; + + ve_nf_conntrack_l4proto_udp4 = udp4; + ve_nf_conntrack_l4proto_udp6 = udp6; + return 0; + +no_mem_ct6: +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(udp4->ctl_compat_table); +no_mem_compat: +#endif + kfree(udp4->ctl_table); +no_mem_sys: + kfree(udp4); +no_mem_ct4: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_init); + +void nf_ct_proto_udp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + kfree(ve_nf_conntrack_l4proto_udp4->ctl_compat_table); +#endif + kfree(ve_nf_conntrack_l4proto_udp4->ctl_table); + kfree(ve_nf_conntrack_l4proto_udp4); + + kfree(ve_nf_conntrack_l4proto_udp6); + } +} +EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 8509db1..8951637 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #ifdef CONFIG_SYSCTL #include #endif +#include #include #include @@ -29,6 +31,10 @@ MODULE_LICENSE("GPL"); +int ip_conntrack_disable_ve0 = 0; +module_param(ip_conntrack_disable_ve0, int, 0440); +EXPORT_SYMBOL(ip_conntrack_disable_ve0); + #ifdef CONFIG_PROC_FS int print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, @@ -51,7 +57,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); if (n) return n; } @@ -67,7 +73,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); } return head; } @@ -226,7 +232,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -280,27 +286,30 @@ static const struct file_operations ct_cpu_seq_fops = { static int nf_conntrack_standalone_init_proc(void) { struct proc_dir_entry *pde; + struct net *net = get_exec_env()->ve_netns; - pde = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops); + pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); if (!pde) goto out_nf_conntrack; - pde = proc_create("nf_conntrack", S_IRUGO, init_net.proc_net_stat, + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, &ct_cpu_seq_fops); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: - proc_net_remove(&init_net, "nf_conntrack"); + proc_net_remove(net, "nf_conntrack"); out_nf_conntrack: return -ENOMEM; } static void nf_conntrack_standalone_fini_proc(void) { - remove_proc_entry("nf_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "nf_conntrack"); + struct net *net = get_exec_env()->ve_netns; + + remove_proc_entry("nf_conntrack", net->proc_net_stat); + proc_net_remove(net, "nf_conntrack"); } #else static int 
nf_conntrack_standalone_init_proc(void) @@ -323,8 +332,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_checksum); static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +#if ! (defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)) static struct ctl_table_header *nf_ct_sysctl_header; static struct ctl_table_header *nf_ct_netfilter_header; +#endif static ctl_table nf_ct_sysctl_table[] = { { @@ -404,21 +415,54 @@ EXPORT_SYMBOL_GPL(nf_ct_log_invalid); static int nf_conntrack_standalone_init_sysctl(void) { - nf_ct_netfilter_header = - register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); - if (!nf_ct_netfilter_header) - goto out; + struct ctl_table *nf_table, *ct_table; + struct net *net = get_exec_env()->ve_netns; + + nf_table = nf_ct_netfilter_table; + ct_table = nf_ct_sysctl_table; + + if (!ve_is_super(get_exec_env())) { + nf_table = kmemdup(nf_table, sizeof(nf_ct_netfilter_table), + GFP_KERNEL); + if (nf_table == NULL) + goto out; + + ct_table = kmemdup(ct_table, sizeof(nf_ct_sysctl_table), + GFP_KERNEL); + if (ct_table == NULL) + goto err_ctt; + } - nf_ct_sysctl_header = - register_sysctl_paths(nf_net_netfilter_sysctl_path, - nf_ct_sysctl_table); - if (!nf_ct_sysctl_header) - goto out_unregister_netfilter; + nf_table[0].data = &ve_nf_conntrack_max; + ct_table[0].data = &ve_nf_conntrack_max; + ct_table[1].data = &ve_nf_conntrack_count; + /* nf_conntrack_htable_size is shared and readonly */ + ct_table[3].data = &ve_nf_conntrack_checksum; + ct_table[4].data = &ve_nf_ct_log_invalid; + ct_table[5].data = &ve_nf_ct_expect_max; + + ve_nf_ct_netfilter_header = register_net_sysctl_table(net, + nf_ct_path, nf_table); + if (!ve_nf_ct_netfilter_header) + goto err_reg_nf_table; + + ve_nf_ct_sysctl_header = + register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, + ct_table); + if (!ve_nf_ct_sysctl_header) + goto err_reg_ct_table; return 0; -out_unregister_netfilter: - unregister_sysctl_table(nf_ct_netfilter_header); +err_reg_ct_table: + unregister_net_sysctl_table(ve_nf_ct_netfilter_header); +err_reg_nf_table: + if (ct_table != nf_ct_sysctl_table) + kfree(ct_table); +err_ctt: + if (nf_table != nf_ct_netfilter_table) + kfree(nf_table); out: printk("nf_conntrack: can't register to sysctl.\n"); return -ENOMEM; @@ -426,8 +470,16 @@ out: static void nf_conntrack_standalone_fini_sysctl(void) { - unregister_sysctl_table(nf_ct_netfilter_header); - unregister_sysctl_table(nf_ct_sysctl_header); + struct ctl_table *table = ve_nf_ct_sysctl_header->ctl_table_arg; + struct ctl_table *table2 = ve_nf_ct_netfilter_header->ctl_table_arg; + + unregister_net_sysctl_table(ve_nf_ct_sysctl_header); + unregister_net_sysctl_table(ve_nf_ct_netfilter_header); + + if (!ve_is_super(get_exec_env())) { + kfree(table); + kfree(table2); + } } #else static int nf_conntrack_standalone_init_sysctl(void) @@ -440,7 +492,7 @@ static void nf_conntrack_standalone_fini_sysctl(void) } #endif /* CONFIG_SYSCTL */ -static int __init nf_conntrack_standalone_init(void) +static int nf_conntrack_init_ve(void) { int ret; @@ -453,8 +505,19 @@ static int __init nf_conntrack_standalone_init(void) ret = nf_conntrack_standalone_init_sysctl(); if (ret < 0) goto out_sysctl; + ret = nf_ct_proto_tcp_sysctl_init(); + if (ret < 0) + goto out_tcp_sysctl; + ret = nf_ct_proto_udp_sysctl_init(); + if (ret < 0) + goto out_udp_sysctl; + return 0; +out_udp_sysctl: + nf_ct_proto_tcp_sysctl_cleanup(); +out_tcp_sysctl: + nf_conntrack_standalone_fini_sysctl(); out_sysctl: nf_conntrack_standalone_fini_proc(); out_proc: @@ -463,13 +526,36 @@ out: 
return ret; } -static void __exit nf_conntrack_standalone_fini(void) +static void nf_conntrack_cleanup_ve(void) { + nf_ct_proto_udp_sysctl_cleanup(); + nf_ct_proto_tcp_sysctl_cleanup(); nf_conntrack_standalone_fini_sysctl(); nf_conntrack_standalone_fini_proc(); nf_conntrack_cleanup(); } +static int __init nf_conntrack_standalone_init(void) +{ +#ifdef CONFIG_VE_IPTABLES + KSYMRESOLVE(nf_conntrack_init_ve); + KSYMRESOLVE(nf_conntrack_cleanup_ve); + KSYMMODRESOLVE(nf_conntrack); +#endif + + return nf_conntrack_init_ve(); +} + +static void __exit nf_conntrack_standalone_fini(void) +{ +#ifdef CONFIG_VE_IPTABLES + KSYMMODUNRESOLVE(nf_conntrack); + KSYMUNRESOLVE(nf_conntrack_init_ve); + KSYMUNRESOLVE(nf_conntrack_cleanup_ve); +#endif + nf_conntrack_cleanup_ve(); +} + module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini); diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 0148968..aa01c54 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -65,9 +65,6 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf, { struct nf_sockopt_ops *ops; - if (!net_eq(sock_net(sk), &init_net)) - return ERR_PTR(-ENOPROTOOPT); - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return ERR_PTR(-EINTR); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index b75c9c4..04491ab 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -124,7 +124,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; /* All the messages must at least contain nfgenmsg */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8c86011..d3ad11e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -555,9 +555,6 @@ nfqnl_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev->ifindex); @@ -586,8 +583,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this, struct hlist_head *head = &instance_table[i]; hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((n->net == &init_net) && - (n->pid == inst->peer_pid)) + if (n->pid == inst->peer_pid) __instance_destroy(inst); } } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 5d75cd8..7ffe66a 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -24,6 +24,8 @@ #include #include +#include + #include #include @@ -64,6 +66,46 @@ static const char *const xt_prefix[NPROTO] = { [NF_ARP] = "arp", }; +#ifdef CONFIG_BEANCOUNTERS +static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info) +{ + struct user_beancounter *ub; + + for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent); + return ub; +} + +static void uncharge_xtables(struct xt_table_info *info, unsigned long size) +{ + struct user_beancounter *ub; + + ub = xt_table_ub(info); + uncharge_beancounter(ub, UB_NUMXTENT, size); +} + +static int recharge_xtables(int check_ub, + struct xt_table_info *new, struct xt_table_info *old) +{ + struct user_beancounter *ub; + long change; + + ub = xt_table_ub(new); + BUG_ON(check_ub && ub != xt_table_ub(old)); + + change = (long)new->number - (long)old->number; + if (change > 0) { + if 
(charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT)) + return -ENOMEM; + } else if (change < 0) + uncharge_beancounter(ub, UB_NUMXTENT, -change); + + return 0; +} +#else +#define recharge_xtables(c, new, old) (0) +#define uncharge_xtables(info, s) do { } while (0) +#endif /* CONFIG_BEANCOUNTERS */ + /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) @@ -312,23 +354,23 @@ int xt_check_match(const struct xt_match *match, unsigned short family, unsigned short proto, int inv_proto) { if (XT_ALIGN(match->matchsize) != size) { - printk("%s_tables: %s match: invalid size %Zu != %u\n", + ve_printk(VE_LOG, "%s_tables: %s match: invalid size %Zu != %u\n", xt_prefix[family], match->name, XT_ALIGN(match->matchsize), size); return -EINVAL; } if (match->table && strcmp(match->table, table)) { - printk("%s_tables: %s match: only valid in %s table, not %s\n", + ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[family], match->name, match->table, table); return -EINVAL; } if (match->hooks && (hook_mask & ~match->hooks) != 0) { - printk("%s_tables: %s match: bad hook_mask %u/%u\n", + ve_printk(VE_LOG, "%s_tables: %s match: bad hook_mask %u/%u\n", xt_prefix[family], match->name, hook_mask, match->hooks); return -EINVAL; } if (match->proto && (match->proto != proto || inv_proto)) { - printk("%s_tables: %s match: only valid for protocol %u\n", + ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n", xt_prefix[family], match->name, match->proto); return -EINVAL; } @@ -453,24 +495,24 @@ int xt_check_target(const struct xt_target *target, unsigned short family, unsigned short proto, int inv_proto) { if (XT_ALIGN(target->targetsize) != size) { - printk("%s_tables: %s target: invalid size %Zu != %u\n", + ve_printk(VE_LOG, "%s_tables: %s target: invalid size %Zu != %u\n", xt_prefix[family], target->name, XT_ALIGN(target->targetsize), size); return -EINVAL; } if (target->table && strcmp(target->table, table)) { - printk("%s_tables: %s target: only valid in %s table, not %s\n", + ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n", xt_prefix[family], target->name, target->table, table); return -EINVAL; } if (target->hooks && (hook_mask & ~target->hooks) != 0) { - printk("%s_tables: %s target: bad hook_mask %u/%u\n", + ve_printk(VE_LOG, "%s_tables: %s target: bad hook_mask %u/%u\n", xt_prefix[family], target->name, hook_mask, target->hooks); return -EINVAL; } if (target->proto && (target->proto != proto || inv_proto)) { - printk("%s_tables: %s target: only valid for protocol %u\n", + ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n", xt_prefix[family], target->name, target->proto); return -EINVAL; } @@ -550,19 +592,19 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC); if (!newinfo) return NULL; - newinfo->size = size; + newinfo->alloc_size = newinfo->size = size; for_each_possible_cpu(cpu) { if (size <= PAGE_SIZE) newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, + GFP_KERNEL_UBC, cpu_to_node(cpu)); else - newinfo->entries[cpu] = vmalloc_node(size, + newinfo->entries[cpu] = ub_vmalloc_node(size, cpu_to_node(cpu)); if (newinfo->entries[cpu] == NULL) { @@ -580,7 +622,7 @@ void xt_free_table_info(struct xt_table_info *info) int cpu; for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) + if 
(info->alloc_size <= PAGE_SIZE) kfree(info->entries[cpu]); else vfree(info->entries[cpu]); @@ -645,6 +687,13 @@ xt_replace_table(struct xt_table *table, return NULL; } oldinfo = private; + + if (recharge_xtables(num_counters != 0, newinfo, oldinfo)) { + write_unlock_bh(&table->lock); + *error = -ENOMEM; + return NULL; + } + table->private = newinfo; newinfo->initial_entries = oldinfo->initial_entries; write_unlock_bh(&table->lock); @@ -714,6 +763,7 @@ void *xt_unregister_table(struct xt_table *table) list_del(&table->list); mutex_unlock(&xt[table->af].mutex); kfree(table); + uncharge_xtables(private, private->number); return private; } diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 97efd74..d0453de 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -73,7 +73,7 @@ dscp_tg_check(const char *tablename, const void *e_void, const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp; if (dscp > XT_DSCP_MAX) { - printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", dscp); return false; } return true; diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index f9ce20b..030ba07 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -80,7 +80,7 @@ mark_tg_check_v0(const char *tablename, const void *entry, const struct xt_mark_target_info *markinfo = targinfo; if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); return false; } return true; @@ -96,12 +96,12 @@ mark_tg_check_v1(const char *tablename, const void *entry, if (markinfo->mode != XT_MARK_SET && markinfo->mode != XT_MARK_AND && markinfo->mode != XT_MARK_OR) { - printk(KERN_WARNING "MARK: unknown mode %u\n", + ve_printk(VE_LOG, KERN_WARNING "MARK: unknown mode %u\n", markinfo->mode); return false; } if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); return false; } return true; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index beb5094..dbd407b 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, badly. 
--RR */ if (tcplen != tcph->doff*4) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", skb->len); return -1; } @@ -75,7 +75,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (info->mss == XT_TCPMSS_CLAMP_PMTU) { if (dst_mtu(skb->dst) <= minlen) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: " + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: " "unknown or invalid path-MTU (%u)\n", dst_mtu(skb->dst)); return -1; @@ -253,13 +253,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry, (hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IPT_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } @@ -276,13 +276,13 @@ tcpmss_tg6_check(const char *tablename, const void *entry, (hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IP6T_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } #endif diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d9418a2..8a5736a 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); /* need to declare this at the top */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4) +#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6) +#else static struct proc_dir_entry *hashlimit_procdir4; static struct proc_dir_entry *hashlimit_procdir6; +#endif static const struct file_operations dl_file_ops; /* hash table crap */ @@ -99,9 +105,16 @@ struct xt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables) +#else static HLIST_HEAD(hashlimit_htables); +#endif static struct kmem_cache *hashlimit_cachep __read_mostly; +static int init_xt_hashlimit(void); +static void fini_xt_hashlimit(void); + static inline bool dst_cmp(const struct dsthash_ent *ent, const struct dsthash_dst *b) { @@ -700,6 +713,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, if (r->name[sizeof(r->name) - 1] != '\0') return false; + if (init_xt_hashlimit()) + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since checkentry() is called before x_tables.c grabs xt_mutex. 
* We also cannot grab the hashtable spinlock, since htable_create will @@ -746,6 +762,9 @@ hashlimit_mt_check(const char *tablename, const void *inf, return false; } + if (init_xt_hashlimit()) + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since checkentry() is called before x_tables.c grabs xt_mutex. * We also cannot grab the hashtable spinlock, since htable_create will @@ -768,6 +787,8 @@ hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo) const struct xt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); + if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) + fini_xt_hashlimit(); } static void @@ -776,6 +797,8 @@ hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo) const struct xt_hashlimit_mtinfo1 *info = matchinfo; htable_put(info->hinfo); + if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) + fini_xt_hashlimit(); } #ifdef CONFIG_COMPAT @@ -978,6 +1001,78 @@ static const struct file_operations dl_file_ops = { .release = seq_release }; +static inline struct proc_dir_entry *proc_from_netns(void) +{ +#if defined(CONFIG_VE) + return get_exec_env()->ve_netns->proc_net; +#else + return init_net.proc_net; +#endif +} + +static int init_xt_hashlimit(void) +{ + struct proc_dir_entry *proc_net = proc_from_netns(); + +#if defined(CONFIG_VE_IPTABLES) + struct ve_struct *ve = get_exec_env(); + + if (ve->_xt_hashlimit) + return 0; + + ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL); + if (!ve->_xt_hashlimit) + goto err1; +#endif + INIT_HLIST_HEAD(&hashlimit_htables); + + hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir4) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err2; + } +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net); + if (!hashlimit_procdir6) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err3; + } +#endif + + return 0; + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +err3: + remove_proc_entry("ipt_hashlimit", proc_net); +#endif +err2: +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +err1: +#endif + return -ENOMEM; +} + +static void fini_xt_hashlimit(void) +{ + struct proc_dir_entry *proc_net = proc_from_netns(); +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ve = get_exec_env(); +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + remove_proc_entry("ip6t_hashlimit", proc_net); +#endif + remove_proc_entry("ipt_hashlimit", proc_net); + +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +#endif +} + static int __init hashlimit_mt_init(void) { int err; @@ -995,24 +1090,11 @@ static int __init hashlimit_mt_init(void) printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); goto err2; } - hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net); - if (!hashlimit_procdir4) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); + err = init_xt_hashlimit(); + if (err) goto err3; - } - err = 0; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net); - if (!hashlimit_procdir6) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); - err = -ENOMEM; - } -#endif if (!err) 
return 0; - remove_proc_entry("ipt_hashlimit", init_net.proc_net); err3: kmem_cache_destroy(hashlimit_cachep); err2: @@ -1024,10 +1106,7 @@ err1: static void __exit hashlimit_mt_exit(void) { - remove_proc_entry("ipt_hashlimit", init_net.proc_net); -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - remove_proc_entry("ip6t_hashlimit", init_net.proc_net); -#endif + fini_xt_hashlimit(); kmem_cache_destroy(hashlimit_cachep); xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); } diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index aad9ab8..91570c7 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -105,7 +105,7 @@ limit_mt_check(const char *tablename, const void *inf, /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in xt_limit, try lower: %u/%u\n", + ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n", r->avg, r->burst); return false; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b0eacc0..7c9c394 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -60,29 +60,14 @@ #include #include #include +#include + +#include +#include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - struct module *module; -}; - #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 @@ -402,6 +387,8 @@ static int __netlink_create(struct net *net, struct socket *sock, sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); if (!sk) return -ENOMEM; + if (ub_other_sock_charge(sk)) + goto out_free; sock_init_data(sock, sk); @@ -417,6 +404,10 @@ static int __netlink_create(struct net *net, struct socket *sock, sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; + +out_free: + sk_free(sk); + return -ENOMEM; } static int netlink_create(struct net *net, struct socket *sock, int protocol) @@ -523,7 +514,7 @@ static int netlink_autobind(struct socket *sock) struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->tgid; + s32 pid = task_tgid_vnr(current); int err; static s32 rover = -4097; @@ -559,7 +550,7 @@ retry: static inline int netlink_capable(struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); + capable(CAP_VE_NET_ADMIN); } static void @@ -764,12 +755,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; + unsigned long chargesize; + int no_ubc; nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + chargesize = skb_charge_fullsize(skb); + no_ubc = ub_sock_getwres_other(sk, chargesize); + if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); + + if (!no_ubc) + ub_sock_retwres_other(sk, chargesize, + SOCK_MIN_UBCSPACE_CH); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); @@ -781,13 +780,20 @@ int 
netlink_attachskb(struct sock *sk, struct sk_buff *skb, __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); + /* this if can't be moved upper because ub_sock_snd_queue_add() + * may change task state to TASK_RUNNING */ + if (no_ubc) + ub_sock_sndqueueadd_other(sk, chargesize); + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(0, &nlk->state) || no_ubc) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); + if (no_ubc) + ub_sock_sndqueuedel(sk); sock_put(sk); if (signal_pending(current)) { @@ -797,6 +803,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, return 1; } skb_set_owner_r(skb, sk); + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); return 0; } @@ -962,6 +969,9 @@ static inline int do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) goto out; + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) + goto out; + if (!net_eq(sock_net(sk), p->net)) goto out; @@ -1531,6 +1541,10 @@ static int netlink_dump(struct sock *sk) skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); if (!skb) goto errout; + if (ub_nlrcvbuf_charge(skb, sk) < 0) { + kfree_skb(skb); + return -EACCES; + } mutex_lock(nlk->cb_mutex); diff --git a/net/netlink/attr.c b/net/netlink/attr.c index 2d106cf..d9846a4 100644 --- a/net/netlink/attr.c +++ b/net/netlink/attr.c @@ -164,7 +164,7 @@ int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, } if (unlikely(rem > 0)) - printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing " "attributes.\n", rem); err = 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 3e1191c..f5c0578 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -437,7 +437,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c718e7e..7fcd47b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -78,6 +78,8 @@ #include #include +#include + #ifdef CONFIG_INET #include #endif @@ -493,6 +495,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet if (dev_net(dev) != sock_net(sk)) goto drop; + skb_orphan(skb); + skb->dev = dev; if (dev->header_ops) { @@ -556,6 +560,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet if (pskb_trim(skb, snaplen)) goto drop_n_acct; + if (ub_sockrcvbuf_charge(sk, skb)) + goto drop_n_acct; + skb_set_owner_r(skb, sk); skb->dev = NULL; dst_release(skb->dst); @@ -615,6 +622,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe if (dev_net(dev) != sock_net(sk)) goto drop; + skb_orphan(skb); + if (dev->header_ops) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); @@ -664,6 +673,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe snaplen = 0; } + if (copy_skb && + ub_sockrcvbuf_charge(sk, copy_skb)) { + spin_lock(&sk->sk_receive_queue.lock); + goto ring_is_full; + } + spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); if (!h.raw) @@ -1049,6 
+1064,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; + if (ub_other_sock_charge(sk)) + goto out_free; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) @@ -1086,6 +1103,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) sk_add_node(sk, &net->packet.sklist); write_unlock_bh(&net->packet.sklist_lock); return(0); + +out_free: + sk_free(sk); out: return err; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 8b06fa9..9a20704 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -906,8 +906,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) if (cl->deficit <= 0) { q->active[prio] = cl; - cl = cl->next_alive; cl->deficit += cl->quantum; + cl = cl->next_alive; } return skb; @@ -1080,17 +1080,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + long mtu; /* BUGGGG... Beware! This expression suffer of arithmetic overflows! */ if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); - cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; + cl->quantum = (cl->weight * cl->allot) / + (q->quanta[prio] / q->nclasses[prio]); } + mtu = qdisc_dev(cl->qdisc)->mtu; + if (cl->quantum <= mtu/2) + cl->quantum = mtu/2 + 1; + else if (cl->quantum > 32*mtu) + cl->quantum = 32*mtu; } } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ec0a083..3777682 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -121,11 +121,13 @@ static inline int qdisc_restart(struct Qdisc *q) struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + struct ve_struct *old_ve; /* Dequeue packet */ if (unlikely((skb = dequeue_skb(q)) == NULL)) return 0; + old_ve = set_exec_env(skb->owner_env); root_lock = qdisc_lock(q); /* And release qdisc */ @@ -167,6 +169,8 @@ static inline int qdisc_restart(struct Qdisc *q) netif_tx_queue_frozen(txq))) ret = 0; + (void)set_exec_env(old_ve); + return ret; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index d35ef05..01ca7fb 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -182,6 +182,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) struct teql_master *m = (struct teql_master*)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (dev->hard_header_len > m->dev->hard_header_len) return -EINVAL; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index a1f654a..68605a7 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -701,7 +701,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb))) goto fail; } diff --git a/net/socket.c b/net/socket.c index 3e8d4e3..347066e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -162,15 +163,6 @@ static DEFINE_PER_CPU(int, sockets_in_use) = 0; * divide and look after the messy bits. 
*/ -#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - - 16 for IP, 16 for IPX, - 24 for IPv6, - about 80 for AX.25 - must be at least one bigger than - the AF_UNIX size (see net/unix/af_unix.c - :unix_mkname()). - */ - /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space @@ -192,6 +184,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr) return -EFAULT; return audit_sockaddr(ulen, kaddr); } +EXPORT_SYMBOL(move_addr_to_kernel); /** * move_addr_to_user - copy an address to user space @@ -499,6 +492,8 @@ static struct socket *sock_alloc(void) return sock; } +EXPORT_SYMBOL(sock_alloc); + /* * In theory you can't get an open on this inode, but /proc provides * a back door. Remember to keep it shut otherwise you'll let the @@ -526,6 +521,9 @@ const struct file_operations bad_sock_fops = { void sock_release(struct socket *sock) { + if (sock->sk) + ub_sock_sndqueuedel(sock->sk); + if (sock->ops) { struct module *owner = sock->ops->owner; @@ -1093,6 +1091,49 @@ call_kill: return 0; } +int vz_security_family_check(int family) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (family) { + case PF_UNSPEC: + case PF_PACKET: + case PF_NETLINK: + case PF_UNIX: + case PF_INET: + case PF_INET6: + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_family_check); + +int vz_security_protocol_check(int protocol) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (protocol) { + case IPPROTO_IP: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_RAW: + case IPPROTO_DCCP: + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_protocol_check); + static int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { @@ -1123,6 +1164,11 @@ static int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } + /* VZ compatibility layer */ + err = vz_security_family_check(family); + if (err < 0) + return err; + err = security_socket_create(family, type, protocol, kern); if (err) return err; @@ -2436,9 +2482,12 @@ int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) { mm_segment_t oldfs = get_fs(); int err; + struct ve_struct *old_env; set_fs(KERNEL_DS); + old_env = set_exec_env(sock->sk->owner_env); err = sock->ops->ioctl(sock, cmd, arg); + (void)set_exec_env(old_env); set_fs(oldfs); return err; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76739e9..33bb7d7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,35 @@ static void rpc_unregister_client(struct rpc_clnt *clnt) spin_unlock(&rpc_client_lock); } +/* + * Grand abort timeout (stop the client if occures) + */ +int xprt_abort_timeout = RPC_MAX_ABORT_TIMEOUT; + +static int rpc_abort_hard(struct rpc_task *task) +{ + struct rpc_clnt *clnt; + clnt = task->tk_client; + + if (clnt->cl_pr_time == 0) { + clnt->cl_pr_time = jiffies; + return 0; + } + if (xprt_abort_timeout == RPC_MAX_ABORT_TIMEOUT) + return 0; + if (time_before(jiffies, clnt->cl_pr_time + xprt_abort_timeout * HZ)) + return 0; + + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + return -ETIMEDOUT; +} + +static void rpc_abort_clear(struct rpc_task *task) +{ + task->tk_client->cl_pr_time = 0; +} + static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { @@ -178,6 +208,7 @@ static 
struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); + clnt->cl_broken = 0; err = -ENOMEM; if (clnt->cl_metrics == NULL) goto out_no_stats; @@ -293,6 +324,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) xprt = xprt_create_transport(&xprtargs); if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; + xprt->owner_env = get_ve(get_exec_env()); /* * By default, kernel RPC client connects from a reserved port. @@ -305,13 +337,16 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) xprt->resvport = 0; clnt = rpc_new_client(args, xprt); - if (IS_ERR(clnt)) + if (IS_ERR(clnt)) { + put_ve(xprt->owner_env); return clnt; + } if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt, RPC_TASK_SOFT); if (err != 0) { rpc_shutdown_client(clnt); + put_ve(xprt->owner_env); return ERR_PTR(err); } } @@ -519,6 +554,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) { struct rpc_task *task, *ret; + if (task_setup_data->rpc_client->cl_broken) + return ERR_PTR(-EIO); + task = rpc_new_task(task_setup_data); if (task == NULL) { rpc_release_calldata(task_setup_data->callback_ops, @@ -936,6 +974,7 @@ call_bind_status(struct rpc_task *task) if (task->tk_status >= 0) { dprint_status(task); + rpc_abort_clear(task); task->tk_status = 0; task->tk_action = call_connect; return; @@ -959,6 +998,10 @@ call_bind_status(struct rpc_task *task) case -ETIMEDOUT: dprintk("RPC: %5u rpcbind request timed out\n", task->tk_pid); + if (rpc_abort_hard(task)) { + status = -EIO; + break; + } goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ @@ -1024,18 +1067,21 @@ call_connect_status(struct rpc_task *task) /* Something failed: remote service port may have changed */ rpc_force_rebind(clnt); + if (rpc_abort_hard(task)) + goto exit; switch (status) { case -ENOTCONN: case -EAGAIN: task->tk_action = call_bind; - if (!RPC_IS_SOFT(task)) + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) return; /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; return; } +exit: rpc_exit(task, -EIO); } @@ -1174,7 +1220,7 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; - if (RPC_IS_SOFT(task)) { + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) { if (clnt->cl_chatty) printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); @@ -1222,6 +1268,7 @@ call_decode(struct rpc_task *task) task->tk_flags &= ~RPC_CALL_MAJORSEEN; } + rpc_abort_clear(task); /* * Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_received. 
@@ -1234,7 +1281,7 @@ call_decode(struct rpc_task *task) sizeof(req->rq_rcv_buf)) != 0); if (req->rq_rcv_buf.len < 12) { - if (!RPC_IS_SOFT(task)) { + if (!RPC_IS_SOFT(task) && !rpc_abort_hard(task)) { task->tk_action = call_bind; clnt->cl_stats->rpcretrans++; goto out_retry; @@ -1581,3 +1628,67 @@ void rpc_show_tasks(void) spin_unlock(&rpc_client_lock); } #endif + +#ifdef CONFIG_VE +static int ve_sunrpc_start(void *data) +{ + return 0; +} + +void ve_sunrpc_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + struct rpc_clnt *clnt; + struct rpc_task *rovr; + + dprintk("RPC: killing all tasks for VE %d\n", ve->veid); + + spin_lock(&rpc_client_lock); + list_for_each_entry(clnt, &all_clients, cl_clients) { + if (clnt->cl_xprt->owner_env != ve) + continue; + + spin_lock(&clnt->cl_lock); + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(rovr)) + continue; + printk(KERN_WARNING "RPC: Killing task %d client %p\n", + rovr->tk_pid, clnt); + + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); + } + schedule_work(&clnt->cl_xprt->task_cleanup); + spin_unlock(&clnt->cl_lock); + } + spin_unlock(&rpc_client_lock); + + flush_scheduled_work(); +} + +static struct ve_hook sunrpc_hook = { + .init = ve_sunrpc_start, + .fini = ve_sunrpc_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_PRE, +}; + +void ve_sunrpc_hook_register(void) +{ + ve_hook_register(VE_SS_CHAIN, &sunrpc_hook); +} + +void ve_sunrpc_hook_unregister(void) +{ + ve_hook_unregister(&sunrpc_hook); +} +#else +void ve_sunrpc_hook_register(void) +{ +} + +void ve_sunrpc_hook_unregister(void) +{ +} +#endif diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 23a2b8f..469a783 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -894,6 +894,7 @@ static struct file_system_type rpc_pipe_fs_type = { .name = "rpc_pipefs", .get_sb = rpc_get_sb, .kill_sb = kill_litter_super, + .fs_flags = FS_VIRTUALIZED, }; static void diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 385f427..08d0209 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -608,7 +608,9 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) static void __rpc_execute(struct rpc_task *task) { int status = 0; + struct ve_struct *env; + env = set_exec_env(task->tk_client->cl_xprt->owner_env); dprintk("RPC: %5u __rpc_execute flags=0x%x\n", task->tk_pid, task->tk_flags); @@ -650,10 +652,14 @@ static void __rpc_execute(struct rpc_task *task) rpc_clear_running(task); if (RPC_IS_ASYNC(task)) { /* Careful! we may have raced... 
*/ - if (RPC_IS_QUEUED(task)) + if (RPC_IS_QUEUED(task)) { + (void)set_exec_env(env); return; - if (rpc_test_and_set_running(task)) + } + if (rpc_test_and_set_running(task)) { + (void)set_exec_env(env); return; + } continue; } @@ -682,6 +688,7 @@ static void __rpc_execute(struct rpc_task *task) task->tk_status); /* Release all resources associated with the task */ rpc_release_task(task); + (void)set_exec_env(env); } /* diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 843629f..94c3fb0 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,6 +24,9 @@ extern struct cache_detail ip_map_cache, unix_gid_cache; +extern void ve_sunrpc_hook_register(void); +extern void ve_sunrpc_hook_unregister(void); + static int __init init_sunrpc(void) { @@ -46,6 +49,7 @@ init_sunrpc(void) svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); + ve_sunrpc_hook_register(); out: return err; } @@ -53,6 +57,7 @@ out: static void __exit cleanup_sunrpc(void) { + ve_sunrpc_hook_unregister(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3e65719..0d49dfc 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -178,6 +178,9 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) unsigned int pglen = xdr->page_len; unsigned int flags = MSG_MORE; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + struct ve_struct *old_env; + + old_env = set_exec_env(sock->sk->owner_env); slen = xdr->len; @@ -238,6 +241,8 @@ out: svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); + (void)set_exec_env(old_env); + return len; } @@ -1225,8 +1230,9 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) - return ERR_PTR(error); + return ERR_PTR(-ENOMEM); + sk_change_net_get(sock->sk, get_exec_env()->ve_netns); svc_reclassify_socket(sock); if (type == SOCK_STREAM) @@ -1267,6 +1273,8 @@ static void svc_sock_detach(struct svc_xprt *xprt) dprintk("svc: svc_sock_detach(%p)\n", svsk); + /* XXX: serialization? 
*/ + sk->sk_user_data = NULL; /* put back the old socket callbacks */ sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aa..9880f38 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -568,10 +568,13 @@ static void xprt_autoclose(struct work_struct *work) { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); xprt->ops->close(xprt); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt_release_write(xprt, NULL); + (void)set_exec_env(ve); } /** @@ -638,7 +641,9 @@ static void xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; @@ -649,9 +654,11 @@ xprt_init_autodisconnect(unsigned long data) xprt_release_write(xprt, NULL); else queue_work(rpciod_workqueue, &xprt->task_cleanup); + (void)set_exec_env(ve); return; out_abort: spin_unlock(&xprt->transport_lock); + (void)set_exec_env(ve); } /** @@ -1044,6 +1051,7 @@ found: xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; + xprt->owner_env = get_exec_env(); rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4486c59..ea790c1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -64,6 +64,8 @@ static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; +static int xprt_min_abort_timeout = RPC_MIN_ABORT_TIMEOUT; +static int xprt_max_abort_timeout = RPC_MAX_ABORT_TIMEOUT; static struct ctl_table_header *sunrpc_table_header; @@ -117,6 +119,16 @@ static ctl_table xs_tunables_table[] = { .extra2 = &xprt_max_resvport_limit }, { + .procname = "abort_timeout", + .data = &xprt_abort_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_abort_timeout, + .extra2 = &xprt_max_abort_timeout + }, + { .ctl_name = 0, }, }; @@ -752,18 +764,23 @@ out_release: static void xs_close(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct socket *sock = transport->sock; - struct sock *sk = transport->inet; - - if (!sk) - goto clear_close_wait; + struct socket *sock; + struct sock *sk; dprintk("RPC: xs_close xprt %p\n", xprt); - write_lock_bh(&sk->sk_callback_lock); + spin_lock_bh(&xprt->transport_lock); + if (transport->sock == NULL) { + spin_unlock_bh(&xprt->transport_lock); + goto clear_close_wait; + } + sock = transport->sock; + sk = transport->inet; transport->inet = NULL; transport->sock = NULL; + spin_unlock_bh(&xprt->transport_lock); + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; @@ -1487,7 +1504,12 @@ static void xs_udp_connect_worker4(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if 
(!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1498,6 +1520,7 @@ static void xs_udp_connect_worker4(struct work_struct *work) dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket4(sock); if (xs_bind4(transport, sock)) { @@ -1513,6 +1536,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** @@ -1528,7 +1553,12 @@ static void xs_udp_connect_worker6(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1539,6 +1569,7 @@ static void xs_udp_connect_worker6(struct work_struct *work) dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket6(sock); if (xs_bind6(transport, sock) < 0) { @@ -1554,6 +1585,8 @@ static void xs_udp_connect_worker6(struct work_struct *work) out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /* @@ -1632,7 +1665,12 @@ static void xs_tcp_connect_worker4(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1642,6 +1680,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket4(sock); if (xs_bind4(transport, sock) < 0) { @@ -1677,6 +1716,8 @@ out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** @@ -1692,7 +1733,12 @@ static void xs_tcp_connect_worker6(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1702,6 +1748,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket6(sock); if (xs_bind6(transport, sock) < 0) { @@ -1736,6 +1783,8 @@ out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 015606b..b3f85a4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -115,6 +115,9 @@ #include #include +#include +#include + static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); @@ -591,6 +594,8 
@@ static struct sock * unix_create1(struct net *net, struct socket *sock) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); if (!sk) goto out; + if (ub_other_sock_charge(sk)) + goto out_sk_free; sock_init_data(sock,sk); lockdep_set_class(&sk->sk_receive_queue.lock, @@ -612,6 +617,9 @@ out: if (sk == NULL) atomic_dec(&unix_nr_socks); return sk; +out_sk_free: + sk_free(sk); + return NULL; } static int unix_create(struct net *net, struct socket *sock, int protocol) @@ -1013,6 +1021,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int st; int err; long timeo; + unsigned long chargesize; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -1041,6 +1050,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; + chargesize = skb_charge_fullsize(skb); + if (ub_sock_getwres_other(newsk, chargesize) < 0) + goto out; + ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); restart: /* Find listening sock. */ @@ -1288,7 +1301,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +void unix_destruct_fds(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); @@ -1299,6 +1312,7 @@ static void unix_destruct_fds(struct sk_buff *skb) scm_destroy(&scm); sock_wfree(skb); } +EXPORT_SYMBOL_GPL(unix_destruct_fds); static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { @@ -1510,6 +1524,16 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = len-sent; + if (msg->msg_flags & MSG_DONTWAIT) + ub_sock_makewres_other(sk, skb_charge_size(size)); + if (sock_bc(sk) != NULL && + sock_bc(sk)->poll_reserv >= + SOCK_MIN_UBCSPACE && + skb_charge_size(size) > + sock_bc(sk)->poll_reserv) + size = skb_charge_datalen(sock_bc(sk)->poll_reserv); + + /* Keep two messages in the pipe so it schedules better */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; @@ -1521,7 +1545,9 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, * Grab a buffer */ - skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); + + skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, + msg->msg_flags&MSG_DONTWAIT, &err); if (skb==NULL) goto out_err; @@ -1961,6 +1987,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl { struct sock *sk = sock->sk; unsigned int mask; + int no_ub_res; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -1973,6 +2000,10 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; + no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ub_res) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) @@ -1986,7 +2017,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. 
*/ - if (unix_writable(sk)) + if (!no_ub_res && unix_writable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a27b84..d4dddb7 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -151,6 +152,7 @@ void unix_notinflight(struct file *fp) spin_unlock(&unix_gc_lock); } } +EXPORT_SYMBOL_GPL(unix_notinflight); static inline struct sk_buff *sock_queue_head(struct sock *sk) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 04c4150..aa0bad6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1947,7 +1947,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || diff --git a/security/Kconfig b/security/Kconfig index 5592939..8447040 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS config SECURITY bool "Enable different security models" - depends on SYSFS + depends on SYSFS && !VE help This allows you to choose different security modules to be configured into your kernel. diff --git a/security/commoncap.c b/security/commoncap.c index e4c4b3f..3d956c0 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -35,6 +35,10 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) int cap_netlink_recv(struct sk_buff *skb, int cap) { + if (likely(cap == CAP_VE_NET_ADMIN) && + cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + return 0; + if (!cap_raised(NETLINK_CB(skb).eff_cap, cap)) return -EPERM; return 0; @@ -420,7 +424,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, return 0; } else if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -433,7 +437,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) return 0; } else if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -696,7 +700,7 @@ void cap_task_reparent_to_init (struct task_struct *p) int cap_syslog (int type) { - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 46f2397..f782966 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -10,11 +10,23 @@ #include #include #include +#include +#include +#include #define ACC_MKNOD 1 #define ACC_READ 2 #define ACC_WRITE 4 -#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) +#define ACC_QUOTA 8 +#define ACC_HIDDEN 16 +#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA) + +static inline int convert_bits(int acc) +{ + /* ...10x <-> ...01x trial: guess hwy */ + return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? 
acc : acc ^06) & + (ACC_READ | ACC_WRITE | ACC_QUOTA); +} #define DEV_BLOCK 1 #define DEV_CHAR 2 @@ -79,6 +91,38 @@ static int devcgroup_can_attach(struct cgroup_subsys *ss, /* * called under cgroup_lock() */ +#ifdef CONFIG_VE +static struct dev_whitelist_item default_whitelist_items[] = { + { ~0, ~0, DEV_ALL, ACC_MKNOD }, + { UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* null */ 3, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* zero */ 5, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* full */ 7, DEV_CHAR, ACC_READ | ACC_WRITE }, + { TTYAUX_MAJOR, /* tty */ 0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { TTYAUX_MAJOR, /* ptmx */ 2, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* random */ 8, DEV_CHAR, ACC_READ }, + { MEM_MAJOR, /* urandom */ 9, DEV_CHAR, ACC_READ }, +}; + +static LIST_HEAD(default_perms); +#define parent_whitelist(p) (&default_perms) +static void prepare_def_perms(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(default_whitelist_items); i++) { + default_whitelist_items[i].access |= ACC_HIDDEN; + list_add(&default_whitelist_items[i].list, &default_perms); + } +} +#else +#define prepare_def_perms() do { } while(0) +#define parent_whitelist(p) (&parent_dev_cgroup->whitelist) +#endif + static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig) { struct dev_whitelist_item *wh, *tmp, *new; @@ -204,10 +248,12 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, wh->type = DEV_ALL; wh->access = ACC_MASK; list_add(&wh->list, &dev_cgroup->whitelist); + + prepare_def_perms(); } else { parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); ret = dev_whitelist_copy(&dev_cgroup->whitelist, - &parent_dev_cgroup->whitelist); + parent_whitelist(parent_dev_cgroup)); if (ret) { kfree(dev_cgroup); return ERR_PTR(ret); @@ -282,8 +328,15 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, set_access(acc, wh->access); set_majmin(maj, wh->major); set_majmin(min, wh->minor); - seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type), - maj, min, acc); + + if (cft != NULL) + seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type), + maj, min, acc); + else if (!(wh->access & ACC_HIDDEN)) + seq_printf(m, "%10u %c %03o %s:%s\n", + (unsigned)(unsigned long)m->private, + type_to_char(wh->type), + convert_bits(wh->access), maj, min); } rcu_read_unlock(); @@ -498,37 +551,35 @@ struct cgroup_subsys devices_subsys = { .subsys_id = devices_subsys_id, }; -int devcgroup_inode_permission(struct inode *inode, int mask) +static int __devcgroup_inode_permission(int blk, dev_t device, int mask) { struct dev_cgroup *dev_cgroup; struct dev_whitelist_item *wh; - dev_t device = inode->i_rdev; if (!device) return 0; - if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) - return 0; rcu_read_lock(); - dev_cgroup = task_devcgroup(current); list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) { if (wh->type & DEV_ALL) goto acc_check; - if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode)) + if ((wh->type & DEV_BLOCK) && !blk) continue; - if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode)) + if ((wh->type & DEV_CHAR) && blk) continue; - if (wh->major != ~0 && wh->major != imajor(inode)) + if (wh->major != ~0 && wh->major != MAJOR(device)) continue; - if (wh->minor != ~0 && wh->minor != iminor(inode)) + if (wh->minor != ~0 && 
wh->minor != MINOR(device)) continue; acc_check: if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE)) continue; if ((mask & MAY_READ) && !(wh->access & ACC_READ)) continue; + if ((mask & MAY_QUOTACTL) && !(wh->access & ACC_QUOTA)) + continue; rcu_read_unlock(); return 0; } @@ -538,6 +589,15 @@ acc_check: return -EPERM; } +int devcgroup_inode_permission(struct inode *inode, int mask) +{ + if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) + return 0; + + return __devcgroup_inode_permission(S_ISBLK(inode->i_mode), + inode->i_rdev, mask); +} + int devcgroup_inode_mknod(int mode, dev_t dev) { struct dev_cgroup *dev_cgroup; @@ -569,3 +629,75 @@ acc_check: return -EPERM; } + +#ifdef CONFIG_VE +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) +{ + int mask = 0; + + mask |= (access_mode & FMODE_READ ? MAY_READ : 0); + mask |= (access_mode & FMODE_WRITE ? MAY_WRITE : 0); + mask |= (access_mode & FMODE_QUOTACTL ? MAY_QUOTACTL : 0); + + return __devcgroup_inode_permission(dev_type == S_IFBLK, dev, mask); +} +EXPORT_SYMBOL(get_device_perms_ve); + +int set_device_perms_ve(struct ve_struct *ve, + unsigned type, dev_t dev, unsigned mask) +{ + int err = -EINVAL; + struct dev_whitelist_item *new; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return -ENOMEM; + + if ((type & S_IFMT) == S_IFBLK) + new->type = DEV_BLOCK; + else if ((type & S_IFMT) == S_IFCHR) + new->type = DEV_CHAR; + else + goto out; + + new->access = convert_bits(mask); + new->major = new->minor = ~0; + + switch (type & VE_USE_MASK) { + default: + new->minor = MINOR(dev); + case VE_USE_MAJOR: + new->major = MAJOR(dev); + case 0: + ; + } + + err = dev_whitelist_add(cgroup_to_devcgroup(ve->ve_cgroup), new); +out: + if (err < 0) + kfree(new); + return err; +} +EXPORT_SYMBOL(set_device_perms_ve); + +#ifdef CONFIG_PROC_FS +int devperms_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list); + + if (m->private == (void *)0) { + seq_printf(m, "Version: 2.7\n"); + m->private = (void *)-1; + } + + if (ve_is_super(ve)) { + seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0); + return 0; + } + + m->private = (void *)(unsigned long)ve->veid; + return devcgroup_seq_read(ve->ve_cgroup, NULL, m); +} +EXPORT_SYMBOL(devperms_seq_show); +#endif +#endif diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index a436d1c..130a8be 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -1,6 +1,6 @@ config SECURITY_SELINUX bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET + depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE select NETWORK_SECMARK default n help diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 03fc6a8..ed0f080 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5225,12 +5225,12 @@ static int selinux_setprocattr(struct task_struct *p, struct task_struct *g, *t; struct mm_struct *mm = p->mm; read_lock(&tasklist_lock); - do_each_thread(g, t) { + do_each_thread_ve(g, t) { if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); return -EPERM; } - } while_each_thread(g, t); + } while_each_thread_ve(g, t); read_unlock(&tasklist_lock); }
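
Editor's note: the hunks above repeatedly apply one pattern to make netfilter sysctls per-container. The shared ctl_table template is kmemdup()'ed for a non-super VE, each entry's .data pointer is redirected to that VE's own variable (ve_nf_conntrack_max, ve_nf_conntrack_count, and so on), the copy is registered with register_net_sysctl_table() against the VE's netns, and teardown frees the copy only when it differs from the shared template. The sketch below is a minimal user-space analogue of that pattern, not kernel code; every name in it (ctl_entry, env, register_env_table, ...) is a stand-in invented for illustration.

/*
 * Illustrative user-space sketch only -- NOT part of the patch above.
 * It mimics the per-VE sysctl pattern in nf_conntrack_standalone_init_sysctl():
 * duplicate the shared template for non-super environments, repoint .data at
 * per-environment storage, and free only the duplicate on teardown.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl_entry {
	const char *name;
	int *data;		/* value the "sysctl" reads/writes */
};

struct env {
	int is_super;		/* 1 for the host environment */
	int conntrack_max;	/* per-environment value */
};

/* shared template; .data points at the host's storage by default */
static int host_conntrack_max = 65536;
static struct ctl_entry template_table[] = {
	{ "nf_conntrack_max", &host_conntrack_max },
	{ NULL, NULL },
};

static struct ctl_entry *register_env_table(struct env *e)
{
	struct ctl_entry *tbl = template_table;

	if (!e->is_super) {
		/* analogue of kmemdup(nf_ct_sysctl_table, ...) */
		tbl = malloc(sizeof(template_table));
		if (!tbl)
			return NULL;
		memcpy(tbl, template_table, sizeof(template_table));
		/* redirect .data to this environment's storage */
		tbl[0].data = &e->conntrack_max;
	}
	/* a real implementation would register tbl with the sysctl core here */
	return tbl;
}

static void unregister_env_table(struct ctl_entry *tbl)
{
	/* free only the duplicate, never the shared template */
	if (tbl != template_table)
		free(tbl);
}

int main(void)
{
	struct env ve = { .is_super = 0, .conntrack_max = 16384 };
	struct ctl_entry *tbl = register_env_table(&ve);

	if (tbl)
		printf("%s = %d\n", tbl[0].name, *tbl[0].data);
	unregister_env_table(tbl);
	return 0;
}

The same duplicate-and-redirect shape appears in nf_conntrack_standalone_fini_sysctl(), which retrieves the registered tables via ctl_table_arg and frees them only for non-super VEs.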
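
A second sketch, in the same spirit, for the beancounter accounting added to x_tables.c: recharge_xtables() charges or releases only the difference in entry count between the new and old table (UB_NUMXTENT), and xt_unregister_table() releases whatever is still accounted. The user-space analogue below assumes a simple counter with a soft limit; charge(), uncharge() and recharge() are invented stand-ins, not the real beancounter API.

/*
 * Illustrative user-space sketch only -- NOT part of the patch above.
 * It mirrors the recharge_xtables()/uncharge_xtables() idea: when a table
 * is replaced, only the *delta* in entry count is charged to or released
 * from the owner's resource counter, so the accounted total tracks the
 * live table.
 */
#include <stdio.h>

struct counter {
	long held;		/* currently accounted entries */
	long limit;		/* soft limit, analogue of the UB_SOFT barrier */
};

static int charge(struct counter *c, long amount)
{
	if (c->held + amount > c->limit)
		return -1;	/* analogue of returning -ENOMEM */
	c->held += amount;
	return 0;
}

static void uncharge(struct counter *c, long amount)
{
	c->held -= amount;
}

/* charge or release only the difference between old and new table sizes */
static int recharge(struct counter *c, long new_entries, long old_entries)
{
	long change = new_entries - old_entries;

	if (change > 0)
		return charge(c, change);
	if (change < 0)
		uncharge(c, -change);
	return 0;
}

int main(void)
{
	struct counter ub = { .held = 0, .limit = 1000 };

	recharge(&ub, 120, 0);	/* initial table: charge 120 entries */
	recharge(&ub, 80, 120);	/* replacement shrank: release 40 */
	printf("accounted entries: %ld\n", ub.held);	/* prints 80 */
	return 0;
}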